Diffstat (limited to 'fs')
 217 files changed, 52322 insertions(+), 2870 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index f9b6e2979aaa..51307b0fdf0f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -269,6 +269,25 @@ config OCFS2_FS_POSIX_ACL
 	  Posix Access Control Lists (ACLs) support permissions for users and
 	  groups beyond the owner/group/world scheme.
 
+config BTRFS_FS
+	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+	depends on EXPERIMENTAL
+	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	help
+	  Btrfs is a new filesystem with extents, writable snapshotting,
+	  support for multiple devices and many more features.
+
+	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+	  FINALIZED.  You should say N here unless you are interested in
+	  testing Btrfs with non-critical data.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
+
+	  If unsure, say N.
+
 endif # BLOCK
 
 source "fs/notify/Kconfig"
@@ -721,7 +740,20 @@ config CONFIGFS_FS
 
 endmenu
 
-menu "Miscellaneous filesystems"
+menuconfig MISC_FILESYSTEMS
+	bool "Miscellaneous filesystems"
+	default y
+	---help---
+	  Say Y here to get to see options for various miscellaneous
+	  filesystems, such as filesystems that came from other
+	  operating systems.
+
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled; if unsure, say Y here.
+
+if MISC_FILESYSTEMS
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
@@ -900,6 +932,58 @@ config CRAMFS
 
 	  If unsure, say N.
 
+config SQUASHFS
+	tristate "SquashFS 4.0 - Squashed file system support"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for SquashFS 4.0 (a Compressed
+	  Read-Only File System).  Squashfs is a highly compressed read-only
+	  filesystem for Linux.  It uses zlib compression to compress both
+	  files, inodes and directories.  Inodes in the system are very small
+	  and all blocks are packed to minimise data overhead. Block sizes
+	  greater than 4K are supported up to a maximum of 1 Mbytes (default
+	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
+	  (larger than 4GB), full uid/gid information, hard links and
+	  timestamps.
+
+	  Squashfs is intended for general read-only filesystem use, for
+	  archival use (i.e. in cases where a .tar.gz file may be used), and in
+	  embedded systems where low overhead is needed.  Further information
+	  and tools are available from http://squashfs.sourceforge.net.
+
+	  If you want to compile this as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want),
+	  say M here and read <file:Documentation/modules.txt>.  The module
+	  will be called squashfs.  Note that the root file system (the one
+	  containing the directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+	bool "Additional option for memory-constrained systems"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here allows you to specify cache size.
+
+	  If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+	int "Number of fragments cached" if SQUASHFS_EMBEDDED
+	depends on SQUASHFS
+	default "3"
+	help
+	  By default SquashFS caches the last 3 fragments read from
+	  the filesystem.  Increasing this amount may mean SquashFS
+	  has to re-read fragments less often from disk, at the expense
+	  of extra system memory.  Decreasing this amount will mean
+	  SquashFS uses less memory at the expense of extra reads from disk.
+
+	  Note there must be at least one cached fragment.  Anything
+	  much more than three will probably not make much difference.
+
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
 	depends on BLOCK
@@ -1091,7 +1175,7 @@ config UFS_DEBUG
 	  Y here. This will result in _many_ additional debugging messages to be
 	  written to the system log.
 
-endmenu
+endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
 	bool "Network File Systems"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index ce9fb3fbfae4..bb4cc5b8abc8 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
 	default n
-	depends on BINFMT_ELF
+	depends on BINFMT_ELF && ELF_CORE
 	help
 	  ELF core dump files describe each memory mapping of the crashed
 	  process, and can contain or omit the memory contents of each one.
diff --git a/fs/Makefile b/fs/Makefile
index c830611550d3..38bc735c67ad 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
+obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
 obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
@@ -119,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
 #define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
 #define AUTOFS_DEV_IOCTL_IOC_COUNT	(AUTOFS_IOC_COUNT - 11)
 
-#define AUTOFS_TYPE_TRIGGER	(AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
-
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 
 /*
  * Check sanity of parameter control fields and if a path is present
- * check that it has a "/" and is terminated.
+ * check that it is terminated and contains at least one "/".
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = check_name(param->path);
+		err = invalid_str(param->path,
+				 (void *) ((size_t) param + param->size));
 		if (err) {
-			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
-				    cmd);
+			AUTOFS_WARN(
+			  "path string terminator missing for cmd(0x%08x)",
+			  cmd);
 			goto out;
 		}
 
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = check_name(param->path);
 		if (err) {
 			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
 				    cmd);
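
Note on the hunk above: validate_dev_ioctl() now verifies that the path string has a NUL terminator inside the buffer the caller actually supplied *before* check_name() inspects its contents, so the content check can never run off an unterminated string. A minimal standalone sketch of that bounded-termination test (the helper name and return convention here are illustrative, not the kernel's invalid_str()):

/* Illustrative: fail unless a NUL appears before 'end'. */
static int str_unterminated(const char *str, const void *end)
{
	while ((const void *)str < end) {
		if (*str == '\0')
			return 0;	/* terminated in bounds */
		str++;
	}
	return -EINVAL;			/* no terminator inside the buffer */
}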
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
 				     struct autofs_sb_info *sbi,
 				     struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->version;
+	param->protover.version = sbi->version;
 	return 0;
 }
 
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 					struct autofs_sb_info *sbi,
 					struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->sub_version;
+	param->protosubver.sub_version = sbi->sub_version;
 	return 0;
 }
 
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
 	int err, fd;
 
 	/* param->path has already been checked */
-	if (!param->arg1)
+	if (!param->openmount.devid)
 		return -EINVAL;
 
 	param->ioctlfd = -1;
 
 	path = param->path;
-	devid = param->arg1;
+	devid = param->openmount.devid;
 
 	err = 0;
 	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
 {
 	autofs_wqt_t token;
 
-	token = (autofs_wqt_t) param->arg1;
+	token = (autofs_wqt_t) param->ready.token;
 	return autofs4_wait_release(sbi, token, 0);
 }
 
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
 	autofs_wqt_t token;
 	int status;
 
-	token = (autofs_wqt_t) param->arg1;
-	status = param->arg2 ? param->arg2 : -ENOENT;
+	token = (autofs_wqt_t) param->fail.token;
+	status = param->fail.status ? param->fail.status : -ENOENT;
 	return autofs4_wait_release(sbi, token, status);
 }
 
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 	int pipefd;
 	int err = 0;
 
-	if (param->arg1 == -1)
+	if (param->setpipefd.pipefd == -1)
 		return -EINVAL;
 
-	pipefd = param->arg1;
+	pipefd = param->setpipefd.pipefd;
 
 	mutex_lock(&sbi->wq_mutex);
 	if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
 {
 	unsigned long timeout;
 
-	timeout = param->arg1;
-	param->arg1 = sbi->exp_timeout / HZ;
+	timeout = param->timeout.timeout;
+	param->timeout.timeout = sbi->exp_timeout / HZ;
 	sbi->exp_timeout = timeout * HZ;
 	return 0;
 }
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	path = param->path;
 	devid = sbi->sb->s_dev;
 
-	param->arg1 = param->arg2 = -1;
+	param->requester.uid = param->requester.gid = -1;
 
 	/* Get nameidata of the parent directory */
 	err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(nd.path.dentry);
 		spin_lock(&sbi->fs_lock);
-		param->arg1 = ino->uid;
-		param->arg2 = ino->gid;
+		param->requester.uid = ino->uid;
+		param->requester.gid = ino->gid;
 		spin_unlock(&sbi->fs_lock);
 	}
 
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 	int err = -EAGAIN;
 	int how;
 
-	how = param->arg1;
+	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
 	else
 		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
 				      struct autofs_sb_info *sbi,
 				      struct autofs_dev_ioctl *param)
 {
-	param->arg1 = 0;
+	param->askumount.may_umount = 0;
 	if (may_umount(fp->f_path.mnt))
-		param->arg1 = 1;
+		param->askumount.may_umount = 1;
 	return 0;
 }
 
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	struct nameidata nd;
 	const char *path;
 	unsigned int type;
+	unsigned int devid, magic;
 	int err = -ENOENT;
 
 	if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	}
 
 	path = param->path;
-	type = param->arg1;
+	type = param->ismountpoint.in.type;
 
-	param->arg1 = 0;
-	param->arg2 = 0;
+	param->ismountpoint.out.devid = devid = 0;
+	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
-		if (type == AUTOFS_TYPE_ANY) {
+		if (autofs_type_any(type)) {
 			struct super_block *sb;
 
 			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out;
 
 			sb = nd.path.dentry->d_sb;
-			param->arg1 = new_encode_dev(sb->s_dev);
+			devid = new_encode_dev(sb->s_dev);
 		} else {
 			struct autofs_info *ino;
 
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out_release;
 
 			ino = autofs4_dentry_ino(nd.path.dentry);
-			param->arg1 = autofs4_get_dev(ino->sbi);
+			devid = autofs4_get_dev(ino->sbi);
 		}
 
 		err = 0;
 		if (nd.path.dentry->d_inode &&
 		    nd.path.mnt->mnt_root == nd.path.dentry) {
 			err = 1;
-			param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+			magic = nd.path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
-		dev_t devid = new_encode_dev(sbi->sb->s_dev);
+		dev_t dev = autofs4_get_dev(sbi);
 
 		err = path_lookup(path, LOOKUP_PARENT, &nd);
 		if (err)
 			goto out;
 
-		err = autofs_dev_ioctl_find_super(&nd, devid);
+		err = autofs_dev_ioctl_find_super(&nd, dev);
 		if (err)
 			goto out_release;
 
-		param->arg1 = autofs4_get_dev(sbi);
+		devid = dev;
 
 		err = have_submounts(nd.path.dentry);
 
 		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
 			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
 				struct inode *inode = nd.path.dentry->d_inode;
-				param->arg2 = inode->i_sb->s_magic;
+				magic = inode->i_sb->s_magic;
 			}
 		}
 	}
 
+	param->ismountpoint.out.devid = devid;
+	param->ismountpoint.out.magic = magic;
+
 out_release:
 	path_put(&nd.path);
 out:
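
The arg1/arg2 rewrites throughout this file correspond to struct autofs_dev_ioctl growing named per-command fields in place of two untyped argument slots. An abbreviated sketch of that layout idea (struct and field names here are illustrative, not the full definition from the autofs headers):

#include <linux/types.h>

/* Illustrative miniature of the named-union pattern. */
struct example_dev_ioctl {
	__u32 ver_major, ver_minor;
	__s32 ioctlfd;
	union {
		struct { __u32 version; } protover;
		struct { __u32 sub_version; } protosubver;
		struct { __u32 devid; } openmount;
		struct { __u64 token; __s32 status; } fail;
		struct { __u64 timeout; } timeout;
		struct { __u32 uid, gid; } requester;
		struct { __u32 may_umount; } askumount;
	};
	char path[];	/* optional, length implied by ->size */
};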
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
 		/* This is an autofs submount, we can't expire it */
-		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+		if (autofs_type_indirect(sbi->type))
 			goto done;
 
 		/*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	if (arg && get_user(do_now, arg))
 		return -EFAULT;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
 	else
 		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
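
autofs_type_trigger() and friends, used here and in dev-ioctl.c, replace open-coded bitmask tests on sbi->type (note the old code mixed `==` and `&` comparisons). A sketch of what such predicate helpers look like (the macro values and names below are illustrative stand-ins for the AUTOFS_TYPE_* constants):

#define EX_TYPE_INDIRECT	0x0001
#define EX_TYPE_DIRECT		0x0002
#define EX_TYPE_OFFSET		0x0004

/* Direct and offset mounts both behave as mount "triggers". */
static inline unsigned int ex_type_trigger(unsigned int type)
{
	return type & (EX_TYPE_DIRECT | EX_TYPE_OFFSET);
}

static inline unsigned int ex_type_indirect(unsigned int type)
{
	return type & EX_TYPE_INDIRECT;
}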
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index cfc23e53b6f4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
 
-	if (sbi->type & AUTOFS_TYPE_OFFSET)
+	if (autofs_type_offset(sbi->type))
 		seq_printf(m, ",offset");
-	else if (sbi->type & AUTOFS_TYPE_DIRECT)
+	else if (autofs_type_direct(sbi->type))
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 			*maxproto = option;
 			break;
 		case Opt_indirect:
-			*type = AUTOFS_TYPE_INDIRECT;
+			set_autofs_type_indirect(type);
 			break;
 		case Opt_direct:
-			*type = AUTOFS_TYPE_DIRECT;
+			set_autofs_type_direct(type);
 			break;
 		case Opt_offset:
-			*type = AUTOFS_TYPE_OFFSET;
+			set_autofs_type_offset(type);
 			break;
 		default:
 			return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
-	sbi->type = AUTOFS_TYPE_INDIRECT;
+	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
+	root_inode->i_op = autofs_type_trigger(sbi->type) ?
 			&autofs4_direct_root_inode_operations :
 			&autofs4_indirect_root_inode_operations;
 
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		 * is very similar for indirect mounts except only dentrys
 		 * in the root of the autofs file system may be negative.
 		 */
-		if (sbi->type & AUTOFS_TYPE_TRIGGER)
+		if (autofs_type_trigger(sbi->type))
 			return -ENOENT;
 		else if (!IS_ROOT(dentry->d_parent))
 			return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		return -ENOMEM;
 
 	/* If this is a direct mount request create a dummy name */
-	if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
 		qstr.len = sprintf(name, "%p", dentry);
 	else {
 		qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 			type = autofs_ptype_expire_multi;
 	} else {
 		if (notify == NFY_MOUNT)
-			type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+			type = autofs_type_trigger(sbi->type) ?
 				autofs_ptype_missing_direct :
 				 autofs_ptype_missing_indirect;
 		else
-			type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+			type = autofs_type_trigger(sbi->type) ?
 				autofs_ptype_expire_direct :
 				autofs_ptype_expire_indirect;
 	}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
 {
 	struct bfs_sb_info *info = BFS_SB(s);
 
+	if (!info)
+		return;
+
 	brelse(info->si_sbh);
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	unsigned i, imap_len;
 	struct bfs_sb_info *info;
 	long ret = -EINVAL;
+	unsigned long i_sblock, i_eblock, i_eoff, s_size;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_magic = BFS_MAGIC;
 	info->si_sbh = bh;
+
+	if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+		printf("Superblock is corrupted\n");
+		goto out;
+	}
+
 	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
 					sizeof(struct bfs_inode)
 					+ BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 					- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
+
+	/* can we read the last block? */
+	bh = sb_bread(s, info->si_blocks - 1);
+	if (!bh) {
+		printf("Last block not available: %lu\n", info->si_blocks - 1);
+		iput(inode);
+		ret = -EIO;
+		kfree(info->si_imap);
+		goto out;
+	}
+	brelse(bh);
+
 	bh = NULL;
 	for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
 		struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 		di = (struct bfs_inode *)bh->b_data + off;
 
+		/* test if filesystem is not corrupted */
+
+		i_eoff = le32_to_cpu(di->i_eoffset);
+		i_sblock = le32_to_cpu(di->i_sblock);
+		i_eblock = le32_to_cpu(di->i_eblock);
+		s_size = le32_to_cpu(bfs_sb->s_end);
+
+		if (i_sblock > info->si_blocks ||
+			i_eblock > info->si_blocks ||
+			i_sblock > i_eblock ||
+			i_eoff > s_size ||
+			i_sblock * BFS_BSIZE > i_eoff) {
+
+			printf("Inode 0x%08x corrupted\n", i);
+
+			brelse(bh);
+			s->s_root = NULL;
+			kfree(info->si_imap);
+			kfree(info);
+			s->s_fs_info = NULL;
+			return -EIO;
+		}
+
 		if (!di->i_ino) {
 			info->si_freei++;
 			continue;
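
The per-inode checks added above amount to a single extent-sanity predicate: both blocks must lie on the device, the extent must run forward, and the end offset must agree with both the filesystem size and the start block. Restated as standalone C (an illustrative paraphrase of the conditions, not kernel code):

static int bfs_extent_sane(unsigned long sblock, unsigned long eblock,
			   unsigned long eoff, unsigned long nblocks,
			   unsigned long fs_end, unsigned long bsize)
{
	return sblock <= nblocks &&	/* start block on the device */
	       eblock <= nblocks &&	/* end block on the device */
	       sblock <= eblock &&	/* extent runs forward */
	       eoff <= fs_end &&	/* end offset inside the fs */
	       sblock * bsize <= eoff;	/* start agrees with end offset */
}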
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	elf_addr_t __user *sp;
 	elf_addr_t __user *u_platform;
 	elf_addr_t __user *u_base_platform;
+	elf_addr_t __user *u_rand_bytes;
 	const char *k_platform = ELF_PLATFORM;
 	const char *k_base_platform = ELF_BASE_PLATFORM;
+	unsigned char k_rand_bytes[16];
 	int items;
 	elf_addr_t *elf_info;
 	int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 			return -EFAULT;
 	}
 
+	/*
+	 * Generate 16 random bytes for userspace PRNG seeding.
+	 */
+	get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+	u_rand_bytes = (elf_addr_t __user *)
+		       STACK_ALLOC(p, sizeof(k_rand_bytes));
+	if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+		return -EFAULT;
+
 	/* Create the ELF interpreter info */
 	elf_info = (elf_addr_t *)current->mm->saved_auxv;
 	/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_GID, cred->gid);
 	NEW_AUX_ENT(AT_EGID, cred->egid);
 	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
 	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
 	if (k_platform) {
 		NEW_AUX_ENT(AT_PLATFORM,
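
The AT_RANDOM entry above hands every new ELF process a pointer to 16 kernel-generated random bytes on its stack, cheap seed material for userspace PRNGs and stack canaries. A small userspace sketch that locates the bytes by scanning /proc/self/auxv, avoiding assumptions about any particular libc helper:

#include <elf.h>
#include <link.h>	/* ElfW() */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	ElfW(auxv_t) aux;
	FILE *f = fopen("/proc/self/auxv", "rb");

	if (!f)
		return 1;
	while (fread(&aux, sizeof(aux), 1, f) == 1) {
		if (aux.a_type == AT_RANDOM) {
			/* a_val points at the 16 bytes on our own stack */
			unsigned char *r = (unsigned char *)(uintptr_t)aux.a_un.a_val;
			printf("AT_RANDOM[0..3] = %02x %02x %02x %02x\n",
			       r[0], r[1], r[2], r[3]);
		}
	}
	fclose(f);
	return 0;
}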
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e37..f3e72c5c19f5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	struct elf_fdpic_params exec_params, interp_params;
 	struct elf_phdr *phdr;
 	unsigned long stack_size, entryaddr;
-#ifndef CONFIG_MMU
-	unsigned long fullsize;
-#endif
 #ifdef ELF_FDPIC_PLAT_INIT
 	unsigned long dynaddr;
 #endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 		goto error_kill;
 	}
 
-	/* expand the stack mapping to use up the entire allocation granule */
-	fullsize = kobjsize((char *) current->mm->start_brk);
-	if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
-				    fullsize, 0, 0)))
-		stack_size = fullsize;
 	up_write(&current->mm->mmap_sem);
 
 	current->mm->brk = current->mm->start_brk;
@@ -1567,11 +1559,9 @@ end_coredump:
 static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 			   unsigned long *limit, unsigned long mm_flags)
 {
-	struct vm_list_struct *vml;
-
-	for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
-		struct vm_area_struct *vma = vml->vma;
+	struct vm_area_struct *vma;
 
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		if (!maydump(vma, mm_flags))
 			continue;
 
@@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	elf_fpxregset_t *xfpu = NULL;
 #endif
 	int thread_status_size = 0;
-#ifndef CONFIG_MMU
-	struct vm_list_struct *vml;
-#endif
 	elf_addr_t *auxv;
 	unsigned long mm_flags;
 
@@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	fill_prstatus(prstatus, current, signr);
 	elf_core_copy_regs(&prstatus->pr_reg, regs);
 
-#ifdef CONFIG_MMU
 	segs = current->mm->map_count;
-#else
-	segs = 0;
-	for (vml = current->mm->context.vmlist; vml; vml = vml->next)
-		segs++;
-#endif
 #ifdef ELF_CORE_EXTRA_PHDRS
 	segs += ELF_CORE_EXTRA_PHDRS;
 #endif
@@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	mm_flags = current->mm->flags;
 
 	/* write program headers for segments dump */
-	for (
-#ifdef CONFIG_MMU
-		vma = current->mm->mmap; vma; vma = vma->vm_next
-#else
-		vml = current->mm->context.vmlist; vml; vml = vml->next
-#endif
-	     ) {
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		struct elf_phdr phdr;
 		size_t sz;
 
-#ifndef CONFIG_MMU
-		vma = vml->vma;
-#endif
-
 		sz = vma->vm_end - vma->vm_start;
 
 		phdr.p_type = PT_LOAD;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b3725..5cebf0b37798 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 	unsigned long textpos = 0, datapos = 0, result;
 	unsigned long realdatastart = 0;
 	unsigned long text_len, data_len, bss_len, stack_len, flags;
-	unsigned long len, reallen, memp = 0;
-	unsigned long extra, rlim;
+	unsigned long len, memp = 0;
+	unsigned long memp_size, extra, rlim;
 	unsigned long *reloc = 0, *rp;
 	struct inode *inode;
 	int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
 		}
 
 		len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+		len = PAGE_ALIGN(len);
 		down_write(&current->mm->mmap_sem);
 		realdatastart = do_mmap(0, 0, len,
 			PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
-		/* Remap to use all availabe slack region space */
-		if (realdatastart && (realdatastart < (unsigned long)-4096)) {
-			reallen = kobjsize((void *)realdatastart);
-			if (reallen > len) {
-				realdatastart = do_mremap(realdatastart, len,
-					reallen, MREMAP_FIXED, realdatastart);
-			}
-		}
 		up_write(&current->mm->mmap_sem);
 
 		if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
 
 		reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
 		memp = realdatastart;
-
+		memp_size = len;
 	} else {
 
 		len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+		len = PAGE_ALIGN(len);
 		down_write(&current->mm->mmap_sem);
 		textpos = do_mmap(0, 0, len,
 				PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
-		/* Remap to use all availabe slack region space */
-		if (textpos && (textpos < (unsigned long) -4096)) {
-			reallen = kobjsize((void *)textpos);
-			if (reallen > len) {
-				textpos = do_mremap(textpos, len, reallen,
-					MREMAP_FIXED, textpos);
-			}
-		}
 		up_write(&current->mm->mmap_sem);
 
 		if (!textpos || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
 				MAX_SHARED_LIBS * sizeof(unsigned long));
 		memp = textpos;
-
+		memp_size = len;
 #ifdef CONFIG_BINFMT_ZFLAT
 		/*
 		 * load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
 	 * set up the brk stuff, uses any slack left in data/bss/stack
 	 * allocation.  We put the brk after the bss (between the bss
 	 * and stack) like other platforms.
+	 * Userspace code relies on the stack pointer starting out at
+	 * an address right at the end of a page.
 	 */
 	current->mm->start_brk = datapos + data_len + bss_len;
 	current->mm->brk = (current->mm->start_brk + 3) & ~3;
-	current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len;
+	current->mm->context.end_brk = memp + memp_size - stack_len;
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 
 	/* zero the BSS,  BRK and stack areas */
 	memset((void*)(datapos + data_len), 0, bss_len +
-			(memp + kobjsize((void *) memp) - stack_len -	/* end brk */
+			(memp + memp_size - stack_len -		/* end brk */
 			libinfo->lib_list[id].start_brk) +	/* start brk */
 			stack_len);
 
 	return 0;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e1158cb4fbd6..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -649,7 +649,7 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled" : "disabled";
+	char *s = enabled ? "enabled\n" : "disabled\n";
 
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
diff --git a/fs/bio.c b/fs/bio.c
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	int i, ret;
 	int nr_pages = 0;
 	unsigned int len = 0;
+	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
 
 	for (i = 0; i < iov_count; i++) {
 		unsigned long uaddr;
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
-	i = 0;
+
+	if (map_data) {
+		nr_pages = 1 << map_data->page_order;
+		i = map_data->offset / PAGE_SIZE;
+	}
 	while (len) {
-		unsigned int bytes;
-
-		if (map_data)
-			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
-		else
-			bytes = PAGE_SIZE;
+		unsigned int bytes = PAGE_SIZE;
+
+		bytes -= offset;
 
 		if (bytes > len)
 			bytes = len;
 
 		if (map_data) {
-			if (i == map_data->nr_entries) {
+			if (i == map_data->nr_entries * nr_pages) {
 				ret = -ENOMEM;
 				break;
 			}
-			page = map_data->pages[i++];
-		} else
+
+			page = map_data->pages[i / nr_pages];
+			page += (i % nr_pages);
+
+			i++;
+		} else {
 			page = alloc_page(q->bounce_gfp | gfp_mask);
 			if (!page) {
 				ret = -ENOMEM;
 				break;
+			}
 		}
 
-		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
 			break;
 
 		len -= bytes;
+		offset = 0;
 	}
 
 	if (ret)
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	/*
 	 * success
 	 */
-	if (!write_to_vm) {
+	if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
 		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
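
With map_data->page_order in play, the copy loop above addresses pre-allocated high-order pages through a flat counter: array entry i / nr_pages, sub-page i % nr_pages within it, with i starting at map_data->offset / PAGE_SIZE. The index arithmetic, worked as a standalone example (the values chosen are arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned int page_order = 2;		/* each entry = 4 pages */
	unsigned int nr_pages = 1u << page_order;
	unsigned int i;

	/* i = 5 starts partway into entry 1, exactly as in the loop above */
	for (i = 5; i < 10; i++)
		printf("i=%u -> pages[%u] + %u\n",
		       i, i / nr_pages, i % nr_pages);
	return 0;
}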
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 349a26c10001..b3c1efff5e1d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -285,6 +285,8 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&bdev->bd_holder_list);
 #endif
 	inode_init_once(&ei->vfs_inode);
+	/* Initialize mutex for freeze. */
+	mutex_init(&bdev->bd_fsfreeze_mutex);
 }
 
 static inline void __bd_forget(struct inode *inode)
@@ -1005,6 +1007,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	}
 
 	lock_kernel();
+ restart:
 
 	ret = -ENXIO;
 	disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1025,6 +1028,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
 		if (disk->fops->open) {
 			ret = disk->fops->open(bdev, mode);
+			if (ret == -ERESTARTSYS) {
+				/* Lost a race with 'disk' being
+				 * deleted, try again.
+				 * See md.c
+				 */
+				disk_put_part(bdev->bd_part);
+				bdev->bd_part = NULL;
+				module_put(disk->fops->owner);
+				put_disk(disk);
+				bdev->bd_disk = NULL;
+				mutex_unlock(&bdev->bd_mutex);
+				goto restart;
+			}
 			if (ret)
 				goto out_clear;
 		}
@@ -1220,6 +1236,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+/*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+ */
+static int blkdev_releasepage(struct page *page, gfp_t wait)
+{
+	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
+
+	if (super && super->s_op->bdev_try_to_free_page)
+		return super->s_op->bdev_try_to_free_page(super, page, wait);
+
+	return try_to_free_buffers(page);
+}
+
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
@@ -1227,6 +1257,7 @@ static const struct address_space_operations def_blk_aops = {
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= generic_writepages,
+	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
 };
 
@@ -1262,7 +1293,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev  - lookup a struct block_device by name
- * @path:	special file representing the block device
+ * @pathname:	special file representing the block device
  *
  * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
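
blkdev_releasepage() above introduces the bdev_try_to_free_page super-operation: when the block device's page cache is squeezed, the filesystem holding the device (via bd_super) gets first refusal before the generic try_to_free_buffers() fallback. A hypothetical filesystem would opt in roughly like this (a sketch only; a journaling filesystem would first drop its own references, e.g. journal heads, instead of delegating directly):

static int examplefs_bdev_try_to_free_page(struct super_block *sb,
					   struct page *page, gfp_t wait)
{
	/* A real implementation may need to release its own references
	 * to the buffers before they can actually be freed. */
	return try_to_free_buffers(page);
}

static const struct super_operations examplefs_sops = {
	/* ... usual operations ... */
	.bdev_try_to_free_page	= examplefs_bdev_try_to_free_page,
};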
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
+ifneq ($(KERNELRELEASE),)
+# kbuild part of makefile
+
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
+btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+	   file-item.o inode-item.o inode-map.o disk-io.o \
+	   transaction.o inode.o file.o tree-defrag.o \
+	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o
+else
+
+# Normal Makefile
+
+KERNELDIR := /lib/modules/`uname -r`/build
+all:
+	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
+
+modules_install:
+	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
+clean:
+	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
+
+endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 000000000000..1d53b62dbba5 --- /dev/null +++ b/fs/btrfs/acl.c | |||
@@ -0,0 +1,351 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/xattr.h> | ||
22 | #include <linux/posix_acl_xattr.h> | ||
23 | #include <linux/posix_acl.h> | ||
24 | #include <linux/sched.h> | ||
25 | |||
26 | #include "ctree.h" | ||
27 | #include "btrfs_inode.h" | ||
28 | #include "xattr.h" | ||
29 | |||
30 | #ifdef CONFIG_FS_POSIX_ACL | ||
31 | |||
32 | static void btrfs_update_cached_acl(struct inode *inode, | ||
33 | struct posix_acl **p_acl, | ||
34 | struct posix_acl *acl) | ||
35 | { | ||
36 | spin_lock(&inode->i_lock); | ||
37 | if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED) | ||
38 | posix_acl_release(*p_acl); | ||
39 | *p_acl = posix_acl_dup(acl); | ||
40 | spin_unlock(&inode->i_lock); | ||
41 | } | ||
42 | |||
43 | static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) | ||
44 | { | ||
45 | int size; | ||
46 | const char *name; | ||
47 | char *value = NULL; | ||
48 | struct posix_acl *acl = NULL, **p_acl; | ||
49 | |||
50 | switch (type) { | ||
51 | case ACL_TYPE_ACCESS: | ||
52 | name = POSIX_ACL_XATTR_ACCESS; | ||
53 | p_acl = &BTRFS_I(inode)->i_acl; | ||
54 | break; | ||
55 | case ACL_TYPE_DEFAULT: | ||
56 | name = POSIX_ACL_XATTR_DEFAULT; | ||
57 | p_acl = &BTRFS_I(inode)->i_default_acl; | ||
58 | break; | ||
59 | default: | ||
60 | return ERR_PTR(-EINVAL); | ||
61 | } | ||
62 | |||
63 | spin_lock(&inode->i_lock); | ||
64 | if (*p_acl != BTRFS_ACL_NOT_CACHED) | ||
65 | acl = posix_acl_dup(*p_acl); | ||
66 | spin_unlock(&inode->i_lock); | ||
67 | |||
68 | if (acl) | ||
69 | return acl; | ||
70 | |||
71 | |||
72 | size = __btrfs_getxattr(inode, name, "", 0); | ||
73 | if (size > 0) { | ||
74 | value = kzalloc(size, GFP_NOFS); | ||
75 | if (!value) | ||
76 | return ERR_PTR(-ENOMEM); | ||
77 | size = __btrfs_getxattr(inode, name, value, size); | ||
78 | if (size > 0) { | ||
79 | acl = posix_acl_from_xattr(value, size); | ||
80 | btrfs_update_cached_acl(inode, p_acl, acl); | ||
81 | } | ||
82 | kfree(value); | ||
83 | } else if (size == -ENOENT) { | ||
84 | acl = NULL; | ||
85 | btrfs_update_cached_acl(inode, p_acl, acl); | ||
86 | } | ||
87 | |||
88 | return acl; | ||
89 | } | ||
90 | |||
91 | static int btrfs_xattr_get_acl(struct inode *inode, int type, | ||
92 | void *value, size_t size) | ||
93 | { | ||
94 | struct posix_acl *acl; | ||
95 | int ret = 0; | ||
96 | |||
97 | acl = btrfs_get_acl(inode, type); | ||
98 | |||
99 | if (IS_ERR(acl)) | ||
100 | return PTR_ERR(acl); | ||
101 | if (acl == NULL) | ||
102 | return -ENODATA; | ||
103 | ret = posix_acl_to_xattr(acl, value, size); | ||
104 | posix_acl_release(acl); | ||
105 | |||
106 | return ret; | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * Needs to be called with fs_mutex held | ||
111 | */ | ||
112 | static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) | ||
113 | { | ||
114 | int ret, size = 0; | ||
115 | const char *name; | ||
116 | struct posix_acl **p_acl; | ||
117 | char *value = NULL; | ||
118 | mode_t mode; | ||
119 | |||
120 | if (acl) { | ||
121 | ret = posix_acl_valid(acl); | ||
122 | if (ret < 0) | ||
123 | return ret; | ||
124 | ret = 0; | ||
125 | } | ||
126 | |||
127 | switch (type) { | ||
128 | case ACL_TYPE_ACCESS: | ||
129 | mode = inode->i_mode; | ||
130 | ret = posix_acl_equiv_mode(acl, &mode); | ||
131 | if (ret < 0) | ||
132 | return ret; | ||
133 | ret = 0; | ||
134 | inode->i_mode = mode; | ||
135 | name = POSIX_ACL_XATTR_ACCESS; | ||
136 | p_acl = &BTRFS_I(inode)->i_acl; | ||
137 | break; | ||
138 | case ACL_TYPE_DEFAULT: | ||
139 | if (!S_ISDIR(inode->i_mode)) | ||
140 | return acl ? -EINVAL : 0; | ||
141 | name = POSIX_ACL_XATTR_DEFAULT; | ||
142 | p_acl = &BTRFS_I(inode)->i_default_acl; | ||
143 | break; | ||
144 | default: | ||
145 | return -EINVAL; | ||
146 | } | ||
147 | |||
148 | if (acl) { | ||
149 | size = posix_acl_xattr_size(acl->a_count); | ||
150 | value = kmalloc(size, GFP_NOFS); | ||
151 | if (!value) { | ||
152 | ret = -ENOMEM; | ||
153 | goto out; | ||
154 | } | ||
155 | |||
156 | ret = posix_acl_to_xattr(acl, value, size); | ||
157 | if (ret < 0) | ||
158 | goto out; | ||
159 | } | ||
160 | |||
161 | ret = __btrfs_setxattr(inode, name, value, size, 0); | ||
162 | |||
163 | out: | ||
164 | kfree(value); | ||
165 | |||
166 | if (!ret) | ||
167 | btrfs_update_cached_acl(inode, p_acl, acl); | ||
168 | |||
169 | return ret; | ||
170 | } | ||
171 | |||
172 | static int btrfs_xattr_set_acl(struct inode *inode, int type, | ||
173 | const void *value, size_t size) | ||
174 | { | ||
175 | int ret = 0; | ||
176 | struct posix_acl *acl = NULL; | ||
177 | |||
178 | if (value) { | ||
179 | acl = posix_acl_from_xattr(value, size); | ||
180 | if (acl == NULL) { | ||
181 | value = NULL; | ||
182 | size = 0; | ||
183 | } else if (IS_ERR(acl)) { | ||
184 | return PTR_ERR(acl); | ||
185 | } | ||
186 | } | ||
187 | |||
188 | ret = btrfs_set_acl(inode, acl, type); | ||
189 | |||
190 | posix_acl_release(acl); | ||
191 | |||
192 | return ret; | ||
193 | } | ||
194 | |||
195 | |||
196 | static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, | ||
197 | void *value, size_t size) | ||
198 | { | ||
199 | return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); | ||
200 | } | ||
201 | |||
202 | static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, | ||
203 | const void *value, size_t size, int flags) | ||
204 | { | ||
205 | return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); | ||
206 | } | ||
207 | |||
208 | static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, | ||
209 | void *value, size_t size) | ||
210 | { | ||
211 | return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); | ||
212 | } | ||
213 | |||
214 | static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, | ||
215 | const void *value, size_t size, int flags) | ||
216 | { | ||
217 | return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); | ||
218 | } | ||
219 | |||
220 | int btrfs_check_acl(struct inode *inode, int mask) | ||
221 | { | ||
222 | struct posix_acl *acl; | ||
223 | int error = -EAGAIN; | ||
224 | |||
225 | acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); | ||
226 | |||
227 | if (IS_ERR(acl)) | ||
228 | return PTR_ERR(acl); | ||
229 | if (acl) { | ||
230 | error = posix_acl_permission(inode, acl, mask); | ||
231 | posix_acl_release(acl); | ||
232 | } | ||
233 | |||
234 | return error; | ||
235 | } | ||
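The -EAGAIN default above is deliberate: it tells the generic permission code that no ACL applies and the classic mode bits should decide. A minimal caller sketch, assuming the 2.6.29-era generic_permission(inode, mask, check_acl) signature (illustrative only, not part of this patch):

	static int btrfs_permission_sketch(struct inode *inode, int mask)
	{
		/* generic_permission falls back to the mode bits whenever
		 * the check_acl callback returns -EAGAIN */
		return generic_permission(inode, mask, btrfs_check_acl);
	}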
236 | |||
237 | /* | ||
238 | * btrfs_init_acl is already generally called under fs_mutex, so the | ||
239 | * locking here has been written with that in mind. If that locking | ||
240 | * changes, the acl locking needs to be re-evaluated. | ||
241 | */ | ||
242 | int btrfs_init_acl(struct inode *inode, struct inode *dir) | ||
243 | { | ||
244 | struct posix_acl *acl = NULL; | ||
245 | int ret = 0; | ||
246 | |||
247 | /* this happens with subvols */ | ||
248 | if (!dir) | ||
249 | return 0; | ||
250 | |||
251 | if (!S_ISLNK(inode->i_mode)) { | ||
252 | if (IS_POSIXACL(dir)) { | ||
253 | acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); | ||
254 | if (IS_ERR(acl)) | ||
255 | return PTR_ERR(acl); | ||
256 | } | ||
257 | |||
258 | if (!acl) | ||
259 | inode->i_mode &= ~current->fs->umask; | ||
260 | } | ||
261 | |||
262 | if (IS_POSIXACL(dir) && acl) { | ||
263 | struct posix_acl *clone; | ||
264 | mode_t mode; | ||
265 | |||
266 | if (S_ISDIR(inode->i_mode)) { | ||
267 | ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); | ||
268 | if (ret) | ||
269 | goto failed; | ||
270 | } | ||
271 | clone = posix_acl_clone(acl, GFP_NOFS); | ||
272 | ret = -ENOMEM; | ||
273 | if (!clone) | ||
274 | goto failed; | ||
275 | |||
276 | mode = inode->i_mode; | ||
277 | ret = posix_acl_create_masq(clone, &mode); | ||
278 | if (ret >= 0) { | ||
279 | inode->i_mode = mode; | ||
280 | if (ret > 0) { | ||
281 | /* we need an acl */ | ||
282 | ret = btrfs_set_acl(inode, clone, | ||
283 | ACL_TYPE_ACCESS); | ||
284 | } | ||
285 | } | ||
286 | } | ||
287 | failed: | ||
288 | posix_acl_release(acl); | ||
289 | |||
290 | return ret; | ||
291 | } | ||
292 | |||
293 | int btrfs_acl_chmod(struct inode *inode) | ||
294 | { | ||
295 | struct posix_acl *acl, *clone; | ||
296 | int ret = 0; | ||
297 | |||
298 | if (S_ISLNK(inode->i_mode)) | ||
299 | return -EOPNOTSUPP; | ||
300 | |||
301 | if (!IS_POSIXACL(inode)) | ||
302 | return 0; | ||
303 | |||
304 | acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); | ||
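	/* note: PTR_ERR(NULL) evaluates to 0, so "no ACL to rework" returns success */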
305 | if (IS_ERR(acl) || !acl) | ||
306 | return PTR_ERR(acl); | ||
307 | |||
308 | clone = posix_acl_clone(acl, GFP_KERNEL); | ||
309 | posix_acl_release(acl); | ||
310 | if (!clone) | ||
311 | return -ENOMEM; | ||
312 | |||
313 | ret = posix_acl_chmod_masq(clone, inode->i_mode); | ||
314 | if (!ret) | ||
315 | ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); | ||
316 | |||
317 | posix_acl_release(clone); | ||
318 | |||
319 | return ret; | ||
320 | } | ||
321 | |||
322 | struct xattr_handler btrfs_xattr_acl_default_handler = { | ||
323 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
324 | .get = btrfs_xattr_acl_default_get, | ||
325 | .set = btrfs_xattr_acl_default_set, | ||
326 | }; | ||
327 | |||
328 | struct xattr_handler btrfs_xattr_acl_access_handler = { | ||
329 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
330 | .get = btrfs_xattr_acl_access_get, | ||
331 | .set = btrfs_xattr_acl_access_set, | ||
332 | }; | ||
333 | |||
334 | #else /* CONFIG_FS_POSIX_ACL */ | ||
335 | |||
336 | int btrfs_acl_chmod(struct inode *inode) | ||
337 | { | ||
338 | return 0; | ||
339 | } | ||
340 | |||
341 | int btrfs_init_acl(struct inode *inode, struct inode *dir) | ||
342 | { | ||
343 | return 0; | ||
344 | } | ||
345 | |||
346 | int btrfs_check_acl(struct inode *inode, int mask) | ||
347 | { | ||
348 | return 0; | ||
349 | } | ||
350 | |||
351 | #endif /* CONFIG_FS_POSIX_ACL */ | ||
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 000000000000..8e2fec05dbe0 --- /dev/null +++ b/fs/btrfs/async-thread.c | |||
@@ -0,0 +1,419 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/version.h> | ||
20 | #include <linux/kthread.h> | ||
21 | #include <linux/list.h> | ||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/freezer.h> | ||
24 | #include "async-thread.h" | ||
25 | |||
26 | #define WORK_QUEUED_BIT 0 | ||
27 | #define WORK_DONE_BIT 1 | ||
28 | #define WORK_ORDER_DONE_BIT 2 | ||
29 | |||
30 | /* | ||
31 | * container for the kthread task pointer and the list of pending work. | ||
32 | * One of these is allocated per thread. | ||
33 | */ | ||
34 | struct btrfs_worker_thread { | ||
35 | /* pool we belong to */ | ||
36 | struct btrfs_workers *workers; | ||
37 | |||
38 | /* list of struct btrfs_work that are waiting for service */ | ||
39 | struct list_head pending; | ||
40 | |||
41 | /* list of worker threads from struct btrfs_workers */ | ||
42 | struct list_head worker_list; | ||
43 | |||
44 | /* kthread */ | ||
45 | struct task_struct *task; | ||
46 | |||
47 | /* number of things on the pending list */ | ||
48 | atomic_t num_pending; | ||
49 | |||
50 | unsigned long sequence; | ||
51 | |||
52 | /* protects the pending list. */ | ||
53 | spinlock_t lock; | ||
54 | |||
55 | /* set to non-zero when this thread is already awake and kicking */ | ||
56 | int working; | ||
57 | |||
58 | /* are we currently idle */ | ||
59 | int idle; | ||
60 | }; | ||
61 | |||
62 | /* | ||
63 | * helper function to move a thread onto the idle list after it | ||
64 | * has finished some requests. | ||
65 | */ | ||
66 | static void check_idle_worker(struct btrfs_worker_thread *worker) | ||
67 | { | ||
68 | if (!worker->idle && atomic_read(&worker->num_pending) < | ||
69 | worker->workers->idle_thresh / 2) { | ||
70 | unsigned long flags; | ||
71 | spin_lock_irqsave(&worker->workers->lock, flags); | ||
72 | worker->idle = 1; | ||
73 | list_move(&worker->worker_list, &worker->workers->idle_list); | ||
74 | spin_unlock_irqrestore(&worker->workers->lock, flags); | ||
75 | } | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * helper function to move a thread off the idle list after new | ||
80 | * pending work is added. | ||
81 | */ | ||
82 | static void check_busy_worker(struct btrfs_worker_thread *worker) | ||
83 | { | ||
84 | if (worker->idle && atomic_read(&worker->num_pending) >= | ||
85 | worker->workers->idle_thresh) { | ||
86 | unsigned long flags; | ||
87 | spin_lock_irqsave(&worker->workers->lock, flags); | ||
88 | worker->idle = 0; | ||
89 | list_move_tail(&worker->worker_list, | ||
90 | &worker->workers->worker_list); | ||
91 | spin_unlock_irqrestore(&worker->workers->lock, flags); | ||
92 | } | ||
93 | } | ||
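/*
 * note the hysteresis between the two helpers above: a worker moves to
 * the idle list when it drops below idle_thresh/2 pending items, but
 * only moves back to the busy list at idle_thresh, so it can't
 * ping-pong between the lists on every queued item.
 */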
94 | |||
95 | static noinline int run_ordered_completions(struct btrfs_workers *workers, | ||
96 | struct btrfs_work *work) | ||
97 | { | ||
98 | unsigned long flags; | ||
99 | |||
100 | if (!workers->ordered) | ||
101 | return 0; | ||
102 | |||
103 | set_bit(WORK_DONE_BIT, &work->flags); | ||
104 | |||
105 | spin_lock_irqsave(&workers->lock, flags); | ||
106 | |||
107 | while (!list_empty(&workers->order_list)) { | ||
108 | work = list_entry(workers->order_list.next, | ||
109 | struct btrfs_work, order_list); | ||
110 | |||
111 | if (!test_bit(WORK_DONE_BIT, &work->flags)) | ||
112 | break; | ||
113 | |||
114 | /* we are going to call the ordered done function, but | ||
115 | * we leave the work item on the list as a barrier so | ||
116 | * that later work items that are done don't have their | ||
117 | * functions called before this one returns | ||
118 | */ | ||
119 | if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) | ||
120 | break; | ||
121 | |||
122 | spin_unlock_irqrestore(&workers->lock, flags); | ||
123 | |||
124 | work->ordered_func(work); | ||
125 | |||
126 | /* now take the lock again and call the freeing code */ | ||
127 | spin_lock_irqsave(&workers->lock, flags); | ||
128 | list_del(&work->order_list); | ||
129 | work->ordered_free(work); | ||
130 | } | ||
131 | |||
132 | spin_unlock_irqrestore(&workers->lock, flags); | ||
133 | return 0; | ||
134 | } | ||
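/*
 * worked example of the barrier above: items A, B and C are queued in
 * that order, but B and C finish first. Their DONE bits are set, yet A
 * still heads order_list, so the loop stops at A; once A completes, a
 * single pass runs the ordered_func for A, then B, then C back to back.
 */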
135 | |||
136 | /* | ||
137 | * main loop for servicing work items | ||
138 | */ | ||
139 | static int worker_loop(void *arg) | ||
140 | { | ||
141 | struct btrfs_worker_thread *worker = arg; | ||
142 | struct list_head *cur; | ||
143 | struct btrfs_work *work; | ||
144 | do { | ||
145 | spin_lock_irq(&worker->lock); | ||
146 | while (!list_empty(&worker->pending)) { | ||
147 | cur = worker->pending.next; | ||
148 | work = list_entry(cur, struct btrfs_work, list); | ||
149 | list_del(&work->list); | ||
150 | clear_bit(WORK_QUEUED_BIT, &work->flags); | ||
151 | |||
152 | work->worker = worker; | ||
153 | spin_unlock_irq(&worker->lock); | ||
154 | |||
155 | work->func(work); | ||
156 | |||
157 | atomic_dec(&worker->num_pending); | ||
158 | /* | ||
159 | * unless this is an ordered work queue, | ||
160 | * 'work' was probably freed by func above. | ||
161 | */ | ||
162 | run_ordered_completions(worker->workers, work); | ||
163 | |||
164 | spin_lock_irq(&worker->lock); | ||
165 | check_idle_worker(worker); | ||
166 | |||
167 | } | ||
168 | worker->working = 0; | ||
169 | if (freezing(current)) { | ||
170 | refrigerator(); | ||
171 | } else { | ||
172 | set_current_state(TASK_INTERRUPTIBLE); | ||
173 | spin_unlock_irq(&worker->lock); | ||
174 | if (!kthread_should_stop()) | ||
175 | schedule(); | ||
176 | __set_current_state(TASK_RUNNING); | ||
177 | } | ||
178 | } while (!kthread_should_stop()); | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * this will wait for all the worker threads to shutdown | ||
184 | */ | ||
185 | int btrfs_stop_workers(struct btrfs_workers *workers) | ||
186 | { | ||
187 | struct list_head *cur; | ||
188 | struct btrfs_worker_thread *worker; | ||
189 | |||
190 | list_splice_init(&workers->idle_list, &workers->worker_list); | ||
191 | while (!list_empty(&workers->worker_list)) { | ||
192 | cur = workers->worker_list.next; | ||
193 | worker = list_entry(cur, struct btrfs_worker_thread, | ||
194 | worker_list); | ||
195 | kthread_stop(worker->task); | ||
196 | list_del(&worker->worker_list); | ||
197 | kfree(worker); | ||
198 | } | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * simple init on struct btrfs_workers | ||
204 | */ | ||
205 | void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) | ||
206 | { | ||
207 | workers->num_workers = 0; | ||
208 | INIT_LIST_HEAD(&workers->worker_list); | ||
209 | INIT_LIST_HEAD(&workers->idle_list); | ||
210 | INIT_LIST_HEAD(&workers->order_list); | ||
211 | spin_lock_init(&workers->lock); | ||
212 | workers->max_workers = max; | ||
213 | workers->idle_thresh = 32; | ||
214 | workers->name = name; | ||
215 | workers->ordered = 0; | ||
216 | } | ||
217 | |||
218 | /* | ||
219 | * starts new worker threads. This does not enforce the max worker | ||
220 | * count in case you need to temporarily go past it. | ||
221 | */ | ||
222 | int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) | ||
223 | { | ||
224 | struct btrfs_worker_thread *worker; | ||
225 | int ret = 0; | ||
226 | int i; | ||
227 | |||
228 | for (i = 0; i < num_workers; i++) { | ||
229 | worker = kzalloc(sizeof(*worker), GFP_NOFS); | ||
230 | if (!worker) { | ||
231 | ret = -ENOMEM; | ||
232 | goto fail; | ||
233 | } | ||
234 | |||
235 | INIT_LIST_HEAD(&worker->pending); | ||
236 | INIT_LIST_HEAD(&worker->worker_list); | ||
237 | spin_lock_init(&worker->lock); | ||
238 | atomic_set(&worker->num_pending, 0); | ||
239 | worker->workers = workers; /* published before the thread can run */ | ||
240 | worker->task = kthread_run(worker_loop, worker, | ||
241 | "btrfs-%s-%d", workers->name, | ||
242 | workers->num_workers + i); | ||
243 | if (IS_ERR(worker->task)) { | ||
244 | kfree(worker); | ||
245 | ret = PTR_ERR(worker->task); | ||
246 | goto fail; | ||
247 | } | ||
248 | |||
249 | spin_lock_irq(&workers->lock); | ||
250 | list_add_tail(&worker->worker_list, &workers->idle_list); | ||
251 | worker->idle = 1; | ||
252 | workers->num_workers++; | ||
253 | spin_unlock_irq(&workers->lock); | ||
254 | } | ||
255 | return 0; | ||
256 | fail: | ||
257 | btrfs_stop_workers(workers); | ||
258 | return ret; | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * run through the list and find a worker thread that doesn't have a lot | ||
263 | * to do right now. This can return null if we aren't yet at the thread | ||
264 | * count limit and all of the threads are busy. | ||
265 | */ | ||
266 | static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) | ||
267 | { | ||
268 | struct btrfs_worker_thread *worker; | ||
269 | struct list_head *next; | ||
270 | int enforce_min = workers->num_workers < workers->max_workers; | ||
271 | |||
272 | /* | ||
273 | * if we find an idle thread, don't move it to the end of the | ||
274 | * idle list. This improves the chance that the next submission | ||
275 | * will reuse the same thread, and maybe catch it while it is still | ||
276 | * working | ||
277 | */ | ||
278 | if (!list_empty(&workers->idle_list)) { | ||
279 | next = workers->idle_list.next; | ||
280 | worker = list_entry(next, struct btrfs_worker_thread, | ||
281 | worker_list); | ||
282 | return worker; | ||
283 | } | ||
284 | if (enforce_min || list_empty(&workers->worker_list)) | ||
285 | return NULL; | ||
286 | |||
287 | /* | ||
288 | * if we pick a busy task, move the task to the end of the list. | ||
289 | * hopefully this will keep things somewhat evenly balanced. | ||
290 | * Do the move in batches based on the sequence number. This groups | ||
291 | * requests submitted at roughly the same time onto the same worker. | ||
292 | */ | ||
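	/*
	 * (concretely: with the default idle_thresh of 32, a busy worker
	 * soaks up 32 consecutive submissions before it rotates to the
	 * tail of the list.)
	 */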
293 | next = workers->worker_list.next; | ||
294 | worker = list_entry(next, struct btrfs_worker_thread, worker_list); | ||
295 | atomic_inc(&worker->num_pending); | ||
296 | worker->sequence++; | ||
297 | |||
298 | if (worker->sequence % workers->idle_thresh == 0) | ||
299 | list_move_tail(next, &workers->worker_list); | ||
300 | return worker; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * selects a worker thread to take the next job. This will either find | ||
305 | * an idle worker, start a new worker up to the max count, or just return | ||
306 | * one of the existing busy workers. | ||
307 | */ | ||
308 | static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) | ||
309 | { | ||
310 | struct btrfs_worker_thread *worker; | ||
311 | unsigned long flags; | ||
312 | |||
313 | again: | ||
314 | spin_lock_irqsave(&workers->lock, flags); | ||
315 | worker = next_worker(workers); | ||
316 | spin_unlock_irqrestore(&workers->lock, flags); | ||
317 | |||
318 | if (!worker) { | ||
319 | spin_lock_irqsave(&workers->lock, flags); | ||
320 | if (workers->num_workers >= workers->max_workers) { | ||
321 | struct list_head *fallback = NULL; | ||
322 | /* | ||
323 | * we have failed to find any workers, just | ||
324 | * fall back to an existing one | ||
325 | */ | ||
326 | if (!list_empty(&workers->worker_list)) | ||
327 | fallback = workers->worker_list.next; | ||
328 | if (!list_empty(&workers->idle_list)) | ||
329 | fallback = workers->idle_list.next; | ||
330 | BUG_ON(!fallback); | ||
331 | worker = list_entry(fallback, | ||
332 | struct btrfs_worker_thread, worker_list); | ||
333 | spin_unlock_irqrestore(&workers->lock, flags); | ||
334 | } else { | ||
335 | spin_unlock_irqrestore(&workers->lock, flags); | ||
336 | /* we're below the limit, start another worker */ | ||
337 | btrfs_start_workers(workers, 1); | ||
338 | goto again; | ||
339 | } | ||
340 | } | ||
341 | return worker; | ||
342 | } | ||
343 | |||
344 | /* | ||
345 | * btrfs_requeue_work just puts the work item back on the tail of the list | ||
346 | * it was taken from. It is intended for use with long running work functions | ||
347 | * that make some progress and want to give the cpu up for others. | ||
348 | */ | ||
349 | int btrfs_requeue_work(struct btrfs_work *work) | ||
350 | { | ||
351 | struct btrfs_worker_thread *worker = work->worker; | ||
352 | unsigned long flags; | ||
353 | |||
354 | if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) | ||
355 | goto out; | ||
356 | |||
357 | spin_lock_irqsave(&worker->lock, flags); | ||
358 | atomic_inc(&worker->num_pending); | ||
359 | list_add_tail(&work->list, &worker->pending); | ||
360 | |||
361 | /* by definition we're busy, take ourselves off the idle | ||
362 | * list | ||
363 | */ | ||
364 | if (worker->idle) { | ||
365 | spin_lock_irqsave(&worker->workers->lock, flags); | ||
366 | worker->idle = 0; | ||
367 | list_move_tail(&worker->worker_list, | ||
368 | &worker->workers->worker_list); | ||
369 | spin_unlock_irqrestore(&worker->workers->lock, flags); | ||
370 | } | ||
371 | |||
372 | spin_unlock_irqrestore(&worker->lock, flags); | ||
373 | |||
374 | out: | ||
375 | return 0; | ||
376 | } | ||
377 | |||
378 | /* | ||
379 | * places a struct btrfs_work into the pending queue of one of the kthreads | ||
380 | */ | ||
381 | int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) | ||
382 | { | ||
383 | struct btrfs_worker_thread *worker; | ||
384 | unsigned long flags; | ||
385 | int wake = 0; | ||
386 | |||
387 | /* don't requeue something already on a list */ | ||
388 | if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) | ||
389 | goto out; | ||
390 | |||
391 | worker = find_worker(workers); | ||
392 | if (workers->ordered) { | ||
393 | spin_lock_irqsave(&workers->lock, flags); | ||
394 | list_add_tail(&work->order_list, &workers->order_list); | ||
395 | spin_unlock_irqrestore(&workers->lock, flags); | ||
396 | } else { | ||
397 | INIT_LIST_HEAD(&work->order_list); | ||
398 | } | ||
399 | |||
400 | spin_lock_irqsave(&worker->lock, flags); | ||
401 | atomic_inc(&worker->num_pending); | ||
402 | check_busy_worker(worker); | ||
403 | list_add_tail(&work->list, &worker->pending); | ||
404 | |||
405 | /* | ||
406 | * avoid calling into wake_up_process if this thread has already | ||
407 | * been kicked | ||
408 | */ | ||
409 | if (!worker->working) | ||
410 | wake = 1; | ||
411 | worker->working = 1; | ||
412 | |||
413 | spin_unlock_irqrestore(&worker->lock, flags); | ||
414 | |||
415 | if (wake) | ||
416 | wake_up_process(worker->task); | ||
417 | out: | ||
418 | return 0; | ||
419 | } | ||
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 000000000000..31be4ed8b63e --- /dev/null +++ b/fs/btrfs/async-thread.h | |||
@@ -0,0 +1,101 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_ASYNC_THREAD_ | ||
20 | #define __BTRFS_ASYNC_THREAD_ | ||
21 | |||
22 | struct btrfs_worker_thread; | ||
23 | |||
24 | /* | ||
25 | * This is similar to a workqueue, but it is meant to spread the operations | ||
26 | * across all available cpus instead of just the CPU that was used to | ||
27 | * queue the work. There is also some batching introduced to try and | ||
28 | * cut down on context switches. | ||
29 | * | ||
30 | * By default threads are added on demand up to 2 * the number of cpus. | ||
31 | * Changing struct btrfs_workers->max_workers is one way to prevent | ||
32 | * demand creation of kthreads. | ||
33 | * | ||
34 | * the basic model of these worker threads is to embed a btrfs_work | ||
35 | * structure in your own data struct, and use container_of in a | ||
36 | * work function to get back to your data struct. | ||
37 | */ | ||
38 | struct btrfs_work { | ||
39 | /* | ||
40 | * func should be set to the function you want called | ||
41 | * your work struct is passed as the only arg | ||
42 | * | ||
43 | * ordered_func must be set for work sent to an ordered work queue, | ||
44 | * and it is called to complete a given work item in the same | ||
45 | * order they were sent to the queue. | ||
46 | */ | ||
47 | void (*func)(struct btrfs_work *work); | ||
48 | void (*ordered_func)(struct btrfs_work *work); | ||
49 | void (*ordered_free)(struct btrfs_work *work); | ||
50 | |||
51 | /* | ||
52 | * flags should be set to zero. It is used to make sure the | ||
53 | * struct is only inserted once into the list. | ||
54 | */ | ||
55 | unsigned long flags; | ||
56 | |||
57 | /* don't touch these */ | ||
58 | struct btrfs_worker_thread *worker; | ||
59 | struct list_head list; | ||
60 | struct list_head order_list; | ||
61 | }; | ||
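The embedding model described above is easiest to see end to end. A minimal sketch with hypothetical names (my_async_op and my_async_func are illustrations, not part of this patch):

	struct my_async_op {
		struct btrfs_work work;	/* embedded by value, never a pointer */
		u64 bytenr;		/* example payload */
	};

	static void my_async_func(struct btrfs_work *work)
	{
		/* recover the containing struct from the btrfs_work pointer */
		struct my_async_op *op = container_of(work, struct my_async_op, work);

		/* ... service op->bytenr ...; safe to free on an unordered queue */
		kfree(op);
	}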
62 | |||
63 | struct btrfs_workers { | ||
64 | /* current number of running workers */ | ||
65 | int num_workers; | ||
66 | |||
68 | /* max number of workers allowed. set by btrfs_init_workers */ | ||
68 | int max_workers; | ||
69 | |||
70 | /* once a worker has this many requests or fewer, it is idle */ | ||
71 | int idle_thresh; | ||
72 | |||
73 | /* force completions in the order they were queued */ | ||
74 | int ordered; | ||
75 | |||
76 | /* list with all the work threads. The workers on the idle list | ||
77 | * may be actively servicing jobs, but they haven't yet hit the | ||
78 | * idle thresh limit above. | ||
79 | */ | ||
80 | struct list_head worker_list; | ||
81 | struct list_head idle_list; | ||
82 | |||
83 | /* | ||
84 | * when operating in ordered mode, this maintains the list | ||
85 | * of work items waiting for completion | ||
86 | */ | ||
87 | struct list_head order_list; | ||
88 | |||
89 | /* lock for finding the next worker thread to queue on */ | ||
90 | spinlock_t lock; | ||
91 | |||
93 | /* extra name for this worker, used in the kthread name (btrfs-<name>-<N>) */ | ||
93 | char *name; | ||
94 | }; | ||
95 | |||
96 | int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); | ||
97 | int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); | ||
98 | int btrfs_stop_workers(struct btrfs_workers *workers); | ||
99 | void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); | ||
100 | int btrfs_requeue_work(struct btrfs_work *work); | ||
101 | #endif | ||
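Putting the API above together, a hedged end-to-end sketch (pool name and size are arbitrary; my_async_op and my_async_func come from the hypothetical example earlier):

	static int demo_pool(void)
	{
		struct btrfs_workers workers;
		struct my_async_op *op;
		int ret;

		btrfs_init_workers(&workers, "demo", 4);  /* kthreads named btrfs-demo-N */
		ret = btrfs_start_workers(&workers, 1);   /* find_worker adds more on demand */
		if (ret)
			return ret;

		op = kzalloc(sizeof(*op), GFP_NOFS);
		if (op) {
			op->work.func = my_async_func;    /* frees op when it runs */
			op->work.flags = 0;               /* must start at zero */
			btrfs_queue_worker(&workers, &op->work);
		}

		/* ... wait for completion via your own mechanism ... */
		return btrfs_stop_workers(&workers);
	}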
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 000000000000..a8c9693b75ac --- /dev/null +++ b/fs/btrfs/btrfs_inode.h | |||
@@ -0,0 +1,131 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_I__ | ||
20 | #define __BTRFS_I__ | ||
21 | |||
22 | #include "extent_map.h" | ||
23 | #include "extent_io.h" | ||
24 | #include "ordered-data.h" | ||
25 | |||
26 | /* in memory btrfs inode */ | ||
27 | struct btrfs_inode { | ||
28 | /* which subvolume this inode belongs to */ | ||
29 | struct btrfs_root *root; | ||
30 | |||
31 | /* key used to find this inode on disk. This is used by the code | ||
32 | * to read in roots of subvolumes | ||
33 | */ | ||
34 | struct btrfs_key location; | ||
35 | |||
36 | /* the extent_tree has caches of all the extent mappings to disk */ | ||
37 | struct extent_map_tree extent_tree; | ||
38 | |||
39 | /* the io_tree does range state (DIRTY, LOCKED etc) */ | ||
40 | struct extent_io_tree io_tree; | ||
41 | |||
42 | /* special utility tree used to record which mirrors have already been | ||
43 | * tried when checksums fail for a given block | ||
44 | */ | ||
45 | struct extent_io_tree io_failure_tree; | ||
46 | |||
47 | /* held while inserting or deleting extents from files */ | ||
48 | struct mutex extent_mutex; | ||
49 | |||
50 | /* held while logging the inode in tree-log.c */ | ||
51 | struct mutex log_mutex; | ||
52 | |||
53 | /* used to order data wrt metadata */ | ||
54 | struct btrfs_ordered_inode_tree ordered_tree; | ||
55 | |||
56 | /* standard acl pointers */ | ||
57 | struct posix_acl *i_acl; | ||
58 | struct posix_acl *i_default_acl; | ||
59 | |||
60 | /* for keeping track of orphaned inodes */ | ||
61 | struct list_head i_orphan; | ||
62 | |||
63 | /* list of all the delalloc inodes in the FS. There are times we need | ||
64 | * to write all the delalloc pages to disk, and this list is used | ||
65 | * to walk them all. | ||
66 | */ | ||
67 | struct list_head delalloc_inodes; | ||
68 | |||
69 | /* full 64 bit generation number, struct vfs_inode doesn't have a big | ||
70 | * enough field for this. | ||
71 | */ | ||
72 | u64 generation; | ||
73 | |||
74 | /* sequence number for NFS changes */ | ||
75 | u64 sequence; | ||
76 | |||
77 | /* | ||
78 | * transid of the trans_handle that last modified this inode | ||
79 | */ | ||
80 | u64 last_trans; | ||
81 | /* | ||
82 | * transid that last logged this inode | ||
83 | */ | ||
84 | u64 logged_trans; | ||
85 | |||
86 | /* | ||
87 | * trans that last made a change that should be fully fsync'd. This | ||
88 | * gets reset to zero each time the inode is logged | ||
89 | */ | ||
90 | u64 log_dirty_trans; | ||
91 | |||
92 | /* total number of bytes pending delalloc, used by stat to calc the | ||
93 | * real block usage of the file | ||
94 | */ | ||
95 | u64 delalloc_bytes; | ||
96 | |||
97 | /* | ||
98 | * the size of the file stored in the metadata on disk. data=ordered | ||
99 | * means the in-memory i_size might be larger than the size on disk | ||
100 | * because not all the blocks are written yet. | ||
101 | */ | ||
102 | u64 disk_i_size; | ||
103 | |||
104 | /* flags field from the on disk inode */ | ||
105 | u32 flags; | ||
106 | |||
107 | /* | ||
108 | * if this is a directory then index_cnt is the counter for the index | ||
109 | * number for new files that are created | ||
110 | */ | ||
111 | u64 index_cnt; | ||
112 | |||
113 | /* the start of block group preferred for allocations. */ | ||
114 | u64 block_group; | ||
115 | |||
116 | struct inode vfs_inode; | ||
117 | }; | ||
118 | |||
119 | static inline struct btrfs_inode *BTRFS_I(struct inode *inode) | ||
120 | { | ||
121 | return container_of(inode, struct btrfs_inode, vfs_inode); | ||
122 | } | ||
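A one-line illustration of the accessor (hypothetical call site):

	struct btrfs_root *root = BTRFS_I(inode)->root;	/* subvolume behind any VFS inode */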
123 | |||
124 | static inline void btrfs_i_size_write(struct inode *inode, u64 size) | ||
125 | { | ||
126 | inode->i_size = size; | ||
127 | BTRFS_I(inode)->disk_i_size = size; | ||
128 | } | ||
129 | |||
130 | |||
131 | #endif | ||
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 000000000000..7c4503ef6efd --- /dev/null +++ b/fs/btrfs/compat.h | |||
@@ -0,0 +1,7 @@ | |||
1 | #ifndef _COMPAT_H_ | ||
2 | #define _COMPAT_H_ | ||
3 | |||
4 | #define btrfs_drop_nlink(inode) drop_nlink(inode) | ||
5 | #define btrfs_inc_nlink(inode) inc_nlink(inode) | ||
6 | |||
7 | #endif /* _COMPAT_H_ */ | ||
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 000000000000..ee848d8585d9 --- /dev/null +++ b/fs/btrfs/compression.c | |||
@@ -0,0 +1,709 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/bio.h> | ||
21 | #include <linux/buffer_head.h> | ||
22 | #include <linux/file.h> | ||
23 | #include <linux/fs.h> | ||
24 | #include <linux/pagemap.h> | ||
25 | #include <linux/highmem.h> | ||
26 | #include <linux/time.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/string.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | #include <linux/backing-dev.h> | ||
31 | #include <linux/mpage.h> | ||
32 | #include <linux/swap.h> | ||
33 | #include <linux/writeback.h> | ||
34 | #include <linux/bit_spinlock.h> | ||
35 | #include <linux/version.h> | ||
36 | #include <linux/pagevec.h> | ||
37 | #include "compat.h" | ||
38 | #include "ctree.h" | ||
39 | #include "disk-io.h" | ||
40 | #include "transaction.h" | ||
41 | #include "btrfs_inode.h" | ||
42 | #include "volumes.h" | ||
43 | #include "ordered-data.h" | ||
44 | #include "compression.h" | ||
45 | #include "extent_io.h" | ||
46 | #include "extent_map.h" | ||
47 | |||
48 | struct compressed_bio { | ||
49 | /* number of bios pending for this compressed extent */ | ||
50 | atomic_t pending_bios; | ||
51 | |||
52 | /* the pages with the compressed data on them */ | ||
53 | struct page **compressed_pages; | ||
54 | |||
55 | /* inode that owns this data */ | ||
56 | struct inode *inode; | ||
57 | |||
58 | /* starting offset in the inode for our pages */ | ||
59 | u64 start; | ||
60 | |||
61 | /* number of bytes in the inode we're working on */ | ||
62 | unsigned long len; | ||
63 | |||
64 | /* number of bytes on disk */ | ||
65 | unsigned long compressed_len; | ||
66 | |||
67 | /* number of compressed pages in the array */ | ||
68 | unsigned long nr_pages; | ||
69 | |||
70 | /* IO errors */ | ||
71 | int errors; | ||
72 | int mirror_num; | ||
73 | |||
74 | /* for reads, this is the bio we are copying the data into */ | ||
75 | struct bio *orig_bio; | ||
76 | |||
77 | /* | ||
78 | * the start of a variable length array of checksums only | ||
79 | * used by reads | ||
80 | */ | ||
81 | u32 sums; | ||
82 | }; | ||
83 | |||
84 | static inline int compressed_bio_size(struct btrfs_root *root, | ||
85 | unsigned long disk_size) | ||
86 | { | ||
87 | u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); | ||
88 | return sizeof(struct compressed_bio) + | ||
89 | ((disk_size + root->sectorsize - 1) / root->sectorsize) * | ||
90 | csum_size; | ||
91 | } | ||
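A worked instance of the size computation, assuming 4K sectors and the default 4-byte crc32c checksum:

	/*
	 *   disk_size = 128K, sectorsize = 4K, csum_size = 4:
	 *   sizeof(struct compressed_bio) + ((131072 + 4095) / 4096) * 4
	 * = sizeof(struct compressed_bio) + 32 checksum slots (128 bytes)
	 */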
92 | |||
93 | static struct bio *compressed_bio_alloc(struct block_device *bdev, | ||
94 | u64 first_byte, gfp_t gfp_flags) | ||
95 | { | ||
96 | struct bio *bio; | ||
97 | int nr_vecs; | ||
98 | |||
99 | nr_vecs = bio_get_nr_vecs(bdev); | ||
100 | bio = bio_alloc(gfp_flags, nr_vecs); | ||
101 | |||
102 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { | ||
103 | while (!bio && (nr_vecs /= 2)) | ||
104 | bio = bio_alloc(gfp_flags, nr_vecs); | ||
105 | } | ||
106 | |||
107 | if (bio) { | ||
108 | bio->bi_size = 0; | ||
109 | bio->bi_bdev = bdev; | ||
110 | bio->bi_sector = first_byte >> 9; | ||
111 | } | ||
112 | return bio; | ||
113 | } | ||
114 | |||
115 | static int check_compressed_csum(struct inode *inode, | ||
116 | struct compressed_bio *cb, | ||
117 | u64 disk_start) | ||
118 | { | ||
119 | int ret; | ||
120 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
121 | struct page *page; | ||
122 | unsigned long i; | ||
123 | char *kaddr; | ||
124 | u32 csum; | ||
125 | u32 *cb_sum = &cb->sums; | ||
126 | |||
127 | if (btrfs_test_flag(inode, NODATASUM)) | ||
128 | return 0; | ||
129 | |||
130 | for (i = 0; i < cb->nr_pages; i++) { | ||
131 | page = cb->compressed_pages[i]; | ||
132 | csum = ~(u32)0; | ||
133 | |||
134 | kaddr = kmap_atomic(page, KM_USER0); | ||
135 | csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); | ||
136 | btrfs_csum_final(csum, (char *)&csum); | ||
137 | kunmap_atomic(kaddr, KM_USER0); | ||
138 | |||
139 | if (csum != *cb_sum) { | ||
140 | printk(KERN_INFO "btrfs csum failed ino %lu " | ||
141 | "extent %llu csum %u " | ||
142 | "wanted %u mirror %d\n", inode->i_ino, | ||
143 | (unsigned long long)disk_start, | ||
144 | csum, *cb_sum, cb->mirror_num); | ||
145 | ret = -EIO; | ||
146 | goto fail; | ||
147 | } | ||
148 | cb_sum++; | ||
149 | |||
150 | } | ||
151 | ret = 0; | ||
152 | fail: | ||
153 | return ret; | ||
154 | } | ||
155 | |||
156 | /* when we finish reading compressed pages from the disk, we | ||
157 | * decompress them and then run the bio end_io routines on the | ||
158 | * decompressed pages (in the inode address space). | ||
159 | * | ||
160 | * This allows the checksumming and other IO error handling routines | ||
161 | * to work normally | ||
162 | * | ||
163 | * The compressed pages are freed here, and it must be run | ||
164 | * in process context | ||
165 | */ | ||
166 | static void end_compressed_bio_read(struct bio *bio, int err) | ||
167 | { | ||
168 | struct extent_io_tree *tree; | ||
169 | struct compressed_bio *cb = bio->bi_private; | ||
170 | struct inode *inode; | ||
171 | struct page *page; | ||
172 | unsigned long index; | ||
173 | int ret; | ||
174 | |||
175 | if (err) | ||
176 | cb->errors = 1; | ||
177 | |||
178 | /* if there are more bios still pending for this compressed | ||
179 | * extent, just exit | ||
180 | */ | ||
181 | if (!atomic_dec_and_test(&cb->pending_bios)) | ||
182 | goto out; | ||
183 | |||
184 | inode = cb->inode; | ||
185 | ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); | ||
186 | if (ret) | ||
187 | goto csum_failed; | ||
188 | |||
189 | /* ok, we're the last bio for this extent, lets start | ||
190 | * the decompression. | ||
191 | */ | ||
192 | tree = &BTRFS_I(inode)->io_tree; | ||
193 | ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, | ||
194 | cb->start, | ||
195 | cb->orig_bio->bi_io_vec, | ||
196 | cb->orig_bio->bi_vcnt, | ||
197 | cb->compressed_len); | ||
198 | csum_failed: | ||
199 | if (ret) | ||
200 | cb->errors = 1; | ||
201 | |||
202 | /* release the compressed pages */ | ||
203 | index = 0; | ||
204 | for (index = 0; index < cb->nr_pages; index++) { | ||
205 | page = cb->compressed_pages[index]; | ||
206 | page->mapping = NULL; | ||
207 | page_cache_release(page); | ||
208 | } | ||
209 | |||
210 | /* do io completion on the original bio */ | ||
211 | if (cb->errors) { | ||
212 | bio_io_error(cb->orig_bio); | ||
213 | } else { | ||
214 | int bio_index = 0; | ||
215 | struct bio_vec *bvec = cb->orig_bio->bi_io_vec; | ||
216 | |||
217 | /* | ||
218 | * we have verified the checksum already, set page | ||
219 | * checked so the end_io handlers know about it | ||
220 | */ | ||
221 | while (bio_index < cb->orig_bio->bi_vcnt) { | ||
222 | SetPageChecked(bvec->bv_page); | ||
223 | bvec++; | ||
224 | bio_index++; | ||
225 | } | ||
226 | bio_endio(cb->orig_bio, 0); | ||
227 | } | ||
228 | |||
229 | /* finally free the cb struct */ | ||
230 | kfree(cb->compressed_pages); | ||
231 | kfree(cb); | ||
232 | out: | ||
233 | bio_put(bio); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Clear the writeback bits on all of the file | ||
238 | * pages for a compressed write | ||
239 | */ | ||
240 | static noinline int end_compressed_writeback(struct inode *inode, u64 start, | ||
241 | unsigned long ram_size) | ||
242 | { | ||
243 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
244 | unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; | ||
245 | struct page *pages[16]; | ||
246 | unsigned long nr_pages = end_index - index + 1; | ||
247 | int i; | ||
248 | int ret; | ||
249 | |||
250 | while (nr_pages > 0) { | ||
251 | ret = find_get_pages_contig(inode->i_mapping, index, | ||
252 | min_t(unsigned long, | ||
253 | nr_pages, ARRAY_SIZE(pages)), pages); | ||
254 | if (ret == 0) { | ||
255 | nr_pages -= 1; | ||
256 | index += 1; | ||
257 | continue; | ||
258 | } | ||
259 | for (i = 0; i < ret; i++) { | ||
260 | end_page_writeback(pages[i]); | ||
261 | page_cache_release(pages[i]); | ||
262 | } | ||
263 | nr_pages -= ret; | ||
264 | index += ret; | ||
265 | } | ||
266 | /* the inode may be gone now */ | ||
267 | return 0; | ||
268 | } | ||
269 | |||
270 | /* | ||
271 | * do the cleanup once all the compressed pages hit the disk. | ||
272 | * This will clear writeback on the file pages and free the compressed | ||
273 | * pages. | ||
274 | * | ||
275 | * This also calls the writeback end hooks for the file pages so that | ||
276 | * metadata and checksums can be updated in the file. | ||
277 | */ | ||
278 | static void end_compressed_bio_write(struct bio *bio, int err) | ||
279 | { | ||
280 | struct extent_io_tree *tree; | ||
281 | struct compressed_bio *cb = bio->bi_private; | ||
282 | struct inode *inode; | ||
283 | struct page *page; | ||
284 | unsigned long index; | ||
285 | |||
286 | if (err) | ||
287 | cb->errors = 1; | ||
288 | |||
289 | /* if there are more bios still pending for this compressed | ||
290 | * extent, just exit | ||
291 | */ | ||
292 | if (!atomic_dec_and_test(&cb->pending_bios)) | ||
293 | goto out; | ||
294 | |||
295 | /* ok, we're the last bio for this extent, step one is to | ||
296 | * call back into the FS and do all the end_io operations | ||
297 | */ | ||
298 | inode = cb->inode; | ||
299 | tree = &BTRFS_I(inode)->io_tree; | ||
300 | cb->compressed_pages[0]->mapping = cb->inode->i_mapping; | ||
301 | tree->ops->writepage_end_io_hook(cb->compressed_pages[0], | ||
302 | cb->start, | ||
303 | cb->start + cb->len - 1, | ||
304 | NULL, 1); | ||
305 | cb->compressed_pages[0]->mapping = NULL; | ||
306 | |||
307 | end_compressed_writeback(inode, cb->start, cb->len); | ||
308 | /* note, our inode could be gone now */ | ||
309 | |||
310 | /* | ||
311 | * release the compressed pages, these came from alloc_page and | ||
312 | * are not attached to the inode at all | ||
313 | */ | ||
314 | index = 0; | ||
315 | for (index = 0; index < cb->nr_pages; index++) { | ||
316 | page = cb->compressed_pages[index]; | ||
317 | page->mapping = NULL; | ||
318 | page_cache_release(page); | ||
319 | } | ||
320 | |||
321 | /* finally free the cb struct */ | ||
322 | kfree(cb->compressed_pages); | ||
323 | kfree(cb); | ||
324 | out: | ||
325 | bio_put(bio); | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * worker function to build and submit bios for previously compressed pages. | ||
330 | * The corresponding pages in the inode should be marked for writeback | ||
331 | * and the compressed pages should have a reference on them for dropping | ||
332 | * when the IO is complete. | ||
333 | * | ||
334 | * This also checksums the file bytes and gets things ready for | ||
335 | * the end io hooks. | ||
336 | */ | ||
337 | int btrfs_submit_compressed_write(struct inode *inode, u64 start, | ||
338 | unsigned long len, u64 disk_start, | ||
339 | unsigned long compressed_len, | ||
340 | struct page **compressed_pages, | ||
341 | unsigned long nr_pages) | ||
342 | { | ||
343 | struct bio *bio = NULL; | ||
344 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
345 | struct compressed_bio *cb; | ||
346 | unsigned long bytes_left; | ||
347 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
348 | int page_index = 0; | ||
349 | struct page *page; | ||
350 | u64 first_byte = disk_start; | ||
351 | struct block_device *bdev; | ||
352 | int ret; | ||
353 | |||
354 | WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); | ||
355 | cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); | ||
356 | atomic_set(&cb->pending_bios, 0); | ||
357 | cb->errors = 0; | ||
358 | cb->inode = inode; | ||
359 | cb->start = start; | ||
360 | cb->len = len; | ||
361 | cb->mirror_num = 0; | ||
362 | cb->compressed_pages = compressed_pages; | ||
363 | cb->compressed_len = compressed_len; | ||
364 | cb->orig_bio = NULL; | ||
365 | cb->nr_pages = nr_pages; | ||
366 | |||
367 | bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; | ||
368 | |||
369 | bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); | ||
370 | bio->bi_private = cb; | ||
371 | bio->bi_end_io = end_compressed_bio_write; | ||
372 | atomic_inc(&cb->pending_bios); | ||
373 | |||
374 | /* create and submit bios for the compressed pages */ | ||
375 | bytes_left = compressed_len; | ||
376 | for (page_index = 0; page_index < cb->nr_pages; page_index++) { | ||
377 | page = compressed_pages[page_index]; | ||
378 | page->mapping = inode->i_mapping; | ||
379 | if (bio->bi_size) | ||
380 | ret = io_tree->ops->merge_bio_hook(page, 0, | ||
381 | PAGE_CACHE_SIZE, | ||
382 | bio, 0); | ||
383 | else | ||
384 | ret = 0; | ||
385 | |||
386 | page->mapping = NULL; | ||
387 | if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < | ||
388 | PAGE_CACHE_SIZE) { | ||
389 | bio_get(bio); | ||
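			/* take an extra ref across submission so the bio
			 * can't be freed before the bio_put below runs */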
390 | |||
391 | /* | ||
392 | * inc the count before we submit the bio so the | ||
393 | * end IO handler can't drop the last reference | ||
394 | * while we are still building bios. Otherwise, the | ||
395 | * cb might get freed before we're done with it | ||
396 | */ | ||
397 | atomic_inc(&cb->pending_bios); | ||
398 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); | ||
399 | BUG_ON(ret); | ||
400 | |||
401 | ret = btrfs_csum_one_bio(root, inode, bio, start, 1); | ||
402 | BUG_ON(ret); | ||
403 | |||
404 | ret = btrfs_map_bio(root, WRITE, bio, 0, 1); | ||
405 | BUG_ON(ret); | ||
406 | |||
407 | bio_put(bio); | ||
408 | |||
409 | bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); | ||
410 | bio->bi_private = cb; | ||
411 | bio->bi_end_io = end_compressed_bio_write; | ||
412 | bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); | ||
413 | } | ||
414 | if (bytes_left < PAGE_CACHE_SIZE) { | ||
415 | printk(KERN_WARNING "bytes left %lu compress len %lu nr %lu\n", | ||
416 | bytes_left, cb->compressed_len, cb->nr_pages); | ||
417 | } | ||
418 | bytes_left -= PAGE_CACHE_SIZE; | ||
419 | first_byte += PAGE_CACHE_SIZE; | ||
420 | cond_resched(); | ||
421 | } | ||
422 | bio_get(bio); | ||
423 | |||
424 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); | ||
425 | BUG_ON(ret); | ||
426 | |||
427 | ret = btrfs_csum_one_bio(root, inode, bio, start, 1); | ||
428 | BUG_ON(ret); | ||
429 | |||
430 | ret = btrfs_map_bio(root, WRITE, bio, 0, 1); | ||
431 | BUG_ON(ret); | ||
432 | |||
433 | bio_put(bio); | ||
434 | return 0; | ||
435 | } | ||
436 | |||
437 | static noinline int add_ra_bio_pages(struct inode *inode, | ||
438 | u64 compressed_end, | ||
439 | struct compressed_bio *cb) | ||
440 | { | ||
441 | unsigned long end_index; | ||
442 | unsigned long page_index; | ||
443 | u64 last_offset; | ||
444 | u64 isize = i_size_read(inode); | ||
445 | int ret; | ||
446 | struct page *page; | ||
447 | unsigned long nr_pages = 0; | ||
448 | struct extent_map *em; | ||
449 | struct address_space *mapping = inode->i_mapping; | ||
450 | struct pagevec pvec; | ||
451 | struct extent_map_tree *em_tree; | ||
452 | struct extent_io_tree *tree; | ||
453 | u64 end; | ||
454 | int misses = 0; | ||
455 | |||
456 | page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; | ||
457 | last_offset = (page_offset(page) + PAGE_CACHE_SIZE); | ||
458 | em_tree = &BTRFS_I(inode)->extent_tree; | ||
459 | tree = &BTRFS_I(inode)->io_tree; | ||
460 | |||
461 | if (isize == 0) | ||
462 | return 0; | ||
463 | |||
464 | end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; | ||
465 | |||
466 | pagevec_init(&pvec, 0); | ||
467 | while (last_offset < compressed_end) { | ||
468 | page_index = last_offset >> PAGE_CACHE_SHIFT; | ||
469 | |||
470 | if (page_index > end_index) | ||
471 | break; | ||
472 | |||
473 | rcu_read_lock(); | ||
474 | page = radix_tree_lookup(&mapping->page_tree, page_index); | ||
475 | rcu_read_unlock(); | ||
476 | if (page) { | ||
477 | misses++; | ||
478 | if (misses > 4) | ||
479 | break; | ||
480 | goto next; | ||
481 | } | ||
482 | |||
483 | page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); | ||
484 | if (!page) | ||
485 | break; | ||
486 | |||
487 | page->index = page_index; | ||
488 | /* | ||
489 | * what we want to do here is call add_to_page_cache_lru, | ||
490 | * but that isn't exported, so we reproduce it here | ||
491 | */ | ||
492 | if (add_to_page_cache(page, mapping, | ||
493 | page->index, GFP_NOFS)) { | ||
494 | page_cache_release(page); | ||
495 | goto next; | ||
496 | } | ||
497 | |||
498 | /* open coding of lru_cache_add, also not exported */ | ||
499 | page_cache_get(page); | ||
500 | if (!pagevec_add(&pvec, page)) | ||
501 | __pagevec_lru_add_file(&pvec); | ||
502 | |||
503 | end = last_offset + PAGE_CACHE_SIZE - 1; | ||
504 | /* | ||
505 | * at this point, we have a locked page in the page cache | ||
506 | * for these bytes in the file. But, we have to make | ||
507 | * sure they map to this compressed extent on disk. | ||
508 | */ | ||
509 | set_page_extent_mapped(page); | ||
510 | lock_extent(tree, last_offset, end, GFP_NOFS); | ||
511 | spin_lock(&em_tree->lock); | ||
512 | em = lookup_extent_mapping(em_tree, last_offset, | ||
513 | PAGE_CACHE_SIZE); | ||
514 | spin_unlock(&em_tree->lock); | ||
515 | |||
516 | if (!em || last_offset < em->start || | ||
517 | (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || | ||
518 | (em->block_start >> 9) != cb->orig_bio->bi_sector) { | ||
519 | free_extent_map(em); | ||
520 | unlock_extent(tree, last_offset, end, GFP_NOFS); | ||
521 | unlock_page(page); | ||
522 | page_cache_release(page); | ||
523 | break; | ||
524 | } | ||
525 | free_extent_map(em); | ||
526 | |||
527 | if (page->index == end_index) { | ||
528 | char *userpage; | ||
529 | size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); | ||
530 | |||
531 | if (zero_offset) { | ||
532 | int zeros; | ||
533 | zeros = PAGE_CACHE_SIZE - zero_offset; | ||
534 | userpage = kmap_atomic(page, KM_USER0); | ||
535 | memset(userpage + zero_offset, 0, zeros); | ||
536 | flush_dcache_page(page); | ||
537 | kunmap_atomic(userpage, KM_USER0); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | ret = bio_add_page(cb->orig_bio, page, | ||
542 | PAGE_CACHE_SIZE, 0); | ||
543 | |||
544 | if (ret == PAGE_CACHE_SIZE) { | ||
545 | nr_pages++; | ||
546 | page_cache_release(page); | ||
547 | } else { | ||
548 | unlock_extent(tree, last_offset, end, GFP_NOFS); | ||
549 | unlock_page(page); | ||
550 | page_cache_release(page); | ||
551 | break; | ||
552 | } | ||
553 | next: | ||
554 | last_offset += PAGE_CACHE_SIZE; | ||
555 | } | ||
556 | if (pagevec_count(&pvec)) | ||
557 | __pagevec_lru_add_file(&pvec); | ||
558 | return 0; | ||
559 | } | ||
560 | |||
561 | /* | ||
562 | * for a compressed read, the bio we get passed has all the inode pages | ||
563 | * in it. We don't actually do IO on those pages but allocate new ones | ||
564 | * to hold the compressed pages on disk. | ||
565 | * | ||
566 | * bio->bi_sector points to the compressed extent on disk | ||
567 | * bio->bi_io_vec points to all of the inode pages | ||
568 | * bio->bi_vcnt is a count of pages | ||
569 | * | ||
570 | * After the compressed pages are read, we copy the bytes into the | ||
571 | * bio we were passed and then call the bio end_io calls | ||
572 | */ | ||
573 | int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | ||
574 | int mirror_num, unsigned long bio_flags) | ||
575 | { | ||
576 | struct extent_io_tree *tree; | ||
577 | struct extent_map_tree *em_tree; | ||
578 | struct compressed_bio *cb; | ||
579 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
580 | unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; | ||
581 | unsigned long compressed_len; | ||
582 | unsigned long nr_pages; | ||
583 | unsigned long page_index; | ||
584 | struct page *page; | ||
585 | struct block_device *bdev; | ||
586 | struct bio *comp_bio; | ||
587 | u64 cur_disk_byte = (u64)bio->bi_sector << 9; | ||
588 | u64 em_len; | ||
589 | u64 em_start; | ||
590 | struct extent_map *em; | ||
591 | int ret; | ||
592 | u32 *sums; | ||
593 | |||
594 | tree = &BTRFS_I(inode)->io_tree; | ||
595 | em_tree = &BTRFS_I(inode)->extent_tree; | ||
596 | |||
597 | /* we need the actual starting offset of this extent in the file */ | ||
598 | spin_lock(&em_tree->lock); | ||
599 | em = lookup_extent_mapping(em_tree, | ||
600 | page_offset(bio->bi_io_vec->bv_page), | ||
601 | PAGE_CACHE_SIZE); | ||
602 | spin_unlock(&em_tree->lock); | ||
603 | |||
604 | compressed_len = em->block_len; | ||
605 | cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); | ||
606 | atomic_set(&cb->pending_bios, 0); | ||
607 | cb->errors = 0; | ||
608 | cb->inode = inode; | ||
609 | cb->mirror_num = mirror_num; | ||
610 | sums = &cb->sums; | ||
611 | |||
612 | cb->start = em->orig_start; | ||
613 | em_len = em->len; | ||
614 | em_start = em->start; | ||
615 | |||
616 | free_extent_map(em); | ||
617 | em = NULL; | ||
618 | |||
619 | cb->len = uncompressed_len; | ||
620 | cb->compressed_len = compressed_len; | ||
621 | cb->orig_bio = bio; | ||
622 | |||
623 | nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / | ||
624 | PAGE_CACHE_SIZE; | ||
625 | cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, | ||
626 | GFP_NOFS); | ||
627 | bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; | ||
628 | |||
629 | for (page_index = 0; page_index < nr_pages; page_index++) { | ||
630 | cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | | ||
631 | __GFP_HIGHMEM); | ||
632 | } | ||
633 | cb->nr_pages = nr_pages; | ||
634 | |||
635 | add_ra_bio_pages(inode, em_start + em_len, cb); | ||
636 | |||
637 | /* include any pages we added in add_ra_bio_pages */ | ||
638 | uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; | ||
639 | cb->len = uncompressed_len; | ||
640 | |||
641 | comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); | ||
642 | comp_bio->bi_private = cb; | ||
643 | comp_bio->bi_end_io = end_compressed_bio_read; | ||
644 | atomic_inc(&cb->pending_bios); | ||
645 | |||
646 | for (page_index = 0; page_index < nr_pages; page_index++) { | ||
647 | page = cb->compressed_pages[page_index]; | ||
648 | page->mapping = inode->i_mapping; | ||
649 | page->index = em_start >> PAGE_CACHE_SHIFT; | ||
650 | |||
651 | if (comp_bio->bi_size) | ||
652 | ret = tree->ops->merge_bio_hook(page, 0, | ||
653 | PAGE_CACHE_SIZE, | ||
654 | comp_bio, 0); | ||
655 | else | ||
656 | ret = 0; | ||
657 | |||
658 | page->mapping = NULL; | ||
659 | if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < | ||
660 | PAGE_CACHE_SIZE) { | ||
661 | bio_get(comp_bio); | ||
662 | |||
663 | ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); | ||
664 | BUG_ON(ret); | ||
665 | |||
666 | /* | ||
667 | * inc the count before we submit the bio so the | ||
668 | * end IO handler can't drop the last reference | ||
669 | * while we are still building bios. Otherwise, the | ||
670 | * cb might get freed before we're done with it | ||
671 | */ | ||
672 | atomic_inc(&cb->pending_bios); | ||
673 | |||
674 | if (!btrfs_test_flag(inode, NODATASUM)) { | ||
675 | btrfs_lookup_bio_sums(root, inode, comp_bio, | ||
676 | sums); | ||
677 | } | ||
678 | sums += (comp_bio->bi_size + root->sectorsize - 1) / | ||
679 | root->sectorsize; | ||
680 | |||
681 | ret = btrfs_map_bio(root, READ, comp_bio, | ||
682 | mirror_num, 0); | ||
683 | BUG_ON(ret); | ||
684 | |||
685 | bio_put(comp_bio); | ||
686 | |||
687 | comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, | ||
688 | GFP_NOFS); | ||
689 | comp_bio->bi_private = cb; | ||
690 | comp_bio->bi_end_io = end_compressed_bio_read; | ||
691 | |||
692 | bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); | ||
693 | } | ||
694 | cur_disk_byte += PAGE_CACHE_SIZE; | ||
695 | } | ||
696 | bio_get(comp_bio); | ||
697 | |||
698 | ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); | ||
699 | BUG_ON(ret); | ||
700 | |||
701 | if (!btrfs_test_flag(inode, NODATASUM)) | ||
702 | btrfs_lookup_bio_sums(root, inode, comp_bio, sums); | ||
703 | |||
704 | ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); | ||
705 | BUG_ON(ret); | ||
706 | |||
707 | bio_put(comp_bio); | ||
708 | return 0; | ||
709 | } | ||
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 000000000000..421f5b4aa715 --- /dev/null +++ b/fs/btrfs/compression.h | |||
@@ -0,0 +1,47 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_COMPRESSION_ | ||
20 | #define __BTRFS_COMPRESSION_ | ||
21 | |||
22 | int btrfs_zlib_decompress(unsigned char *data_in, | ||
23 | struct page *dest_page, | ||
24 | unsigned long start_byte, | ||
25 | size_t srclen, size_t destlen); | ||
26 | int btrfs_zlib_compress_pages(struct address_space *mapping, | ||
27 | u64 start, unsigned long len, | ||
28 | struct page **pages, | ||
29 | unsigned long nr_dest_pages, | ||
30 | unsigned long *out_pages, | ||
31 | unsigned long *total_in, | ||
32 | unsigned long *total_out, | ||
33 | unsigned long max_out); | ||
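/*
 * (parameter glossary, inferred from the call sites: pages[] receives up
 * to nr_dest_pages freshly allocated output pages; *out_pages reports how
 * many were used; *total_in / *total_out count uncompressed bytes consumed
 * and compressed bytes produced; compression gives up once the output
 * would exceed max_out.)
 */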
34 | int btrfs_zlib_decompress_biovec(struct page **pages_in, | ||
35 | u64 disk_start, | ||
36 | struct bio_vec *bvec, | ||
37 | int vcnt, | ||
38 | size_t srclen); | ||
39 | void btrfs_zlib_exit(void); | ||
40 | int btrfs_submit_compressed_write(struct inode *inode, u64 start, | ||
41 | unsigned long len, u64 disk_start, | ||
42 | unsigned long compressed_len, | ||
43 | struct page **compressed_pages, | ||
44 | unsigned long nr_pages); | ||
45 | int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | ||
46 | int mirror_num, unsigned long bio_flags); | ||
47 | #endif | ||
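A hedged sketch of how the write-side half of this API fits together, using only the declarations above; the function name, locals, and the assumption that disk_start comes from the extent allocator are illustrative, not from this patch:

	/* compress one extent, then hand it to the compressed write path */
	static int example_compress_extent(struct inode *inode, u64 start,
					   unsigned long len, u64 disk_start,
					   struct page **pages,
					   unsigned long max_pages)
	{
		unsigned long nr_pages = 0;
		unsigned long total_in = 0;
		unsigned long total_out = 0;
		int ret;

		ret = btrfs_zlib_compress_pages(inode->i_mapping, start, len,
						pages, max_pages, &nr_pages,
						&total_in, &total_out, len);
		if (ret || total_out >= len)
			return -E2BIG;	/* did not shrink; write uncompressed */

		/* total_in bytes of input were consumed; disk_start is
		 * assumed to have been reserved by the extent allocator */
		return btrfs_submit_compressed_write(inode, start, total_in,
						     disk_start, total_out,
						     pages, nr_pages);
	}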
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h new file mode 100644 index 000000000000..6e1b3de36700 --- /dev/null +++ b/fs/btrfs/crc32c.h | |||
@@ -0,0 +1,29 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_CRC32C__ | ||
20 | #define __BTRFS_CRC32C__ | ||
21 | #include <linux/crc32c.h> | ||
22 | |||
23 | /* | ||
24 | * this file used to do more for selecting the HW version of crc32c, | ||
25 | * perhaps it will again one day. | ||
26 | */ | ||
27 | #define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) | ||
28 | #endif | ||
29 | |||
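With the wrapper above, checksumming an in-memory buffer reduces to a single crc32c() call. In this sketch, buf and len are illustrative, and the ~0 seed is the conventional starting value rather than anything this header mandates:

	u32 csum = btrfs_crc32c(~(u32)0, buf, len);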
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 000000000000..9e46c0776816 --- /dev/null +++ b/fs/btrfs/ctree.c | |||
@@ -0,0 +1,3953 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007,2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include "ctree.h" | ||
21 | #include "disk-io.h" | ||
22 | #include "transaction.h" | ||
23 | #include "print-tree.h" | ||
24 | #include "locking.h" | ||
25 | |||
26 | static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root | ||
27 | *root, struct btrfs_path *path, int level); | ||
28 | static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root | ||
29 | *root, struct btrfs_key *ins_key, | ||
30 | struct btrfs_path *path, int data_size, int extend); | ||
31 | static int push_node_left(struct btrfs_trans_handle *trans, | ||
32 | struct btrfs_root *root, struct extent_buffer *dst, | ||
33 | struct extent_buffer *src, int empty); | ||
34 | static int balance_node_right(struct btrfs_trans_handle *trans, | ||
35 | struct btrfs_root *root, | ||
36 | struct extent_buffer *dst_buf, | ||
37 | struct extent_buffer *src_buf); | ||
38 | static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
39 | struct btrfs_path *path, int level, int slot); | ||
40 | |||
41 | inline void btrfs_init_path(struct btrfs_path *p) | ||
42 | { | ||
43 | memset(p, 0, sizeof(*p)); | ||
44 | } | ||
45 | |||
46 | struct btrfs_path *btrfs_alloc_path(void) | ||
47 | { | ||
48 | struct btrfs_path *path; | ||
49 | path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); | ||
50 | if (path) { | ||
51 | btrfs_init_path(path); | ||
52 | path->reada = 1; | ||
53 | } | ||
54 | return path; | ||
55 | } | ||
56 | |||
57 | /* this also releases the path */ | ||
58 | void btrfs_free_path(struct btrfs_path *p) | ||
59 | { | ||
60 | btrfs_release_path(NULL, p); | ||
61 | kmem_cache_free(btrfs_path_cachep, p); | ||
62 | } | ||
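Taken together, the two helpers above define the path lifecycle; a minimal usage sketch (error handling elided, names illustrative):

	struct btrfs_path *path;

	path = btrfs_alloc_path();	/* zeroed, with readahead enabled */
	if (!path)
		return -ENOMEM;
	/* ... btrfs_search_slot() and friends fill in nodes/slots/locks ... */
	btrfs_free_path(path);		/* drops locks and refs, then frees */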
63 | |||
64 | /* | ||
65 | * path release drops references on the extent buffers in the path | ||
66 | * and it drops any locks held by this path | ||
67 | * | ||
68 | * It is safe to call this on paths that hold no locks or extent buffers. | ||
69 | */ | ||
70 | noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) | ||
71 | { | ||
72 | int i; | ||
73 | |||
74 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) { | ||
75 | p->slots[i] = 0; | ||
76 | if (!p->nodes[i]) | ||
77 | continue; | ||
78 | if (p->locks[i]) { | ||
79 | btrfs_tree_unlock(p->nodes[i]); | ||
80 | p->locks[i] = 0; | ||
81 | } | ||
82 | free_extent_buffer(p->nodes[i]); | ||
83 | p->nodes[i] = NULL; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * safely gets a reference on the root node of a tree. A lock | ||
89 | * is not taken, so a concurrent writer may put a different node | ||
90 | * at the root of the tree. See btrfs_lock_root_node for the | ||
91 | * looping required. | ||
92 | * | ||
93 | * The extent buffer returned by this has a reference taken, so | ||
94 | * it won't disappear. It may stop being the root of the tree | ||
95 | * at any time because there are no locks held. | ||
96 | */ | ||
97 | struct extent_buffer *btrfs_root_node(struct btrfs_root *root) | ||
98 | { | ||
99 | struct extent_buffer *eb; | ||
100 | spin_lock(&root->node_lock); | ||
101 | eb = root->node; | ||
102 | extent_buffer_get(eb); | ||
103 | spin_unlock(&root->node_lock); | ||
104 | return eb; | ||
105 | } | ||
106 | |||
107 | /* loop around taking references on and locking the root node of the | ||
108 | * tree until you end up with a lock on the root. A locked buffer | ||
109 | * is returned, with a reference held. | ||
110 | */ | ||
111 | struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) | ||
112 | { | ||
113 | struct extent_buffer *eb; | ||
114 | |||
115 | while (1) { | ||
116 | eb = btrfs_root_node(root); | ||
117 | btrfs_tree_lock(eb); | ||
118 | |||
119 | spin_lock(&root->node_lock); | ||
120 | if (eb == root->node) { | ||
121 | spin_unlock(&root->node_lock); | ||
122 | break; | ||
123 | } | ||
124 | spin_unlock(&root->node_lock); | ||
125 | |||
126 | btrfs_tree_unlock(eb); | ||
127 | free_extent_buffer(eb); | ||
128 | } | ||
129 | return eb; | ||
130 | } | ||
131 | |||
132 | /* cow-only roots (everything that is not a reference counted cow subvolume) | ||
133 | * just get put onto a simple dirty list. transaction.c walks this to make | ||
134 | * sure they get properly updated on disk. | ||
135 | */ | ||
136 | static void add_root_to_dirty_list(struct btrfs_root *root) | ||
137 | { | ||
138 | if (root->track_dirty && list_empty(&root->dirty_list)) { | ||
139 | list_add(&root->dirty_list, | ||
140 | &root->fs_info->dirty_cowonly_roots); | ||
141 | } | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | * used by snapshot creation to make a copy of a root for a tree with | ||
146 | * a given objectid. The buffer with the new root node is returned in | ||
147 | * cow_ret, and this function returns zero on success or a negative error code. | ||
148 | */ | ||
149 | int btrfs_copy_root(struct btrfs_trans_handle *trans, | ||
150 | struct btrfs_root *root, | ||
151 | struct extent_buffer *buf, | ||
152 | struct extent_buffer **cow_ret, u64 new_root_objectid) | ||
153 | { | ||
154 | struct extent_buffer *cow; | ||
155 | u32 nritems; | ||
156 | int ret = 0; | ||
157 | int level; | ||
158 | struct btrfs_root *new_root; | ||
159 | |||
160 | new_root = kmalloc(sizeof(*new_root), GFP_NOFS); | ||
161 | if (!new_root) | ||
162 | return -ENOMEM; | ||
163 | |||
164 | memcpy(new_root, root, sizeof(*new_root)); | ||
165 | new_root->root_key.objectid = new_root_objectid; | ||
166 | |||
167 | WARN_ON(root->ref_cows && trans->transid != | ||
168 | root->fs_info->running_transaction->transid); | ||
169 | WARN_ON(root->ref_cows && trans->transid != root->last_trans); | ||
170 | |||
171 | level = btrfs_header_level(buf); | ||
172 | nritems = btrfs_header_nritems(buf); | ||
173 | |||
174 | cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, | ||
175 | new_root_objectid, trans->transid, | ||
176 | level, buf->start, 0); | ||
177 | if (IS_ERR(cow)) { | ||
178 | kfree(new_root); | ||
179 | return PTR_ERR(cow); | ||
180 | } | ||
181 | |||
182 | copy_extent_buffer(cow, buf, 0, 0, cow->len); | ||
183 | btrfs_set_header_bytenr(cow, cow->start); | ||
184 | btrfs_set_header_generation(cow, trans->transid); | ||
185 | btrfs_set_header_owner(cow, new_root_objectid); | ||
186 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); | ||
187 | |||
188 | write_extent_buffer(cow, root->fs_info->fsid, | ||
189 | (unsigned long)btrfs_header_fsid(cow), | ||
190 | BTRFS_FSID_SIZE); | ||
191 | |||
192 | WARN_ON(btrfs_header_generation(buf) > trans->transid); | ||
193 | ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); | ||
194 | kfree(new_root); | ||
195 | |||
196 | if (ret) | ||
197 | return ret; | ||
198 | |||
199 | btrfs_mark_buffer_dirty(cow); | ||
200 | *cow_ret = cow; | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * does the dirty work in cow of a single block. The parent block (if | ||
206 | * supplied) is updated to point to the new cow copy. The new buffer is marked | ||
207 | * dirty and returned locked. If you modify the block it needs to be marked | ||
208 | * dirty again. | ||
209 | * | ||
210 | * search_start -- an allocation hint for the new block | ||
211 | * | ||
212 | * empty_size -- a hint that you plan on doing more cow. This is the size in | ||
213 | * bytes of free space the allocator should try to find next to the block it returns. | ||
214 | * This is just a hint and may be ignored by the allocator. | ||
215 | * | ||
216 | * prealloc_dest -- if you have already reserved a destination for the cow, | ||
217 | * this uses that block instead of allocating a new one. | ||
218 | * btrfs_alloc_reserved_extent is used to finish the allocation. | ||
219 | */ | ||
220 | static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | ||
221 | struct btrfs_root *root, | ||
222 | struct extent_buffer *buf, | ||
223 | struct extent_buffer *parent, int parent_slot, | ||
224 | struct extent_buffer **cow_ret, | ||
225 | u64 search_start, u64 empty_size, | ||
226 | u64 prealloc_dest) | ||
227 | { | ||
228 | u64 parent_start; | ||
229 | struct extent_buffer *cow; | ||
230 | u32 nritems; | ||
231 | int ret = 0; | ||
232 | int level; | ||
233 | int unlock_orig = 0; | ||
234 | |||
235 | if (*cow_ret == buf) | ||
236 | unlock_orig = 1; | ||
237 | |||
238 | WARN_ON(!btrfs_tree_locked(buf)); | ||
239 | |||
240 | if (parent) | ||
241 | parent_start = parent->start; | ||
242 | else | ||
243 | parent_start = 0; | ||
244 | |||
245 | WARN_ON(root->ref_cows && trans->transid != | ||
246 | root->fs_info->running_transaction->transid); | ||
247 | WARN_ON(root->ref_cows && trans->transid != root->last_trans); | ||
248 | |||
249 | level = btrfs_header_level(buf); | ||
250 | nritems = btrfs_header_nritems(buf); | ||
251 | |||
252 | if (prealloc_dest) { | ||
253 | struct btrfs_key ins; | ||
254 | |||
255 | ins.objectid = prealloc_dest; | ||
256 | ins.offset = buf->len; | ||
257 | ins.type = BTRFS_EXTENT_ITEM_KEY; | ||
258 | |||
259 | ret = btrfs_alloc_reserved_extent(trans, root, parent_start, | ||
260 | root->root_key.objectid, | ||
261 | trans->transid, level, &ins); | ||
262 | BUG_ON(ret); | ||
263 | cow = btrfs_init_new_buffer(trans, root, prealloc_dest, | ||
264 | buf->len); | ||
265 | } else { | ||
266 | cow = btrfs_alloc_free_block(trans, root, buf->len, | ||
267 | parent_start, | ||
268 | root->root_key.objectid, | ||
269 | trans->transid, level, | ||
270 | search_start, empty_size); | ||
271 | } | ||
272 | if (IS_ERR(cow)) | ||
273 | return PTR_ERR(cow); | ||
274 | |||
275 | copy_extent_buffer(cow, buf, 0, 0, cow->len); | ||
276 | btrfs_set_header_bytenr(cow, cow->start); | ||
277 | btrfs_set_header_generation(cow, trans->transid); | ||
278 | btrfs_set_header_owner(cow, root->root_key.objectid); | ||
279 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); | ||
280 | |||
281 | write_extent_buffer(cow, root->fs_info->fsid, | ||
282 | (unsigned long)btrfs_header_fsid(cow), | ||
283 | BTRFS_FSID_SIZE); | ||
284 | |||
285 | WARN_ON(btrfs_header_generation(buf) > trans->transid); | ||
286 | if (btrfs_header_generation(buf) != trans->transid) { | ||
287 | u32 nr_extents; | ||
288 | ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); | ||
289 | if (ret) | ||
290 | return ret; | ||
291 | |||
292 | ret = btrfs_cache_ref(trans, root, buf, nr_extents); | ||
293 | WARN_ON(ret); | ||
294 | } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { | ||
295 | /* | ||
296 | * There are only two places that can drop a reference to | ||
297 | * tree blocks owned by living reloc trees, one is here, | ||
298 | * the other place is btrfs_drop_subtree. In both places, | ||
299 | * we check reference count while tree block is locked. | ||
300 | * Furthermore, if reference count is one, it won't get | ||
301 | * increased by someone else. | ||
302 | */ | ||
303 | u32 refs; | ||
304 | ret = btrfs_lookup_extent_ref(trans, root, buf->start, | ||
305 | buf->len, &refs); | ||
306 | BUG_ON(ret); | ||
307 | if (refs == 1) { | ||
308 | ret = btrfs_update_ref(trans, root, buf, cow, | ||
309 | 0, nritems); | ||
310 | clean_tree_block(trans, root, buf); | ||
311 | } else { | ||
312 | ret = btrfs_inc_ref(trans, root, buf, cow, NULL); | ||
313 | } | ||
314 | BUG_ON(ret); | ||
315 | } else { | ||
316 | ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); | ||
317 | if (ret) | ||
318 | return ret; | ||
319 | clean_tree_block(trans, root, buf); | ||
320 | } | ||
321 | |||
322 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
323 | ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); | ||
324 | WARN_ON(ret); | ||
325 | } | ||
326 | |||
327 | if (buf == root->node) { | ||
328 | WARN_ON(parent && parent != buf); | ||
329 | |||
330 | spin_lock(&root->node_lock); | ||
331 | root->node = cow; | ||
332 | extent_buffer_get(cow); | ||
333 | spin_unlock(&root->node_lock); | ||
334 | |||
335 | if (buf != root->commit_root) { | ||
336 | btrfs_free_extent(trans, root, buf->start, | ||
337 | buf->len, buf->start, | ||
338 | root->root_key.objectid, | ||
339 | btrfs_header_generation(buf), | ||
340 | level, 1); | ||
341 | } | ||
342 | free_extent_buffer(buf); | ||
343 | add_root_to_dirty_list(root); | ||
344 | } else { | ||
345 | btrfs_set_node_blockptr(parent, parent_slot, | ||
346 | cow->start); | ||
347 | WARN_ON(trans->transid == 0); | ||
348 | btrfs_set_node_ptr_generation(parent, parent_slot, | ||
349 | trans->transid); | ||
350 | btrfs_mark_buffer_dirty(parent); | ||
351 | WARN_ON(btrfs_header_generation(parent) != trans->transid); | ||
352 | btrfs_free_extent(trans, root, buf->start, buf->len, | ||
353 | parent_start, btrfs_header_owner(parent), | ||
354 | btrfs_header_generation(parent), level, 1); | ||
355 | } | ||
356 | if (unlock_orig) | ||
357 | btrfs_tree_unlock(buf); | ||
358 | free_extent_buffer(buf); | ||
359 | btrfs_mark_buffer_dirty(cow); | ||
360 | *cow_ret = cow; | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * cows a single block, see __btrfs_cow_block for the real work. | ||
366 | * This version of it has extra checks so that a block isn't cow'd more than | ||
367 | * once per transaction, as long as it hasn't been written yet | ||
368 | */ | ||
369 | noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, | ||
370 | struct btrfs_root *root, struct extent_buffer *buf, | ||
371 | struct extent_buffer *parent, int parent_slot, | ||
372 | struct extent_buffer **cow_ret, u64 prealloc_dest) | ||
373 | { | ||
374 | u64 search_start; | ||
375 | int ret; | ||
376 | |||
377 | if (trans->transaction != root->fs_info->running_transaction) { | ||
378 | printk(KERN_CRIT "trans %llu running %llu\n", | ||
379 | (unsigned long long)trans->transid, | ||
380 | (unsigned long long) | ||
381 | root->fs_info->running_transaction->transid); | ||
382 | WARN_ON(1); | ||
383 | } | ||
384 | if (trans->transid != root->fs_info->generation) { | ||
385 | printk(KERN_CRIT "trans %llu running %llu\n", | ||
386 | (unsigned long long)trans->transid, | ||
387 | (unsigned long long)root->fs_info->generation); | ||
388 | WARN_ON(1); | ||
389 | } | ||
390 | |||
391 | spin_lock(&root->fs_info->hash_lock); | ||
392 | if (btrfs_header_generation(buf) == trans->transid && | ||
393 | btrfs_header_owner(buf) == root->root_key.objectid && | ||
394 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { | ||
395 | *cow_ret = buf; | ||
396 | spin_unlock(&root->fs_info->hash_lock); | ||
397 | WARN_ON(prealloc_dest); | ||
398 | return 0; | ||
399 | } | ||
400 | spin_unlock(&root->fs_info->hash_lock); | ||
401 | search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); | ||
402 | ret = __btrfs_cow_block(trans, root, buf, parent, | ||
403 | parent_slot, cow_ret, search_start, 0, | ||
404 | prealloc_dest); | ||
405 | return ret; | ||
406 | } | ||
407 | |||
408 | /* | ||
409 | * helper function for defrag to decide if two blocks pointed to by a | ||
410 | * node are actually close by | ||
411 | */ | ||
412 | static int close_blocks(u64 blocknr, u64 other, u32 blocksize) | ||
413 | { | ||
414 | if (blocknr < other && other - (blocknr + blocksize) < 32768) | ||
415 | return 1; | ||
416 | if (blocknr > other && blocknr - (other + blocksize) < 32768) | ||
417 | return 1; | ||
418 | return 0; | ||
419 | } | ||
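A worked example of the threshold: with 4K blocks, blocknr = 100 * 4096 and other = 104 * 4096 leave a gap of three blocks (12288 bytes), which is under the 32768-byte cutoff, so close_blocks() returns 1 and defrag leaves the pair where it is.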
420 | |||
421 | /* | ||
422 | * compare two keys in a memcmp fashion | ||
423 | */ | ||
424 | static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) | ||
425 | { | ||
426 | struct btrfs_key k1; | ||
427 | |||
428 | btrfs_disk_key_to_cpu(&k1, disk); | ||
429 | |||
430 | if (k1.objectid > k2->objectid) | ||
431 | return 1; | ||
432 | if (k1.objectid < k2->objectid) | ||
433 | return -1; | ||
434 | if (k1.type > k2->type) | ||
435 | return 1; | ||
436 | if (k1.type < k2->type) | ||
437 | return -1; | ||
438 | if (k1.offset > k2->offset) | ||
439 | return 1; | ||
440 | if (k1.offset < k2->offset) | ||
441 | return -1; | ||
442 | return 0; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * same as comp_keys only with two btrfs_key's | ||
447 | */ | ||
448 | static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) | ||
449 | { | ||
450 | if (k1->objectid > k2->objectid) | ||
451 | return 1; | ||
452 | if (k1->objectid < k2->objectid) | ||
453 | return -1; | ||
454 | if (k1->type > k2->type) | ||
455 | return 1; | ||
456 | if (k1->type < k2->type) | ||
457 | return -1; | ||
458 | if (k1->offset > k2->offset) | ||
459 | return 1; | ||
460 | if (k1->offset < k2->offset) | ||
461 | return -1; | ||
462 | return 0; | ||
463 | } | ||
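Both comparators impose the same total order: objectid first, then type, then offset. A hedged illustration with arbitrary field values:

	struct btrfs_key a = { .objectid = 5, .type = 1, .offset = 100 };
	struct btrfs_key b = { .objectid = 5, .type = 2, .offset = 0 };

	/* comp_cpu_keys(&a, &b) < 0: equal objectids, so the smaller
	 * type wins and the offsets are never consulted */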
464 | |||
465 | /* | ||
466 | * this is used by the defrag code to go through all the | ||
467 | * leaves pointed to by a node and reallocate them so that | ||
468 | * disk order is close to key order | ||
469 | */ | ||
470 | int btrfs_realloc_node(struct btrfs_trans_handle *trans, | ||
471 | struct btrfs_root *root, struct extent_buffer *parent, | ||
472 | int start_slot, int cache_only, u64 *last_ret, | ||
473 | struct btrfs_key *progress) | ||
474 | { | ||
475 | struct extent_buffer *cur; | ||
476 | u64 blocknr; | ||
477 | u64 gen; | ||
478 | u64 search_start = *last_ret; | ||
479 | u64 last_block = 0; | ||
480 | u64 other; | ||
481 | u32 parent_nritems; | ||
482 | int end_slot; | ||
483 | int i; | ||
484 | int err = 0; | ||
485 | int parent_level; | ||
486 | int uptodate; | ||
487 | u32 blocksize; | ||
488 | int progress_passed = 0; | ||
489 | struct btrfs_disk_key disk_key; | ||
490 | |||
491 | parent_level = btrfs_header_level(parent); | ||
492 | if (cache_only && parent_level != 1) | ||
493 | return 0; | ||
494 | |||
495 | if (trans->transaction != root->fs_info->running_transaction) | ||
496 | WARN_ON(1); | ||
497 | if (trans->transid != root->fs_info->generation) | ||
498 | WARN_ON(1); | ||
499 | |||
500 | parent_nritems = btrfs_header_nritems(parent); | ||
501 | blocksize = btrfs_level_size(root, parent_level - 1); | ||
502 | end_slot = parent_nritems; | ||
503 | |||
504 | if (parent_nritems == 1) | ||
505 | return 0; | ||
506 | |||
507 | for (i = start_slot; i < end_slot; i++) { | ||
508 | int close = 1; | ||
509 | |||
510 | if (!parent->map_token) { | ||
511 | map_extent_buffer(parent, | ||
512 | btrfs_node_key_ptr_offset(i), | ||
513 | sizeof(struct btrfs_key_ptr), | ||
514 | &parent->map_token, &parent->kaddr, | ||
515 | &parent->map_start, &parent->map_len, | ||
516 | KM_USER1); | ||
517 | } | ||
518 | btrfs_node_key(parent, &disk_key, i); | ||
519 | if (!progress_passed && comp_keys(&disk_key, progress) < 0) | ||
520 | continue; | ||
521 | |||
522 | progress_passed = 1; | ||
523 | blocknr = btrfs_node_blockptr(parent, i); | ||
524 | gen = btrfs_node_ptr_generation(parent, i); | ||
525 | if (last_block == 0) | ||
526 | last_block = blocknr; | ||
527 | |||
528 | if (i > 0) { | ||
529 | other = btrfs_node_blockptr(parent, i - 1); | ||
530 | close = close_blocks(blocknr, other, blocksize); | ||
531 | } | ||
532 | if (!close && i < end_slot - 2) { | ||
533 | other = btrfs_node_blockptr(parent, i + 1); | ||
534 | close = close_blocks(blocknr, other, blocksize); | ||
535 | } | ||
536 | if (close) { | ||
537 | last_block = blocknr; | ||
538 | continue; | ||
539 | } | ||
540 | if (parent->map_token) { | ||
541 | unmap_extent_buffer(parent, parent->map_token, | ||
542 | KM_USER1); | ||
543 | parent->map_token = NULL; | ||
544 | } | ||
545 | |||
546 | cur = btrfs_find_tree_block(root, blocknr, blocksize); | ||
547 | if (cur) | ||
548 | uptodate = btrfs_buffer_uptodate(cur, gen); | ||
549 | else | ||
550 | uptodate = 0; | ||
551 | if (!cur || !uptodate) { | ||
552 | if (cache_only) { | ||
553 | free_extent_buffer(cur); | ||
554 | continue; | ||
555 | } | ||
556 | if (!cur) { | ||
557 | cur = read_tree_block(root, blocknr, | ||
558 | blocksize, gen); | ||
559 | } else if (!uptodate) { | ||
560 | btrfs_read_buffer(cur, gen); | ||
561 | } | ||
562 | } | ||
563 | if (search_start == 0) | ||
564 | search_start = last_block; | ||
565 | |||
566 | btrfs_tree_lock(cur); | ||
567 | err = __btrfs_cow_block(trans, root, cur, parent, i, | ||
568 | &cur, search_start, | ||
569 | min(16 * blocksize, | ||
570 | (end_slot - i) * blocksize), 0); | ||
571 | if (err) { | ||
572 | btrfs_tree_unlock(cur); | ||
573 | free_extent_buffer(cur); | ||
574 | break; | ||
575 | } | ||
576 | search_start = cur->start; | ||
577 | last_block = cur->start; | ||
578 | *last_ret = search_start; | ||
579 | btrfs_tree_unlock(cur); | ||
580 | free_extent_buffer(cur); | ||
581 | } | ||
582 | if (parent->map_token) { | ||
583 | unmap_extent_buffer(parent, parent->map_token, | ||
584 | KM_USER1); | ||
585 | parent->map_token = NULL; | ||
586 | } | ||
587 | return err; | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * The leaf data grows from end to front in the node. | ||
592 | * This returns the offset of the start of the last item, | ||
593 | * which is the bottom of the leaf data stack. | ||
594 | */ | ||
595 | static inline unsigned int leaf_data_end(struct btrfs_root *root, | ||
596 | struct extent_buffer *leaf) | ||
597 | { | ||
598 | u32 nr = btrfs_header_nritems(leaf); | ||
599 | if (nr == 0) | ||
600 | return BTRFS_LEAF_DATA_SIZE(root); | ||
601 | return btrfs_item_offset_nr(leaf, nr - 1); | ||
602 | } | ||
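A sketch of the leaf layout the comment above describes (illustrative, not from the patch):

	/*
	 * [header][item 0][item 1] ... free space ... [data 1][data 0]
	 *  items grow this way ->              <- data grows this way
	 *
	 * so the offset of the last item's data is the lower edge of the
	 * data stack, and an empty leaf has BTRFS_LEAF_DATA_SIZE free.
	 */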
603 | |||
604 | /* | ||
605 | * extra debugging checks to make sure all the keys in a node are | ||
606 | * well formed and in the proper order | ||
607 | */ | ||
608 | static int check_node(struct btrfs_root *root, struct btrfs_path *path, | ||
609 | int level) | ||
610 | { | ||
611 | struct extent_buffer *parent = NULL; | ||
612 | struct extent_buffer *node = path->nodes[level]; | ||
613 | struct btrfs_disk_key parent_key; | ||
614 | struct btrfs_disk_key node_key; | ||
615 | int parent_slot; | ||
616 | int slot; | ||
617 | struct btrfs_key cpukey; | ||
618 | u32 nritems = btrfs_header_nritems(node); | ||
619 | |||
620 | if (path->nodes[level + 1]) | ||
621 | parent = path->nodes[level + 1]; | ||
622 | |||
623 | slot = path->slots[level]; | ||
624 | BUG_ON(nritems == 0); | ||
625 | if (parent) { | ||
626 | parent_slot = path->slots[level + 1]; | ||
627 | btrfs_node_key(parent, &parent_key, parent_slot); | ||
628 | btrfs_node_key(node, &node_key, 0); | ||
629 | BUG_ON(memcmp(&parent_key, &node_key, | ||
630 | sizeof(struct btrfs_disk_key))); | ||
631 | BUG_ON(btrfs_node_blockptr(parent, parent_slot) != | ||
632 | btrfs_header_bytenr(node)); | ||
633 | } | ||
634 | BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); | ||
635 | if (slot != 0) { | ||
636 | btrfs_node_key_to_cpu(node, &cpukey, slot - 1); | ||
637 | btrfs_node_key(node, &node_key, slot); | ||
638 | BUG_ON(comp_keys(&node_key, &cpukey) <= 0); | ||
639 | } | ||
640 | if (slot < nritems - 1) { | ||
641 | btrfs_node_key_to_cpu(node, &cpukey, slot + 1); | ||
642 | btrfs_node_key(node, &node_key, slot); | ||
643 | BUG_ON(comp_keys(&node_key, &cpukey) >= 0); | ||
644 | } | ||
645 | return 0; | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * extra checking to make sure all the items in a leaf are | ||
650 | * well formed and in the proper order | ||
651 | */ | ||
652 | static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, | ||
653 | int level) | ||
654 | { | ||
655 | struct extent_buffer *leaf = path->nodes[level]; | ||
656 | struct extent_buffer *parent = NULL; | ||
657 | int parent_slot; | ||
658 | struct btrfs_key cpukey; | ||
659 | struct btrfs_disk_key parent_key; | ||
660 | struct btrfs_disk_key leaf_key; | ||
661 | int slot = path->slots[0]; | ||
662 | |||
663 | u32 nritems = btrfs_header_nritems(leaf); | ||
664 | |||
665 | if (path->nodes[level + 1]) | ||
666 | parent = path->nodes[level + 1]; | ||
667 | |||
668 | if (nritems == 0) | ||
669 | return 0; | ||
670 | |||
671 | if (parent) { | ||
672 | parent_slot = path->slots[level + 1]; | ||
673 | btrfs_node_key(parent, &parent_key, parent_slot); | ||
674 | btrfs_item_key(leaf, &leaf_key, 0); | ||
675 | |||
676 | BUG_ON(memcmp(&parent_key, &leaf_key, | ||
677 | sizeof(struct btrfs_disk_key))); | ||
678 | BUG_ON(btrfs_node_blockptr(parent, parent_slot) != | ||
679 | btrfs_header_bytenr(leaf)); | ||
680 | } | ||
681 | if (slot != 0 && slot < nritems - 1) { | ||
682 | btrfs_item_key(leaf, &leaf_key, slot); | ||
683 | btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); | ||
684 | if (comp_keys(&leaf_key, &cpukey) <= 0) { | ||
685 | btrfs_print_leaf(root, leaf); | ||
686 | printk(KERN_CRIT "slot %d offset bad key\n", slot); | ||
687 | BUG_ON(1); | ||
688 | } | ||
689 | if (btrfs_item_offset_nr(leaf, slot - 1) != | ||
690 | btrfs_item_end_nr(leaf, slot)) { | ||
691 | btrfs_print_leaf(root, leaf); | ||
692 | printk(KERN_CRIT "slot %d offset bad\n", slot); | ||
693 | BUG_ON(1); | ||
694 | } | ||
695 | } | ||
696 | if (slot < nritems - 1) { | ||
697 | btrfs_item_key(leaf, &leaf_key, slot); | ||
698 | btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); | ||
699 | BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); | ||
700 | if (btrfs_item_offset_nr(leaf, slot) != | ||
701 | btrfs_item_end_nr(leaf, slot + 1)) { | ||
702 | btrfs_print_leaf(root, leaf); | ||
703 | printk(KERN_CRIT "slot %d offset bad\n", slot); | ||
704 | BUG_ON(1); | ||
705 | } | ||
706 | } | ||
707 | BUG_ON(btrfs_item_offset_nr(leaf, 0) + | ||
708 | btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); | ||
709 | return 0; | ||
710 | } | ||
711 | |||
712 | static noinline int check_block(struct btrfs_root *root, | ||
713 | struct btrfs_path *path, int level) | ||
714 | { | ||
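	/* the verification below is disabled by this unconditional
	 * return; remove it to re-enable the node and leaf checks */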
715 | return 0; | ||
716 | if (level == 0) | ||
717 | return check_leaf(root, path, level); | ||
718 | return check_node(root, path, level); | ||
719 | } | ||
720 | |||
721 | /* | ||
722 | * search for key in the extent_buffer. The items start at offset p, | ||
723 | * and they are item_size apart. There are 'max' items in p. | ||
724 | * | ||
725 | * the slot in the array is returned via slot, and it points to | ||
726 | * the place where you would insert key if it is not found in | ||
727 | * the array. | ||
728 | * | ||
729 | * slot may point to max if the key is bigger than all of the keys | ||
730 | */ | ||
731 | static noinline int generic_bin_search(struct extent_buffer *eb, | ||
732 | unsigned long p, | ||
733 | int item_size, struct btrfs_key *key, | ||
734 | int max, int *slot) | ||
735 | { | ||
736 | int low = 0; | ||
737 | int high = max; | ||
738 | int mid; | ||
739 | int ret; | ||
740 | struct btrfs_disk_key *tmp = NULL; | ||
741 | struct btrfs_disk_key unaligned; | ||
742 | unsigned long offset; | ||
743 | char *map_token = NULL; | ||
744 | char *kaddr = NULL; | ||
745 | unsigned long map_start = 0; | ||
746 | unsigned long map_len = 0; | ||
747 | int err; | ||
748 | |||
749 | while (low < high) { | ||
750 | mid = (low + high) / 2; | ||
751 | offset = p + mid * item_size; | ||
752 | |||
753 | if (!map_token || offset < map_start || | ||
754 | (offset + sizeof(struct btrfs_disk_key)) > | ||
755 | map_start + map_len) { | ||
756 | if (map_token) { | ||
757 | unmap_extent_buffer(eb, map_token, KM_USER0); | ||
758 | map_token = NULL; | ||
759 | } | ||
760 | |||
761 | err = map_private_extent_buffer(eb, offset, | ||
762 | sizeof(struct btrfs_disk_key), | ||
763 | &map_token, &kaddr, | ||
764 | &map_start, &map_len, KM_USER0); | ||
765 | |||
766 | if (!err) { | ||
767 | tmp = (struct btrfs_disk_key *)(kaddr + offset - | ||
768 | map_start); | ||
769 | } else { | ||
770 | read_extent_buffer(eb, &unaligned, | ||
771 | offset, sizeof(unaligned)); | ||
772 | tmp = &unaligned; | ||
773 | } | ||
774 | |||
775 | } else { | ||
776 | tmp = (struct btrfs_disk_key *)(kaddr + offset - | ||
777 | map_start); | ||
778 | } | ||
779 | ret = comp_keys(tmp, key); | ||
780 | |||
781 | if (ret < 0) | ||
782 | low = mid + 1; | ||
783 | else if (ret > 0) | ||
784 | high = mid; | ||
785 | else { | ||
786 | *slot = mid; | ||
787 | if (map_token) | ||
788 | unmap_extent_buffer(eb, map_token, KM_USER0); | ||
789 | return 0; | ||
790 | } | ||
791 | } | ||
792 | *slot = low; | ||
793 | if (map_token) | ||
794 | unmap_extent_buffer(eb, map_token, KM_USER0); | ||
795 | return 1; | ||
796 | } | ||
797 | |||
798 | /* | ||
799 | * simple bin_search frontend that does the right thing for | ||
800 | * leaves vs nodes | ||
801 | */ | ||
802 | static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, | ||
803 | int level, int *slot) | ||
804 | { | ||
805 | if (level == 0) { | ||
806 | return generic_bin_search(eb, | ||
807 | offsetof(struct btrfs_leaf, items), | ||
808 | sizeof(struct btrfs_item), | ||
809 | key, btrfs_header_nritems(eb), | ||
810 | slot); | ||
811 | } else { | ||
812 | return generic_bin_search(eb, | ||
813 | offsetof(struct btrfs_node, ptrs), | ||
814 | sizeof(struct btrfs_key_ptr), | ||
815 | key, btrfs_header_nritems(eb), | ||
816 | slot); | ||
817 | } | ||
818 | return -1; | ||
819 | } | ||
820 | |||
821 | /* given a node and slot number, this reads the block it points to. The | ||
822 | * extent buffer is returned with a reference taken (but unlocked). | ||
823 | * NULL is returned on error. | ||
824 | */ | ||
825 | static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, | ||
826 | struct extent_buffer *parent, int slot) | ||
827 | { | ||
828 | int level = btrfs_header_level(parent); | ||
829 | if (slot < 0) | ||
830 | return NULL; | ||
831 | if (slot >= btrfs_header_nritems(parent)) | ||
832 | return NULL; | ||
833 | |||
834 | BUG_ON(level == 0); | ||
835 | |||
836 | return read_tree_block(root, btrfs_node_blockptr(parent, slot), | ||
837 | btrfs_level_size(root, level - 1), | ||
838 | btrfs_node_ptr_generation(parent, slot)); | ||
839 | } | ||
840 | |||
841 | /* | ||
842 | * node level balancing, used to make sure nodes are in proper order for | ||
843 | * item deletion. We balance from the top down, so we have to make sure | ||
844 | * that a deletion won't leave a node completely empty later on. | ||
845 | */ | ||
846 | static noinline int balance_level(struct btrfs_trans_handle *trans, | ||
847 | struct btrfs_root *root, | ||
848 | struct btrfs_path *path, int level) | ||
849 | { | ||
850 | struct extent_buffer *right = NULL; | ||
851 | struct extent_buffer *mid; | ||
852 | struct extent_buffer *left = NULL; | ||
853 | struct extent_buffer *parent = NULL; | ||
854 | int ret = 0; | ||
855 | int wret; | ||
856 | int pslot; | ||
857 | int orig_slot = path->slots[level]; | ||
858 | int err_on_enospc = 0; | ||
859 | u64 orig_ptr; | ||
860 | |||
861 | if (level == 0) | ||
862 | return 0; | ||
863 | |||
864 | mid = path->nodes[level]; | ||
865 | WARN_ON(!path->locks[level]); | ||
866 | WARN_ON(btrfs_header_generation(mid) != trans->transid); | ||
867 | |||
868 | orig_ptr = btrfs_node_blockptr(mid, orig_slot); | ||
869 | |||
870 | if (level < BTRFS_MAX_LEVEL - 1) | ||
871 | parent = path->nodes[level + 1]; | ||
872 | pslot = path->slots[level + 1]; | ||
873 | |||
874 | /* | ||
875 | * deal with the case where there is only one pointer in the root | ||
876 | * by promoting the node below to a root | ||
877 | */ | ||
878 | if (!parent) { | ||
879 | struct extent_buffer *child; | ||
880 | |||
881 | if (btrfs_header_nritems(mid) != 1) | ||
882 | return 0; | ||
883 | |||
884 | /* promote the child to a root */ | ||
885 | child = read_node_slot(root, mid, 0); | ||
886 | BUG_ON(!child); | ||
887 | btrfs_tree_lock(child); | ||
888 | ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); | ||
889 | BUG_ON(ret); | ||
890 | |||
891 | spin_lock(&root->node_lock); | ||
892 | root->node = child; | ||
893 | spin_unlock(&root->node_lock); | ||
894 | |||
895 | ret = btrfs_update_extent_ref(trans, root, child->start, | ||
896 | mid->start, child->start, | ||
897 | root->root_key.objectid, | ||
898 | trans->transid, level - 1); | ||
899 | BUG_ON(ret); | ||
900 | |||
901 | add_root_to_dirty_list(root); | ||
902 | btrfs_tree_unlock(child); | ||
903 | path->locks[level] = 0; | ||
904 | path->nodes[level] = NULL; | ||
905 | clean_tree_block(trans, root, mid); | ||
906 | btrfs_tree_unlock(mid); | ||
907 | /* once for the path */ | ||
908 | free_extent_buffer(mid); | ||
909 | ret = btrfs_free_extent(trans, root, mid->start, mid->len, | ||
910 | mid->start, root->root_key.objectid, | ||
911 | btrfs_header_generation(mid), | ||
912 | level, 1); | ||
913 | /* once for the root ptr */ | ||
914 | free_extent_buffer(mid); | ||
915 | return ret; | ||
916 | } | ||
917 | if (btrfs_header_nritems(mid) > | ||
918 | BTRFS_NODEPTRS_PER_BLOCK(root) / 4) | ||
919 | return 0; | ||
920 | |||
921 | if (btrfs_header_nritems(mid) < 2) | ||
922 | err_on_enospc = 1; | ||
923 | |||
924 | left = read_node_slot(root, parent, pslot - 1); | ||
925 | if (left) { | ||
926 | btrfs_tree_lock(left); | ||
927 | wret = btrfs_cow_block(trans, root, left, | ||
928 | parent, pslot - 1, &left, 0); | ||
929 | if (wret) { | ||
930 | ret = wret; | ||
931 | goto enospc; | ||
932 | } | ||
933 | } | ||
934 | right = read_node_slot(root, parent, pslot + 1); | ||
935 | if (right) { | ||
936 | btrfs_tree_lock(right); | ||
937 | wret = btrfs_cow_block(trans, root, right, | ||
938 | parent, pslot + 1, &right, 0); | ||
939 | if (wret) { | ||
940 | ret = wret; | ||
941 | goto enospc; | ||
942 | } | ||
943 | } | ||
944 | |||
945 | /* first, try to make some room in the middle buffer */ | ||
946 | if (left) { | ||
947 | orig_slot += btrfs_header_nritems(left); | ||
948 | wret = push_node_left(trans, root, left, mid, 1); | ||
949 | if (wret < 0) | ||
950 | ret = wret; | ||
951 | if (btrfs_header_nritems(mid) < 2) | ||
952 | err_on_enospc = 1; | ||
953 | } | ||
954 | |||
955 | /* | ||
956 | * then try to empty the rightmost buffer into the middle | ||
957 | */ | ||
958 | if (right) { | ||
959 | wret = push_node_left(trans, root, mid, right, 1); | ||
960 | if (wret < 0 && wret != -ENOSPC) | ||
961 | ret = wret; | ||
962 | if (btrfs_header_nritems(right) == 0) { | ||
963 | u64 bytenr = right->start; | ||
964 | u64 generation = btrfs_header_generation(parent); | ||
965 | u32 blocksize = right->len; | ||
966 | |||
967 | clean_tree_block(trans, root, right); | ||
968 | btrfs_tree_unlock(right); | ||
969 | free_extent_buffer(right); | ||
970 | right = NULL; | ||
971 | wret = del_ptr(trans, root, path, level + 1, pslot + | ||
972 | 1); | ||
973 | if (wret) | ||
974 | ret = wret; | ||
975 | wret = btrfs_free_extent(trans, root, bytenr, | ||
976 | blocksize, parent->start, | ||
977 | btrfs_header_owner(parent), | ||
978 | generation, level, 1); | ||
979 | if (wret) | ||
980 | ret = wret; | ||
981 | } else { | ||
982 | struct btrfs_disk_key right_key; | ||
983 | btrfs_node_key(right, &right_key, 0); | ||
984 | btrfs_set_node_key(parent, &right_key, pslot + 1); | ||
985 | btrfs_mark_buffer_dirty(parent); | ||
986 | } | ||
987 | } | ||
988 | if (btrfs_header_nritems(mid) == 1) { | ||
989 | /* | ||
990 | * we're not allowed to leave a node with one item in the | ||
991 | * tree during a delete. A deletion from lower in the tree | ||
992 | * could try to delete the only pointer in this node. | ||
993 | * So, pull some keys from the left. | ||
994 | * There has to be a left pointer at this point because | ||
995 | * otherwise we would have pulled some pointers from the | ||
996 | * right | ||
997 | */ | ||
998 | BUG_ON(!left); | ||
999 | wret = balance_node_right(trans, root, mid, left); | ||
1000 | if (wret < 0) { | ||
1001 | ret = wret; | ||
1002 | goto enospc; | ||
1003 | } | ||
1004 | if (wret == 1) { | ||
1005 | wret = push_node_left(trans, root, left, mid, 1); | ||
1006 | if (wret < 0) | ||
1007 | ret = wret; | ||
1008 | } | ||
1009 | BUG_ON(wret == 1); | ||
1010 | } | ||
1011 | if (btrfs_header_nritems(mid) == 0) { | ||
1012 | /* we've managed to empty the middle node, drop it */ | ||
1013 | u64 root_gen = btrfs_header_generation(parent); | ||
1014 | u64 bytenr = mid->start; | ||
1015 | u32 blocksize = mid->len; | ||
1016 | |||
1017 | clean_tree_block(trans, root, mid); | ||
1018 | btrfs_tree_unlock(mid); | ||
1019 | free_extent_buffer(mid); | ||
1020 | mid = NULL; | ||
1021 | wret = del_ptr(trans, root, path, level + 1, pslot); | ||
1022 | if (wret) | ||
1023 | ret = wret; | ||
1024 | wret = btrfs_free_extent(trans, root, bytenr, blocksize, | ||
1025 | parent->start, | ||
1026 | btrfs_header_owner(parent), | ||
1027 | root_gen, level, 1); | ||
1028 | if (wret) | ||
1029 | ret = wret; | ||
1030 | } else { | ||
1031 | /* update the parent key to reflect our changes */ | ||
1032 | struct btrfs_disk_key mid_key; | ||
1033 | btrfs_node_key(mid, &mid_key, 0); | ||
1034 | btrfs_set_node_key(parent, &mid_key, pslot); | ||
1035 | btrfs_mark_buffer_dirty(parent); | ||
1036 | } | ||
1037 | |||
1038 | /* update the path */ | ||
1039 | if (left) { | ||
1040 | if (btrfs_header_nritems(left) > orig_slot) { | ||
1041 | extent_buffer_get(left); | ||
1042 | /* left was locked after cow */ | ||
1043 | path->nodes[level] = left; | ||
1044 | path->slots[level + 1] -= 1; | ||
1045 | path->slots[level] = orig_slot; | ||
1046 | if (mid) { | ||
1047 | btrfs_tree_unlock(mid); | ||
1048 | free_extent_buffer(mid); | ||
1049 | } | ||
1050 | } else { | ||
1051 | orig_slot -= btrfs_header_nritems(left); | ||
1052 | path->slots[level] = orig_slot; | ||
1053 | } | ||
1054 | } | ||
1055 | /* double check we haven't messed things up */ | ||
1056 | check_block(root, path, level); | ||
1057 | if (orig_ptr != | ||
1058 | btrfs_node_blockptr(path->nodes[level], path->slots[level])) | ||
1059 | BUG(); | ||
1060 | enospc: | ||
1061 | if (right) { | ||
1062 | btrfs_tree_unlock(right); | ||
1063 | free_extent_buffer(right); | ||
1064 | } | ||
1065 | if (left) { | ||
1066 | if (path->nodes[level] != left) | ||
1067 | btrfs_tree_unlock(left); | ||
1068 | free_extent_buffer(left); | ||
1069 | } | ||
1070 | return ret; | ||
1071 | } | ||
1072 | |||
1073 | /* Node balancing for insertion. Here we only split or push nodes around | ||
1074 | * when they are completely full. This is also done top down, so we | ||
1075 | * have to be pessimistic. | ||
1076 | */ | ||
1077 | static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, | ||
1078 | struct btrfs_root *root, | ||
1079 | struct btrfs_path *path, int level) | ||
1080 | { | ||
1081 | struct extent_buffer *right = NULL; | ||
1082 | struct extent_buffer *mid; | ||
1083 | struct extent_buffer *left = NULL; | ||
1084 | struct extent_buffer *parent = NULL; | ||
1085 | int ret = 0; | ||
1086 | int wret; | ||
1087 | int pslot; | ||
1088 | int orig_slot = path->slots[level]; | ||
1089 | u64 orig_ptr; | ||
1090 | |||
1091 | if (level == 0) | ||
1092 | return 1; | ||
1093 | |||
1094 | mid = path->nodes[level]; | ||
1095 | WARN_ON(btrfs_header_generation(mid) != trans->transid); | ||
1096 | orig_ptr = btrfs_node_blockptr(mid, orig_slot); | ||
1097 | |||
1098 | if (level < BTRFS_MAX_LEVEL - 1) | ||
1099 | parent = path->nodes[level + 1]; | ||
1100 | pslot = path->slots[level + 1]; | ||
1101 | |||
1102 | if (!parent) | ||
1103 | return 1; | ||
1104 | |||
1105 | left = read_node_slot(root, parent, pslot - 1); | ||
1106 | |||
1107 | /* first, try to make some room in the middle buffer */ | ||
1108 | if (left) { | ||
1109 | u32 left_nr; | ||
1110 | |||
1111 | btrfs_tree_lock(left); | ||
1112 | left_nr = btrfs_header_nritems(left); | ||
1113 | if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { | ||
1114 | wret = 1; | ||
1115 | } else { | ||
1116 | ret = btrfs_cow_block(trans, root, left, parent, | ||
1117 | pslot - 1, &left, 0); | ||
1118 | if (ret) | ||
1119 | wret = 1; | ||
1120 | else { | ||
1121 | wret = push_node_left(trans, root, | ||
1122 | left, mid, 0); | ||
1123 | } | ||
1124 | } | ||
1125 | if (wret < 0) | ||
1126 | ret = wret; | ||
1127 | if (wret == 0) { | ||
1128 | struct btrfs_disk_key disk_key; | ||
1129 | orig_slot += left_nr; | ||
1130 | btrfs_node_key(mid, &disk_key, 0); | ||
1131 | btrfs_set_node_key(parent, &disk_key, pslot); | ||
1132 | btrfs_mark_buffer_dirty(parent); | ||
1133 | if (btrfs_header_nritems(left) > orig_slot) { | ||
1134 | path->nodes[level] = left; | ||
1135 | path->slots[level + 1] -= 1; | ||
1136 | path->slots[level] = orig_slot; | ||
1137 | btrfs_tree_unlock(mid); | ||
1138 | free_extent_buffer(mid); | ||
1139 | } else { | ||
1140 | orig_slot -= | ||
1141 | btrfs_header_nritems(left); | ||
1142 | path->slots[level] = orig_slot; | ||
1143 | btrfs_tree_unlock(left); | ||
1144 | free_extent_buffer(left); | ||
1145 | } | ||
1146 | return 0; | ||
1147 | } | ||
1148 | btrfs_tree_unlock(left); | ||
1149 | free_extent_buffer(left); | ||
1150 | } | ||
1151 | right = read_node_slot(root, parent, pslot + 1); | ||
1152 | |||
1153 | /* | ||
1154 | * then try to empty the right most buffer into the middle | ||
1155 | */ | ||
1156 | if (right) { | ||
1157 | u32 right_nr; | ||
1158 | btrfs_tree_lock(right); | ||
1159 | right_nr = btrfs_header_nritems(right); | ||
1160 | if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { | ||
1161 | wret = 1; | ||
1162 | } else { | ||
1163 | ret = btrfs_cow_block(trans, root, right, | ||
1164 | parent, pslot + 1, | ||
1165 | &right, 0); | ||
1166 | if (ret) | ||
1167 | wret = 1; | ||
1168 | else { | ||
1169 | wret = balance_node_right(trans, root, | ||
1170 | right, mid); | ||
1171 | } | ||
1172 | } | ||
1173 | if (wret < 0) | ||
1174 | ret = wret; | ||
1175 | if (wret == 0) { | ||
1176 | struct btrfs_disk_key disk_key; | ||
1177 | |||
1178 | btrfs_node_key(right, &disk_key, 0); | ||
1179 | btrfs_set_node_key(parent, &disk_key, pslot + 1); | ||
1180 | btrfs_mark_buffer_dirty(parent); | ||
1181 | |||
1182 | if (btrfs_header_nritems(mid) <= orig_slot) { | ||
1183 | path->nodes[level] = right; | ||
1184 | path->slots[level + 1] += 1; | ||
1185 | path->slots[level] = orig_slot - | ||
1186 | btrfs_header_nritems(mid); | ||
1187 | btrfs_tree_unlock(mid); | ||
1188 | free_extent_buffer(mid); | ||
1189 | } else { | ||
1190 | btrfs_tree_unlock(right); | ||
1191 | free_extent_buffer(right); | ||
1192 | } | ||
1193 | return 0; | ||
1194 | } | ||
1195 | btrfs_tree_unlock(right); | ||
1196 | free_extent_buffer(right); | ||
1197 | } | ||
1198 | return 1; | ||
1199 | } | ||
1200 | |||
1201 | /* | ||
1202 | * readahead one full node of leaves, finding things that are close | ||
1203 | * to the block in 'slot', and triggering readahead on them. | ||
1204 | */ | ||
1205 | static noinline void reada_for_search(struct btrfs_root *root, | ||
1206 | struct btrfs_path *path, | ||
1207 | int level, int slot, u64 objectid) | ||
1208 | { | ||
1209 | struct extent_buffer *node; | ||
1210 | struct btrfs_disk_key disk_key; | ||
1211 | u32 nritems; | ||
1212 | u64 search; | ||
1213 | u64 lowest_read; | ||
1214 | u64 highest_read; | ||
1215 | u64 nread = 0; | ||
1216 | int direction = path->reada; | ||
1217 | struct extent_buffer *eb; | ||
1218 | u32 nr; | ||
1219 | u32 blocksize; | ||
1220 | u32 nscan = 0; | ||
1221 | |||
1222 | if (level != 1) | ||
1223 | return; | ||
1224 | |||
1225 | if (!path->nodes[level]) | ||
1226 | return; | ||
1227 | |||
1228 | node = path->nodes[level]; | ||
1229 | |||
1230 | search = btrfs_node_blockptr(node, slot); | ||
1231 | blocksize = btrfs_level_size(root, level - 1); | ||
1232 | eb = btrfs_find_tree_block(root, search, blocksize); | ||
1233 | if (eb) { | ||
1234 | free_extent_buffer(eb); | ||
1235 | return; | ||
1236 | } | ||
1237 | |||
1238 | highest_read = search; | ||
1239 | lowest_read = search; | ||
1240 | |||
1241 | nritems = btrfs_header_nritems(node); | ||
1242 | nr = slot; | ||
1243 | while (1) { | ||
1244 | if (direction < 0) { | ||
1245 | if (nr == 0) | ||
1246 | break; | ||
1247 | nr--; | ||
1248 | } else if (direction > 0) { | ||
1249 | nr++; | ||
1250 | if (nr >= nritems) | ||
1251 | break; | ||
1252 | } | ||
1253 | if (path->reada < 0 && objectid) { | ||
1254 | btrfs_node_key(node, &disk_key, nr); | ||
1255 | if (btrfs_disk_key_objectid(&disk_key) != objectid) | ||
1256 | break; | ||
1257 | } | ||
1258 | search = btrfs_node_blockptr(node, nr); | ||
1259 | if ((search >= lowest_read && search <= highest_read) || | ||
1260 | (search < lowest_read && lowest_read - search <= 16384) || | ||
1261 | (search > highest_read && search - highest_read <= 16384)) { | ||
1262 | readahead_tree_block(root, search, blocksize, | ||
1263 | btrfs_node_ptr_generation(node, nr)); | ||
1264 | nread += blocksize; | ||
1265 | } | ||
1266 | nscan++; | ||
1267 | if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) | ||
1268 | break; | ||
1269 | |||
1270 | if (nread > (256 * 1024) || nscan > 128) | ||
1271 | break; | ||
1272 | |||
1273 | if (search < lowest_read) | ||
1274 | lowest_read = search; | ||
1275 | if (search > highest_read) | ||
1276 | highest_read = search; | ||
1277 | } | ||
1278 | } | ||
1279 | |||
1280 | /* | ||
1281 | * when we walk down the tree, it is usually safe to unlock the higher layers | ||
1282 | * in the tree. The exception is when our path goes through slot 0, because | ||
1283 | * operations on the tree might require changing key pointers higher up in the | ||
1284 | * tree. | ||
1285 | * | ||
1286 | * callers might also have set path->keep_locks, which tells this code to keep | ||
1287 | * the lock if the path points to the last slot in the block. This is part of | ||
1288 | * walking through the tree, and selecting the next slot in the higher block. | ||
1289 | * | ||
1290 | * lowest_unlock sets the lowest level in the tree we're allowed to unlock, so | ||
1291 | * if lowest_unlock is 1, level 0 won't be unlocked. | ||
1292 | */ | ||
1293 | static noinline void unlock_up(struct btrfs_path *path, int level, | ||
1294 | int lowest_unlock) | ||
1295 | { | ||
1296 | int i; | ||
1297 | int skip_level = level; | ||
1298 | int no_skips = 0; | ||
1299 | struct extent_buffer *t; | ||
1300 | |||
1301 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { | ||
1302 | if (!path->nodes[i]) | ||
1303 | break; | ||
1304 | if (!path->locks[i]) | ||
1305 | break; | ||
1306 | if (!no_skips && path->slots[i] == 0) { | ||
1307 | skip_level = i + 1; | ||
1308 | continue; | ||
1309 | } | ||
1310 | if (!no_skips && path->keep_locks) { | ||
1311 | u32 nritems; | ||
1312 | t = path->nodes[i]; | ||
1313 | nritems = btrfs_header_nritems(t); | ||
1314 | if (nritems < 1 || path->slots[i] >= nritems - 1) { | ||
1315 | skip_level = i + 1; | ||
1316 | continue; | ||
1317 | } | ||
1318 | } | ||
1319 | if (skip_level < i && i >= lowest_unlock) | ||
1320 | no_skips = 1; | ||
1321 | |||
1322 | t = path->nodes[i]; | ||
1323 | if (i >= lowest_unlock && i > skip_level && path->locks[i]) { | ||
1324 | btrfs_tree_unlock(t); | ||
1325 | path->locks[i] = 0; | ||
1326 | } | ||
1327 | } | ||
1328 | } | ||
1329 | |||
1330 | /* | ||
1331 | * look for key in the tree. path is filled in with nodes along the way | ||
1332 | * if key is found, we return zero and you can find the item in the leaf | ||
1333 | * level of the path (level 0) | ||
1334 | * | ||
1335 | * If the key isn't found, the path points to the slot where it should | ||
1336 | * be inserted, and 1 is returned. If there are other errors during the | ||
1337 | * search a negative error number is returned. | ||
1338 | * | ||
1339 | * if ins_len > 0, nodes and leaves will be split as we walk down the | ||
1340 | * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if | ||
1341 | * possible) | ||
1342 | */ | ||
1343 | int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1344 | *root, struct btrfs_key *key, struct btrfs_path *p, int | ||
1345 | ins_len, int cow) | ||
1346 | { | ||
1347 | struct extent_buffer *b; | ||
1348 | struct extent_buffer *tmp; | ||
1349 | int slot; | ||
1350 | int ret; | ||
1351 | int level; | ||
1352 | int should_reada = p->reada; | ||
1353 | int lowest_unlock = 1; | ||
1354 | int blocksize; | ||
1355 | u8 lowest_level = 0; | ||
1356 | u64 blocknr; | ||
1357 | u64 gen; | ||
1358 | struct btrfs_key prealloc_block; | ||
1359 | |||
1360 | lowest_level = p->lowest_level; | ||
1361 | WARN_ON(lowest_level && ins_len > 0); | ||
1362 | WARN_ON(p->nodes[0] != NULL); | ||
1363 | |||
1364 | if (ins_len < 0) | ||
1365 | lowest_unlock = 2; | ||
1366 | |||
1367 | prealloc_block.objectid = 0; | ||
1368 | |||
1369 | again: | ||
1370 | if (p->skip_locking) | ||
1371 | b = btrfs_root_node(root); | ||
1372 | else | ||
1373 | b = btrfs_lock_root_node(root); | ||
1374 | |||
1375 | while (b) { | ||
1376 | level = btrfs_header_level(b); | ||
1377 | |||
1378 | /* | ||
1379 | * setup the path here so we can release it under lock | ||
1380 | * contention with the cow code | ||
1381 | */ | ||
1382 | p->nodes[level] = b; | ||
1383 | if (!p->skip_locking) | ||
1384 | p->locks[level] = 1; | ||
1385 | |||
1386 | if (cow) { | ||
1387 | int wret; | ||
1388 | |||
1389 | /* check whether we can skip cowing this block */ | ||
1390 | spin_lock(&root->fs_info->hash_lock); | ||
1391 | if (btrfs_header_generation(b) == trans->transid && | ||
1392 | btrfs_header_owner(b) == root->root_key.objectid && | ||
1393 | !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { | ||
1394 | spin_unlock(&root->fs_info->hash_lock); | ||
1395 | goto cow_done; | ||
1396 | } | ||
1397 | spin_unlock(&root->fs_info->hash_lock); | ||
1398 | |||
1399 | /* ok, we have to cow, is our old prealloc the right | ||
1400 | * size? | ||
1401 | */ | ||
1402 | if (prealloc_block.objectid && | ||
1403 | prealloc_block.offset != b->len) { | ||
1404 | btrfs_free_reserved_extent(root, | ||
1405 | prealloc_block.objectid, | ||
1406 | prealloc_block.offset); | ||
1407 | prealloc_block.objectid = 0; | ||
1408 | } | ||
1409 | |||
1410 | /* | ||
1411 | * for higher level blocks, try not to allocate blocks | ||
1412 | * with the block and the parent locks held. | ||
1413 | */ | ||
1414 | if (level > 1 && !prealloc_block.objectid && | ||
1415 | btrfs_path_lock_waiting(p, level)) { | ||
1416 | u32 size = b->len; | ||
1417 | u64 hint = b->start; | ||
1418 | |||
1419 | btrfs_release_path(root, p); | ||
1420 | ret = btrfs_reserve_extent(trans, root, | ||
1421 | size, size, 0, | ||
1422 | hint, (u64)-1, | ||
1423 | &prealloc_block, 0); | ||
1424 | BUG_ON(ret); | ||
1425 | goto again; | ||
1426 | } | ||
1427 | |||
1428 | wret = btrfs_cow_block(trans, root, b, | ||
1429 | p->nodes[level + 1], | ||
1430 | p->slots[level + 1], | ||
1431 | &b, prealloc_block.objectid); | ||
1432 | prealloc_block.objectid = 0; | ||
1433 | if (wret) { | ||
1434 | free_extent_buffer(b); | ||
1435 | ret = wret; | ||
1436 | goto done; | ||
1437 | } | ||
1438 | } | ||
1439 | cow_done: | ||
1440 | BUG_ON(!cow && ins_len); | ||
1441 | if (level != btrfs_header_level(b)) | ||
1442 | WARN_ON(1); | ||
1443 | level = btrfs_header_level(b); | ||
1444 | |||
1445 | p->nodes[level] = b; | ||
1446 | if (!p->skip_locking) | ||
1447 | p->locks[level] = 1; | ||
1448 | |||
1449 | ret = check_block(root, p, level); | ||
1450 | if (ret) { | ||
1451 | ret = -1; | ||
1452 | goto done; | ||
1453 | } | ||
1454 | |||
1455 | ret = bin_search(b, key, level, &slot); | ||
1456 | if (level != 0) { | ||
1457 | if (ret && slot > 0) | ||
1458 | slot -= 1; | ||
1459 | p->slots[level] = slot; | ||
1460 | if ((p->search_for_split || ins_len > 0) && | ||
1461 | btrfs_header_nritems(b) >= | ||
1462 | BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { | ||
1463 | int sret = split_node(trans, root, p, level); | ||
1464 | BUG_ON(sret > 0); | ||
1465 | if (sret) { | ||
1466 | ret = sret; | ||
1467 | goto done; | ||
1468 | } | ||
1469 | b = p->nodes[level]; | ||
1470 | slot = p->slots[level]; | ||
1471 | } else if (ins_len < 0) { | ||
1472 | int sret = balance_level(trans, root, p, | ||
1473 | level); | ||
1474 | if (sret) { | ||
1475 | ret = sret; | ||
1476 | goto done; | ||
1477 | } | ||
1478 | b = p->nodes[level]; | ||
1479 | if (!b) { | ||
1480 | btrfs_release_path(NULL, p); | ||
1481 | goto again; | ||
1482 | } | ||
1483 | slot = p->slots[level]; | ||
1484 | BUG_ON(btrfs_header_nritems(b) == 1); | ||
1485 | } | ||
1486 | unlock_up(p, level, lowest_unlock); | ||
1487 | |||
1488 | /* this is only true while dropping a snapshot */ | ||
1489 | if (level == lowest_level) { | ||
1490 | ret = 0; | ||
1491 | goto done; | ||
1492 | } | ||
1493 | |||
1494 | blocknr = btrfs_node_blockptr(b, slot); | ||
1495 | gen = btrfs_node_ptr_generation(b, slot); | ||
1496 | blocksize = btrfs_level_size(root, level - 1); | ||
1497 | |||
1498 | tmp = btrfs_find_tree_block(root, blocknr, blocksize); | ||
1499 | if (tmp && btrfs_buffer_uptodate(tmp, gen)) { | ||
1500 | b = tmp; | ||
1501 | } else { | ||
1502 | /* | ||
1503 | * reduce lock contention at high levels | ||
1504 | * of the btree by dropping locks before | ||
1505 | * we read. | ||
1506 | */ | ||
1507 | if (level > 1) { | ||
1508 | btrfs_release_path(NULL, p); | ||
1509 | if (tmp) | ||
1510 | free_extent_buffer(tmp); | ||
1511 | if (should_reada) | ||
1512 | reada_for_search(root, p, | ||
1513 | level, slot, | ||
1514 | key->objectid); | ||
1515 | |||
1516 | tmp = read_tree_block(root, blocknr, | ||
1517 | blocksize, gen); | ||
1518 | if (tmp) | ||
1519 | free_extent_buffer(tmp); | ||
1520 | goto again; | ||
1521 | } else { | ||
1522 | if (tmp) | ||
1523 | free_extent_buffer(tmp); | ||
1524 | if (should_reada) | ||
1525 | reada_for_search(root, p, | ||
1526 | level, slot, | ||
1527 | key->objectid); | ||
1528 | b = read_node_slot(root, b, slot); | ||
1529 | } | ||
1530 | } | ||
1531 | if (!p->skip_locking) | ||
1532 | btrfs_tree_lock(b); | ||
1533 | } else { | ||
1534 | p->slots[level] = slot; | ||
1535 | if (ins_len > 0 && | ||
1536 | btrfs_leaf_free_space(root, b) < ins_len) { | ||
1537 | int sret = split_leaf(trans, root, key, | ||
1538 | p, ins_len, ret == 0); | ||
1539 | BUG_ON(sret > 0); | ||
1540 | if (sret) { | ||
1541 | ret = sret; | ||
1542 | goto done; | ||
1543 | } | ||
1544 | } | ||
1545 | if (!p->search_for_split) | ||
1546 | unlock_up(p, level, lowest_unlock); | ||
1547 | goto done; | ||
1548 | } | ||
1549 | } | ||
1550 | ret = 1; | ||
1551 | done: | ||
1552 | if (prealloc_block.objectid) { | ||
1553 | btrfs_free_reserved_extent(root, | ||
1554 | prealloc_block.objectid, | ||
1555 | prealloc_block.offset); | ||
1556 | } | ||
1557 | |||
1558 | return ret; | ||
1559 | } | ||
1560 | |||
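A hedged sketch of the usual read-only lookup built on btrfs_search_slot(); the key type and the objectid source here are illustrative:

	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;		/* e.g. an inode number */
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	/* trans == NULL, ins_len == 0, cow == 0: pure read-only search */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact hit: item lives at path->nodes[0], path->slots[0] */
	} else if (ret == 1) {
		/* not found: path points at the would-be insertion slot */
	}	/* ret < 0 signals an error during the search */
	btrfs_free_path(path);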
1561 | int btrfs_merge_path(struct btrfs_trans_handle *trans, | ||
1562 | struct btrfs_root *root, | ||
1563 | struct btrfs_key *node_keys, | ||
1564 | u64 *nodes, int lowest_level) | ||
1565 | { | ||
1566 | struct extent_buffer *eb; | ||
1567 | struct extent_buffer *parent; | ||
1568 | struct btrfs_key key; | ||
1569 | u64 bytenr; | ||
1570 | u64 generation; | ||
1571 | u32 blocksize; | ||
1572 | int level; | ||
1573 | int slot; | ||
1574 | int key_match; | ||
1575 | int ret; | ||
1576 | |||
1577 | eb = btrfs_lock_root_node(root); | ||
1578 | ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); | ||
1579 | BUG_ON(ret); | ||
1580 | |||
1581 | parent = eb; | ||
1582 | while (1) { | ||
1583 | level = btrfs_header_level(parent); | ||
1584 | if (level == 0 || level <= lowest_level) | ||
1585 | break; | ||
1586 | |||
1587 | ret = bin_search(parent, &node_keys[lowest_level], level, | ||
1588 | &slot); | ||
1589 | if (ret && slot > 0) | ||
1590 | slot--; | ||
1591 | |||
1592 | bytenr = btrfs_node_blockptr(parent, slot); | ||
1593 | if (nodes[level - 1] == bytenr) | ||
1594 | break; | ||
1595 | |||
1596 | blocksize = btrfs_level_size(root, level - 1); | ||
1597 | generation = btrfs_node_ptr_generation(parent, slot); | ||
1598 | btrfs_node_key_to_cpu(eb, &key, slot); | ||
1599 | key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); | ||
1600 | |||
1601 | if (generation == trans->transid) { | ||
1602 | eb = read_tree_block(root, bytenr, blocksize, | ||
1603 | generation); | ||
1604 | btrfs_tree_lock(eb); | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * if the node keys match and the node pointer hasn't been | ||
1609 | * modified in the running transaction, we can merge the path. | ||
1610 | * for blocks owned by reloc trees, the node pointer check is | ||
1611 | * skipped; these blocks are fully controlled by the space | ||
1612 | * balance code, and no one else can modify them. | ||
1613 | */ | ||
1614 | if (!nodes[level - 1] || !key_match || | ||
1615 | (generation == trans->transid && | ||
1616 | btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { | ||
1617 | if (level == 1 || level == lowest_level + 1) { | ||
1618 | if (generation == trans->transid) { | ||
1619 | btrfs_tree_unlock(eb); | ||
1620 | free_extent_buffer(eb); | ||
1621 | } | ||
1622 | break; | ||
1623 | } | ||
1624 | |||
1625 | if (generation != trans->transid) { | ||
1626 | eb = read_tree_block(root, bytenr, blocksize, | ||
1627 | generation); | ||
1628 | btrfs_tree_lock(eb); | ||
1629 | } | ||
1630 | |||
1631 | ret = btrfs_cow_block(trans, root, eb, parent, slot, | ||
1632 | &eb, 0); | ||
1633 | BUG_ON(ret); | ||
1634 | |||
1635 | if (root->root_key.objectid == | ||
1636 | BTRFS_TREE_RELOC_OBJECTID) { | ||
1637 | if (!nodes[level - 1]) { | ||
1638 | nodes[level - 1] = eb->start; | ||
1639 | memcpy(&node_keys[level - 1], &key, | ||
1640 | sizeof(node_keys[0])); | ||
1641 | } else { | ||
1642 | WARN_ON(1); | ||
1643 | } | ||
1644 | } | ||
1645 | |||
1646 | btrfs_tree_unlock(parent); | ||
1647 | free_extent_buffer(parent); | ||
1648 | parent = eb; | ||
1649 | continue; | ||
1650 | } | ||
1651 | |||
1652 | btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); | ||
1653 | btrfs_set_node_ptr_generation(parent, slot, trans->transid); | ||
1654 | btrfs_mark_buffer_dirty(parent); | ||
1655 | |||
1656 | ret = btrfs_inc_extent_ref(trans, root, | ||
1657 | nodes[level - 1], | ||
1658 | blocksize, parent->start, | ||
1659 | btrfs_header_owner(parent), | ||
1660 | btrfs_header_generation(parent), | ||
1661 | level - 1); | ||
1662 | BUG_ON(ret); | ||
1663 | |||
1664 | /* | ||
1665 | * If the block was created in the running transaction, | ||
1666 | * it's possible this is the last reference to it, so we | ||
1667 | * should drop the subtree. | ||
1668 | */ | ||
1669 | if (generation == trans->transid) { | ||
1670 | ret = btrfs_drop_subtree(trans, root, eb, parent); | ||
1671 | BUG_ON(ret); | ||
1672 | btrfs_tree_unlock(eb); | ||
1673 | free_extent_buffer(eb); | ||
1674 | } else { | ||
1675 | ret = btrfs_free_extent(trans, root, bytenr, | ||
1676 | blocksize, parent->start, | ||
1677 | btrfs_header_owner(parent), | ||
1678 | btrfs_header_generation(parent), | ||
1679 | level - 1, 1); | ||
1680 | BUG_ON(ret); | ||
1681 | } | ||
1682 | break; | ||
1683 | } | ||
1684 | btrfs_tree_unlock(parent); | ||
1685 | free_extent_buffer(parent); | ||
1686 | return 0; | ||
1687 | } | ||
1688 | |||
1689 | /* | ||
1690 | * adjust the pointers going up the tree, starting at level | ||
1691 | * making sure the right key of each node points to 'key'. | ||
1692 | * This is used after shifting pointers to the left, so it stops | ||
1693 | * fixing up pointers when a given leaf/node is not in slot 0 of the | ||
1694 | * higher levels | ||
1695 | * | ||
1696 | * If this fails to write a tree block, it returns -1, but continues | ||
1697 | * fixing up the blocks in ram so the tree is consistent. | ||
1698 | */ | ||
1699 | static int fixup_low_keys(struct btrfs_trans_handle *trans, | ||
1700 | struct btrfs_root *root, struct btrfs_path *path, | ||
1701 | struct btrfs_disk_key *key, int level) | ||
1702 | { | ||
1703 | int i; | ||
1704 | int ret = 0; | ||
1705 | struct extent_buffer *t; | ||
1706 | |||
1707 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { | ||
1708 | int tslot = path->slots[i]; | ||
1709 | if (!path->nodes[i]) | ||
1710 | break; | ||
1711 | t = path->nodes[i]; | ||
1712 | btrfs_set_node_key(t, key, tslot); | ||
1713 | btrfs_mark_buffer_dirty(path->nodes[i]); | ||
1714 | if (tslot != 0) | ||
1715 | break; | ||
1716 | } | ||
1717 | return ret; | ||
1718 | } | ||
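/*
 * Illustrative sketch, not kernel source, of the fixup_low_keys() walk:
 * after the first key of a block changes, every ancestor whose slot for
 * that block is 0 must have its separator key rewritten, and the walk
 * can stop at the first ancestor where the slot is nonzero.  The flat
 * arrays standing in for path->nodes[]/path->slots[] are a hypothetical
 * simplification (one separator key per level).
 */
#include <stdio.h>

#define MAX_LEVEL 8

int main(void)
{
	/* node_key[i] models the separator key stored at each level */
	int node_key[MAX_LEVEL] = { 5, 5, 5, 5 };
	int slots[MAX_LEVEL]    = { 0, 0, 2, 0 };	/* path slots */
	int levels = 4;
	int new_key = 3;
	int i;

	for (i = 1; i < levels; i++) {	/* start one level above the leaf */
		node_key[i] = new_key;	/* rewrite the separator */
		if (slots[i] != 0)	/* not leftmost: higher levels */
			break;		/* are unaffected, stop here  */
	}
	for (i = 0; i < levels; i++)
		printf("level %d key %d\n", i, node_key[i]);
	return 0;
}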
1719 | |||
1720 | /* | ||
1721 | * update item key. | ||
1722 | * | ||
1723 | * This function isn't completely safe. It's the caller's responsibility | ||
1724 | * that the new key won't break the order | ||
1725 | */ | ||
1726 | int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, | ||
1727 | struct btrfs_root *root, struct btrfs_path *path, | ||
1728 | struct btrfs_key *new_key) | ||
1729 | { | ||
1730 | struct btrfs_disk_key disk_key; | ||
1731 | struct extent_buffer *eb; | ||
1732 | int slot; | ||
1733 | |||
1734 | eb = path->nodes[0]; | ||
1735 | slot = path->slots[0]; | ||
1736 | if (slot > 0) { | ||
1737 | btrfs_item_key(eb, &disk_key, slot - 1); | ||
1738 | if (comp_keys(&disk_key, new_key) >= 0) | ||
1739 | return -1; | ||
1740 | } | ||
1741 | if (slot < btrfs_header_nritems(eb) - 1) { | ||
1742 | btrfs_item_key(eb, &disk_key, slot + 1); | ||
1743 | if (comp_keys(&disk_key, new_key) <= 0) | ||
1744 | return -1; | ||
1745 | } | ||
1746 | |||
1747 | btrfs_cpu_key_to_disk(&disk_key, new_key); | ||
1748 | btrfs_set_item_key(eb, &disk_key, slot); | ||
1749 | btrfs_mark_buffer_dirty(eb); | ||
1750 | if (slot == 0) | ||
1751 | fixup_low_keys(trans, root, path, &disk_key, 1); | ||
1752 | return 0; | ||
1753 | } | ||
1754 | |||
1755 | /* | ||
1756 | * try to push data from one node into the next node left in the | ||
1757 | * tree. | ||
1758 | * | ||
1759 | * returns 0 if some ptrs were pushed left, < 0 if there was some horrible | ||
1760 | * error, and > 0 if there was no room in the left hand block. | ||
1761 | */ | ||
1762 | static int push_node_left(struct btrfs_trans_handle *trans, | ||
1763 | struct btrfs_root *root, struct extent_buffer *dst, | ||
1764 | struct extent_buffer *src, int empty) | ||
1765 | { | ||
1766 | int push_items = 0; | ||
1767 | int src_nritems; | ||
1768 | int dst_nritems; | ||
1769 | int ret = 0; | ||
1770 | |||
1771 | src_nritems = btrfs_header_nritems(src); | ||
1772 | dst_nritems = btrfs_header_nritems(dst); | ||
1773 | push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; | ||
1774 | WARN_ON(btrfs_header_generation(src) != trans->transid); | ||
1775 | WARN_ON(btrfs_header_generation(dst) != trans->transid); | ||
1776 | |||
1777 | if (!empty && src_nritems <= 8) | ||
1778 | return 1; | ||
1779 | |||
1780 | if (push_items <= 0) | ||
1781 | return 1; | ||
1782 | |||
1783 | if (empty) { | ||
1784 | push_items = min(src_nritems, push_items); | ||
1785 | if (push_items < src_nritems) { | ||
1786 | /* leave at least 8 pointers in the node if | ||
1787 | * we aren't going to empty it | ||
1788 | */ | ||
1789 | if (src_nritems - push_items < 8) { | ||
1790 | if (push_items <= 8) | ||
1791 | return 1; | ||
1792 | push_items -= 8; | ||
1793 | } | ||
1794 | } | ||
1795 | } else | ||
1796 | push_items = min(src_nritems - 8, push_items); | ||
1797 | |||
1798 | copy_extent_buffer(dst, src, | ||
1799 | btrfs_node_key_ptr_offset(dst_nritems), | ||
1800 | btrfs_node_key_ptr_offset(0), | ||
1801 | push_items * sizeof(struct btrfs_key_ptr)); | ||
1802 | |||
1803 | if (push_items < src_nritems) { | ||
1804 | memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), | ||
1805 | btrfs_node_key_ptr_offset(push_items), | ||
1806 | (src_nritems - push_items) * | ||
1807 | sizeof(struct btrfs_key_ptr)); | ||
1808 | } | ||
1809 | btrfs_set_header_nritems(src, src_nritems - push_items); | ||
1810 | btrfs_set_header_nritems(dst, dst_nritems + push_items); | ||
1811 | btrfs_mark_buffer_dirty(src); | ||
1812 | btrfs_mark_buffer_dirty(dst); | ||
1813 | |||
1814 | ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); | ||
1815 | BUG_ON(ret); | ||
1816 | |||
1817 | return ret; | ||
1818 | } | ||
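/*
 * Illustrative userspace sketch, not kernel source, of the pointer
 * shuffle at the heart of push_node_left(): copy the first push_items
 * entries of src onto the end of dst, then slide the remainder of src
 * down.  Plain ints stand in for struct btrfs_key_ptr, and the capacity
 * of 8 is a hypothetical stand-in for BTRFS_NODEPTRS_PER_BLOCK(); the
 * keep-at-least-8-pointers rule above is omitted for brevity.
 */
#include <stdio.h>
#include <string.h>

#define CAP 8

int main(void)
{
	int dst[CAP] = { 1, 2, 3 };
	int src[CAP] = { 4, 5, 6, 7, 8 };
	int dst_n = 3, src_n = 5;
	int push = CAP - dst_n;			/* room left in dst */

	if (push > src_n)
		push = src_n;

	memcpy(dst + dst_n, src, push * sizeof(int));
	memmove(src, src + push, (src_n - push) * sizeof(int));
	dst_n += push;
	src_n -= push;

	printf("dst has %d items, src has %d left\n", dst_n, src_n);
	return 0;
}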
1819 | |||
1820 | /* | ||
1821 | * try to push data from one node into the next node right in the | ||
1822 | * tree. | ||
1823 | * | ||
1824 | * returns 0 if some ptrs were pushed, < 0 if there was some horrible | ||
1825 | * error, and > 0 if there was no room in the right hand block. | ||
1826 | * | ||
1827 | * this will only push up to 1/2 the contents of the left node over | ||
1828 | */ | ||
1829 | static int balance_node_right(struct btrfs_trans_handle *trans, | ||
1830 | struct btrfs_root *root, | ||
1831 | struct extent_buffer *dst, | ||
1832 | struct extent_buffer *src) | ||
1833 | { | ||
1834 | int push_items = 0; | ||
1835 | int max_push; | ||
1836 | int src_nritems; | ||
1837 | int dst_nritems; | ||
1838 | int ret = 0; | ||
1839 | |||
1840 | WARN_ON(btrfs_header_generation(src) != trans->transid); | ||
1841 | WARN_ON(btrfs_header_generation(dst) != trans->transid); | ||
1842 | |||
1843 | src_nritems = btrfs_header_nritems(src); | ||
1844 | dst_nritems = btrfs_header_nritems(dst); | ||
1845 | push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; | ||
1846 | if (push_items <= 0) | ||
1847 | return 1; | ||
1848 | |||
1849 | if (src_nritems < 4) | ||
1850 | return 1; | ||
1851 | |||
1852 | max_push = src_nritems / 2 + 1; | ||
1853 | /* don't try to empty the node */ | ||
1854 | if (max_push >= src_nritems) | ||
1855 | return 1; | ||
1856 | |||
1857 | if (max_push < push_items) | ||
1858 | push_items = max_push; | ||
1859 | |||
1860 | memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), | ||
1861 | btrfs_node_key_ptr_offset(0), | ||
1862 | (dst_nritems) * | ||
1863 | sizeof(struct btrfs_key_ptr)); | ||
1864 | |||
1865 | copy_extent_buffer(dst, src, | ||
1866 | btrfs_node_key_ptr_offset(0), | ||
1867 | btrfs_node_key_ptr_offset(src_nritems - push_items), | ||
1868 | push_items * sizeof(struct btrfs_key_ptr)); | ||
1869 | |||
1870 | btrfs_set_header_nritems(src, src_nritems - push_items); | ||
1871 | btrfs_set_header_nritems(dst, dst_nritems + push_items); | ||
1872 | |||
1873 | btrfs_mark_buffer_dirty(src); | ||
1874 | btrfs_mark_buffer_dirty(dst); | ||
1875 | |||
1876 | ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); | ||
1877 | BUG_ON(ret); | ||
1878 | |||
1879 | return ret; | ||
1880 | } | ||
1881 | |||
1882 | /* | ||
1883 | * helper function to insert a new root level in the tree. | ||
1884 | * A new node is allocated, and a single item is inserted to | ||
1885 | * point to the existing root | ||
1886 | * | ||
1887 | * returns zero on success or < 0 on failure. | ||
1888 | */ | ||
1889 | static noinline int insert_new_root(struct btrfs_trans_handle *trans, | ||
1890 | struct btrfs_root *root, | ||
1891 | struct btrfs_path *path, int level) | ||
1892 | { | ||
1893 | u64 lower_gen; | ||
1894 | struct extent_buffer *lower; | ||
1895 | struct extent_buffer *c; | ||
1896 | struct extent_buffer *old; | ||
1897 | struct btrfs_disk_key lower_key; | ||
1898 | int ret; | ||
1899 | |||
1900 | BUG_ON(path->nodes[level]); | ||
1901 | BUG_ON(path->nodes[level-1] != root->node); | ||
1902 | |||
1903 | lower = path->nodes[level-1]; | ||
1904 | if (level == 1) | ||
1905 | btrfs_item_key(lower, &lower_key, 0); | ||
1906 | else | ||
1907 | btrfs_node_key(lower, &lower_key, 0); | ||
1908 | |||
1909 | c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, | ||
1910 | root->root_key.objectid, trans->transid, | ||
1911 | level, root->node->start, 0); | ||
1912 | if (IS_ERR(c)) | ||
1913 | return PTR_ERR(c); | ||
1914 | |||
1915 | memset_extent_buffer(c, 0, 0, root->nodesize); | ||
1916 | btrfs_set_header_nritems(c, 1); | ||
1917 | btrfs_set_header_level(c, level); | ||
1918 | btrfs_set_header_bytenr(c, c->start); | ||
1919 | btrfs_set_header_generation(c, trans->transid); | ||
1920 | btrfs_set_header_owner(c, root->root_key.objectid); | ||
1921 | |||
1922 | write_extent_buffer(c, root->fs_info->fsid, | ||
1923 | (unsigned long)btrfs_header_fsid(c), | ||
1924 | BTRFS_FSID_SIZE); | ||
1925 | |||
1926 | write_extent_buffer(c, root->fs_info->chunk_tree_uuid, | ||
1927 | (unsigned long)btrfs_header_chunk_tree_uuid(c), | ||
1928 | BTRFS_UUID_SIZE); | ||
1929 | |||
1930 | btrfs_set_node_key(c, &lower_key, 0); | ||
1931 | btrfs_set_node_blockptr(c, 0, lower->start); | ||
1932 | lower_gen = btrfs_header_generation(lower); | ||
1933 | WARN_ON(lower_gen != trans->transid); | ||
1934 | |||
1935 | btrfs_set_node_ptr_generation(c, 0, lower_gen); | ||
1936 | |||
1937 | btrfs_mark_buffer_dirty(c); | ||
1938 | |||
1939 | spin_lock(&root->node_lock); | ||
1940 | old = root->node; | ||
1941 | root->node = c; | ||
1942 | spin_unlock(&root->node_lock); | ||
1943 | |||
1944 | ret = btrfs_update_extent_ref(trans, root, lower->start, | ||
1945 | lower->start, c->start, | ||
1946 | root->root_key.objectid, | ||
1947 | trans->transid, level - 1); | ||
1948 | BUG_ON(ret); | ||
1949 | |||
1950 | /* the super has an extra ref to root->node */ | ||
1951 | free_extent_buffer(old); | ||
1952 | |||
1953 | add_root_to_dirty_list(root); | ||
1954 | extent_buffer_get(c); | ||
1955 | path->nodes[level] = c; | ||
1956 | path->locks[level] = 1; | ||
1957 | path->slots[level] = 0; | ||
1958 | return 0; | ||
1959 | } | ||
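/*
 * Illustrative sketch, not kernel source, of what insert_new_root()
 * accomplishes: the tree grows in height by allocating a new node whose
 * single pointer references the old root.  struct toy_node and its
 * fields are hypothetical stand-ins for the extent buffer machinery.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_node {
	int level;
	int nritems;
	struct toy_node *child[2];
};

int main(void)
{
	struct toy_node *old_root = calloc(1, sizeof(*old_root));
	struct toy_node *c = calloc(1, sizeof(*c));

	if (!old_root || !c)
		return 1;
	old_root->level = 0;

	c->level = old_root->level + 1;	/* one level above the old root */
	c->nritems = 1;			/* a single pointer...          */
	c->child[0] = old_root;		/* ...to the existing root      */

	printf("new root at level %d with %d item\n", c->level, c->nritems);
	free(c);
	free(old_root);
	return 0;
}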
1960 | |||
1961 | /* | ||
1962 | * worker function to insert a single pointer in a node. | ||
1963 | * the node should have enough room for the pointer already | ||
1964 | * | ||
1965 | * slot and level indicate where you want the key to go, and | ||
1966 | * bytenr is the block the key points to. | ||
1967 | * | ||
1968 | * returns zero on success and < 0 on any error | ||
1969 | */ | ||
1970 | static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1971 | *root, struct btrfs_path *path, struct btrfs_disk_key | ||
1972 | *key, u64 bytenr, int slot, int level) | ||
1973 | { | ||
1974 | struct extent_buffer *lower; | ||
1975 | int nritems; | ||
1976 | |||
1977 | BUG_ON(!path->nodes[level]); | ||
1978 | lower = path->nodes[level]; | ||
1979 | nritems = btrfs_header_nritems(lower); | ||
1980 | if (slot > nritems) | ||
1981 | BUG(); | ||
1982 | if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) | ||
1983 | BUG(); | ||
1984 | if (slot != nritems) { | ||
1985 | memmove_extent_buffer(lower, | ||
1986 | btrfs_node_key_ptr_offset(slot + 1), | ||
1987 | btrfs_node_key_ptr_offset(slot), | ||
1988 | (nritems - slot) * sizeof(struct btrfs_key_ptr)); | ||
1989 | } | ||
1990 | btrfs_set_node_key(lower, key, slot); | ||
1991 | btrfs_set_node_blockptr(lower, slot, bytenr); | ||
1992 | WARN_ON(trans->transid == 0); | ||
1993 | btrfs_set_node_ptr_generation(lower, slot, trans->transid); | ||
1994 | btrfs_set_header_nritems(lower, nritems + 1); | ||
1995 | btrfs_mark_buffer_dirty(lower); | ||
1996 | return 0; | ||
1997 | } | ||
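/*
 * Illustrative sketch, not kernel source, of the slot-opening memmove
 * in insert_ptr(): entries at and after 'slot' move up by one, and the
 * new key/pointer lands in the gap.  Plain ints stand in for the
 * key/pointer pairs.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	int ptrs[8] = { 10, 20, 40 };
	int nritems = 3, slot = 2, new_ptr = 30;

	memmove(ptrs + slot + 1, ptrs + slot,
		(nritems - slot) * sizeof(int));
	ptrs[slot] = new_ptr;
	nritems++;

	for (slot = 0; slot < nritems; slot++)
		printf("%d ", ptrs[slot]);	/* 10 20 30 40 */
	printf("\n");
	return 0;
}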
1998 | |||
1999 | /* | ||
2000 | * split the node at the specified level in path in two. | ||
2001 | * The path is corrected to point to the appropriate node after the split | ||
2002 | * | ||
2003 | * Before splitting this tries to make some room in the node by pushing | ||
2004 | * left and right, if either one works, it returns right away. | ||
2005 | * | ||
2006 | * returns 0 on success and < 0 on failure | ||
2007 | */ | ||
2008 | static noinline int split_node(struct btrfs_trans_handle *trans, | ||
2009 | struct btrfs_root *root, | ||
2010 | struct btrfs_path *path, int level) | ||
2011 | { | ||
2012 | struct extent_buffer *c; | ||
2013 | struct extent_buffer *split; | ||
2014 | struct btrfs_disk_key disk_key; | ||
2015 | int mid; | ||
2016 | int ret; | ||
2017 | int wret; | ||
2018 | u32 c_nritems; | ||
2019 | |||
2020 | c = path->nodes[level]; | ||
2021 | WARN_ON(btrfs_header_generation(c) != trans->transid); | ||
2022 | if (c == root->node) { | ||
2023 | /* trying to split the root, let's make a new one */ | ||
2024 | ret = insert_new_root(trans, root, path, level + 1); | ||
2025 | if (ret) | ||
2026 | return ret; | ||
2027 | } else { | ||
2028 | ret = push_nodes_for_insert(trans, root, path, level); | ||
2029 | c = path->nodes[level]; | ||
2030 | if (!ret && btrfs_header_nritems(c) < | ||
2031 | BTRFS_NODEPTRS_PER_BLOCK(root) - 3) | ||
2032 | return 0; | ||
2033 | if (ret < 0) | ||
2034 | return ret; | ||
2035 | } | ||
2036 | |||
2037 | c_nritems = btrfs_header_nritems(c); | ||
2038 | |||
2039 | split = btrfs_alloc_free_block(trans, root, root->nodesize, | ||
2040 | path->nodes[level + 1]->start, | ||
2041 | root->root_key.objectid, | ||
2042 | trans->transid, level, c->start, 0); | ||
2043 | if (IS_ERR(split)) | ||
2044 | return PTR_ERR(split); | ||
2045 | |||
2046 | btrfs_set_header_flags(split, btrfs_header_flags(c)); | ||
2047 | btrfs_set_header_level(split, btrfs_header_level(c)); | ||
2048 | btrfs_set_header_bytenr(split, split->start); | ||
2049 | btrfs_set_header_generation(split, trans->transid); | ||
2050 | btrfs_set_header_owner(split, root->root_key.objectid); | ||
2051 | btrfs_set_header_flags(split, 0); | ||
2052 | write_extent_buffer(split, root->fs_info->fsid, | ||
2053 | (unsigned long)btrfs_header_fsid(split), | ||
2054 | BTRFS_FSID_SIZE); | ||
2055 | write_extent_buffer(split, root->fs_info->chunk_tree_uuid, | ||
2056 | (unsigned long)btrfs_header_chunk_tree_uuid(split), | ||
2057 | BTRFS_UUID_SIZE); | ||
2058 | |||
2059 | mid = (c_nritems + 1) / 2; | ||
2060 | |||
2061 | copy_extent_buffer(split, c, | ||
2062 | btrfs_node_key_ptr_offset(0), | ||
2063 | btrfs_node_key_ptr_offset(mid), | ||
2064 | (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); | ||
2065 | btrfs_set_header_nritems(split, c_nritems - mid); | ||
2066 | btrfs_set_header_nritems(c, mid); | ||
2067 | ret = 0; | ||
2068 | |||
2069 | btrfs_mark_buffer_dirty(c); | ||
2070 | btrfs_mark_buffer_dirty(split); | ||
2071 | |||
2072 | btrfs_node_key(split, &disk_key, 0); | ||
2073 | wret = insert_ptr(trans, root, path, &disk_key, split->start, | ||
2074 | path->slots[level + 1] + 1, | ||
2075 | level + 1); | ||
2076 | if (wret) | ||
2077 | ret = wret; | ||
2078 | |||
2079 | ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); | ||
2080 | BUG_ON(ret); | ||
2081 | |||
2082 | if (path->slots[level] >= mid) { | ||
2083 | path->slots[level] -= mid; | ||
2084 | btrfs_tree_unlock(c); | ||
2085 | free_extent_buffer(c); | ||
2086 | path->nodes[level] = split; | ||
2087 | path->slots[level + 1] += 1; | ||
2088 | } else { | ||
2089 | btrfs_tree_unlock(split); | ||
2090 | free_extent_buffer(split); | ||
2091 | } | ||
2092 | return ret; | ||
2093 | } | ||
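/*
 * Illustrative sketch, not kernel source, of the path fixup at the end
 * of split_node(): after entries [mid, nritems) move into the new
 * right-hand node, a path slot that pointed at or past mid must follow
 * them, and the parent slot advances by one.  The numbers here are
 * hypothetical.
 */
#include <stdio.h>

int main(void)
{
	int nritems = 7;
	int mid = (nritems + 1) / 2;	/* 4: entries 4..6 move right */
	int slot = 5, parent_slot = 2;

	if (slot >= mid) {
		slot -= mid;		/* same entry, new node       */
		parent_slot += 1;	/* path now descends rightward */
	}
	printf("slot %d, parent slot %d\n", slot, parent_slot);
	return 0;
}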
2094 | |||
2095 | /* | ||
2096 | * how many bytes are required to store the items in a leaf. start | ||
2097 | * and nr indicate which items in the leaf to check. This totals up the | ||
2098 | * space used both by the item structs and the item data | ||
2099 | */ | ||
2100 | static int leaf_space_used(struct extent_buffer *l, int start, int nr) | ||
2101 | { | ||
2102 | int data_len; | ||
2103 | int nritems = btrfs_header_nritems(l); | ||
2104 | int end = min(nritems, start + nr) - 1; | ||
2105 | |||
2106 | if (!nr) | ||
2107 | return 0; | ||
2108 | data_len = btrfs_item_end_nr(l, start); | ||
2109 | data_len = data_len - btrfs_item_offset_nr(l, end); | ||
2110 | data_len += sizeof(struct btrfs_item) * nr; | ||
2111 | WARN_ON(data_len < 0); | ||
2112 | return data_len; | ||
2113 | } | ||
2114 | |||
2115 | /* | ||
2116 | * The space between the end of the leaf items and | ||
2117 | * the start of the leaf data. IOW, how much room | ||
2118 | * the leaf has left for both items and data | ||
2119 | */ | ||
2120 | noinline int btrfs_leaf_free_space(struct btrfs_root *root, | ||
2121 | struct extent_buffer *leaf) | ||
2122 | { | ||
2123 | int nritems = btrfs_header_nritems(leaf); | ||
2124 | int ret; | ||
2125 | ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); | ||
2126 | if (ret < 0) { | ||
2127 | printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " | ||
2128 | "used %d nritems %d\n", | ||
2129 | ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), | ||
2130 | leaf_space_used(leaf, 0, nritems), nritems); | ||
2131 | } | ||
2132 | return ret; | ||
2133 | } | ||
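/*
 * Illustrative sketch, not kernel source, of the leaf space math used
 * by leaf_space_used() and btrfs_leaf_free_space(): item headers occupy
 * a fixed amount each at the front, their data is packed at the back,
 * and the bytes consumed by items [start, start + nr) are
 * (end of first item's data) - (offset of last item's data) plus the
 * headers.  The 4096-byte data area, ITEM_HDR, and struct toy_item are
 * hypothetical stand-ins.
 */
#include <stdio.h>

struct toy_item {
	unsigned int offset;	/* start of data within the data area */
	unsigned int size;
};

#define LEAF_DATA_SIZE 4096
#define ITEM_HDR 25		/* stand-in for sizeof(struct btrfs_item) */

static int space_used(const struct toy_item *it, int start, int nr)
{
	int end = start + nr - 1;
	int data_len = (it[start].offset + it[start].size) - it[end].offset;

	return data_len + ITEM_HDR * nr;
}

int main(void)
{
	/* data packed from the back of the leaf, highest offset first */
	struct toy_item items[3] = {
		{ 4096 - 100, 100 },	/* item 0 */
		{ 4096 - 160,  60 },	/* item 1 */
		{ 4096 - 200,  40 },	/* item 2 */
	};
	int used = space_used(items, 0, 3);

	printf("used %d, free %d\n", used, LEAF_DATA_SIZE - used);
	return 0;
}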
2134 | |||
2135 | /* | ||
2136 | * push some data in the path leaf to the right, trying to free up at | ||
2137 | * least data_size bytes. returns zero if the push worked, nonzero otherwise | ||
2138 | * | ||
2139 | * returns 1 if the push failed because the other node didn't have enough | ||
2140 | * room, 0 if everything worked out and < 0 if there were major errors. | ||
2141 | */ | ||
2142 | static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root | ||
2143 | *root, struct btrfs_path *path, int data_size, | ||
2144 | int empty) | ||
2145 | { | ||
2146 | struct extent_buffer *left = path->nodes[0]; | ||
2147 | struct extent_buffer *right; | ||
2148 | struct extent_buffer *upper; | ||
2149 | struct btrfs_disk_key disk_key; | ||
2150 | int slot; | ||
2151 | u32 i; | ||
2152 | int free_space; | ||
2153 | int push_space = 0; | ||
2154 | int push_items = 0; | ||
2155 | struct btrfs_item *item; | ||
2156 | u32 left_nritems; | ||
2157 | u32 nr; | ||
2158 | u32 right_nritems; | ||
2159 | u32 data_end; | ||
2160 | u32 this_item_size; | ||
2161 | int ret; | ||
2162 | |||
2163 | slot = path->slots[1]; | ||
2164 | if (!path->nodes[1]) | ||
2165 | return 1; | ||
2166 | |||
2167 | upper = path->nodes[1]; | ||
2168 | if (slot >= btrfs_header_nritems(upper) - 1) | ||
2169 | return 1; | ||
2170 | |||
2171 | WARN_ON(!btrfs_tree_locked(path->nodes[1])); | ||
2172 | |||
2173 | right = read_node_slot(root, upper, slot + 1); | ||
2174 | btrfs_tree_lock(right); | ||
2175 | free_space = btrfs_leaf_free_space(root, right); | ||
2176 | if (free_space < data_size) | ||
2177 | goto out_unlock; | ||
2178 | |||
2179 | /* cow and double check */ | ||
2180 | ret = btrfs_cow_block(trans, root, right, upper, | ||
2181 | slot + 1, &right, 0); | ||
2182 | if (ret) | ||
2183 | goto out_unlock; | ||
2184 | |||
2185 | free_space = btrfs_leaf_free_space(root, right); | ||
2186 | if (free_space < data_size) | ||
2187 | goto out_unlock; | ||
2188 | |||
2189 | left_nritems = btrfs_header_nritems(left); | ||
2190 | if (left_nritems == 0) | ||
2191 | goto out_unlock; | ||
2192 | |||
2193 | if (empty) | ||
2194 | nr = 0; | ||
2195 | else | ||
2196 | nr = 1; | ||
2197 | |||
2198 | if (path->slots[0] >= left_nritems) | ||
2199 | push_space += data_size; | ||
2200 | |||
2201 | i = left_nritems - 1; | ||
2202 | while (i >= nr) { | ||
2203 | item = btrfs_item_nr(left, i); | ||
2204 | |||
2205 | if (!empty && push_items > 0) { | ||
2206 | if (path->slots[0] > i) | ||
2207 | break; | ||
2208 | if (path->slots[0] == i) { | ||
2209 | int space = btrfs_leaf_free_space(root, left); | ||
2210 | if (space + push_space * 2 > free_space) | ||
2211 | break; | ||
2212 | } | ||
2213 | } | ||
2214 | |||
2215 | if (path->slots[0] == i) | ||
2216 | push_space += data_size; | ||
2217 | |||
2218 | if (!left->map_token) { | ||
2219 | map_extent_buffer(left, (unsigned long)item, | ||
2220 | sizeof(struct btrfs_item), | ||
2221 | &left->map_token, &left->kaddr, | ||
2222 | &left->map_start, &left->map_len, | ||
2223 | KM_USER1); | ||
2224 | } | ||
2225 | |||
2226 | this_item_size = btrfs_item_size(left, item); | ||
2227 | if (this_item_size + sizeof(*item) + push_space > free_space) | ||
2228 | break; | ||
2229 | |||
2230 | push_items++; | ||
2231 | push_space += this_item_size + sizeof(*item); | ||
2232 | if (i == 0) | ||
2233 | break; | ||
2234 | i--; | ||
2235 | } | ||
2236 | if (left->map_token) { | ||
2237 | unmap_extent_buffer(left, left->map_token, KM_USER1); | ||
2238 | left->map_token = NULL; | ||
2239 | } | ||
2240 | |||
2241 | if (push_items == 0) | ||
2242 | goto out_unlock; | ||
2243 | |||
2244 | if (!empty && push_items == left_nritems) | ||
2245 | WARN_ON(1); | ||
2246 | |||
2247 | /* push left to right */ | ||
2248 | right_nritems = btrfs_header_nritems(right); | ||
2249 | |||
2250 | push_space = btrfs_item_end_nr(left, left_nritems - push_items); | ||
2251 | push_space -= leaf_data_end(root, left); | ||
2252 | |||
2253 | /* make room in the right data area */ | ||
2254 | data_end = leaf_data_end(root, right); | ||
2255 | memmove_extent_buffer(right, | ||
2256 | btrfs_leaf_data(right) + data_end - push_space, | ||
2257 | btrfs_leaf_data(right) + data_end, | ||
2258 | BTRFS_LEAF_DATA_SIZE(root) - data_end); | ||
2259 | |||
2260 | /* copy from the left data area */ | ||
2261 | copy_extent_buffer(right, left, btrfs_leaf_data(right) + | ||
2262 | BTRFS_LEAF_DATA_SIZE(root) - push_space, | ||
2263 | btrfs_leaf_data(left) + leaf_data_end(root, left), | ||
2264 | push_space); | ||
2265 | |||
2266 | memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), | ||
2267 | btrfs_item_nr_offset(0), | ||
2268 | right_nritems * sizeof(struct btrfs_item)); | ||
2269 | |||
2270 | /* copy the items from left to right */ | ||
2271 | copy_extent_buffer(right, left, btrfs_item_nr_offset(0), | ||
2272 | btrfs_item_nr_offset(left_nritems - push_items), | ||
2273 | push_items * sizeof(struct btrfs_item)); | ||
2274 | |||
2275 | /* update the item pointers */ | ||
2276 | right_nritems += push_items; | ||
2277 | btrfs_set_header_nritems(right, right_nritems); | ||
2278 | push_space = BTRFS_LEAF_DATA_SIZE(root); | ||
2279 | for (i = 0; i < right_nritems; i++) { | ||
2280 | item = btrfs_item_nr(right, i); | ||
2281 | if (!right->map_token) { | ||
2282 | map_extent_buffer(right, (unsigned long)item, | ||
2283 | sizeof(struct btrfs_item), | ||
2284 | &right->map_token, &right->kaddr, | ||
2285 | &right->map_start, &right->map_len, | ||
2286 | KM_USER1); | ||
2287 | } | ||
2288 | push_space -= btrfs_item_size(right, item); | ||
2289 | btrfs_set_item_offset(right, item, push_space); | ||
2290 | } | ||
2291 | |||
2292 | if (right->map_token) { | ||
2293 | unmap_extent_buffer(right, right->map_token, KM_USER1); | ||
2294 | right->map_token = NULL; | ||
2295 | } | ||
2296 | left_nritems -= push_items; | ||
2297 | btrfs_set_header_nritems(left, left_nritems); | ||
2298 | |||
2299 | if (left_nritems) | ||
2300 | btrfs_mark_buffer_dirty(left); | ||
2301 | btrfs_mark_buffer_dirty(right); | ||
2302 | |||
2303 | ret = btrfs_update_ref(trans, root, left, right, 0, push_items); | ||
2304 | BUG_ON(ret); | ||
2305 | |||
2306 | btrfs_item_key(right, &disk_key, 0); | ||
2307 | btrfs_set_node_key(upper, &disk_key, slot + 1); | ||
2308 | btrfs_mark_buffer_dirty(upper); | ||
2309 | |||
2310 | /* then fixup the leaf pointer in the path */ | ||
2311 | if (path->slots[0] >= left_nritems) { | ||
2312 | path->slots[0] -= left_nritems; | ||
2313 | if (btrfs_header_nritems(path->nodes[0]) == 0) | ||
2314 | clean_tree_block(trans, root, path->nodes[0]); | ||
2315 | btrfs_tree_unlock(path->nodes[0]); | ||
2316 | free_extent_buffer(path->nodes[0]); | ||
2317 | path->nodes[0] = right; | ||
2318 | path->slots[1] += 1; | ||
2319 | } else { | ||
2320 | btrfs_tree_unlock(right); | ||
2321 | free_extent_buffer(right); | ||
2322 | } | ||
2323 | return 0; | ||
2324 | |||
2325 | out_unlock: | ||
2326 | btrfs_tree_unlock(right); | ||
2327 | free_extent_buffer(right); | ||
2328 | return 1; | ||
2329 | } | ||
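/*
 * Illustrative userspace sketch, not kernel source, of the leaf layout
 * push_leaf_right() has to preserve: item headers grow from the front
 * of the block and item data grows from the back, so after prepending
 * the pushed items' data to the right leaf, every data offset must be
 * re-based.  The loop below mirrors the "push_space -= item size; set
 * offset" pass in the function above; the toy structures are
 * hypothetical.
 */
#include <stdio.h>

#define LEAF_DATA_SIZE 4096

struct toy_item {
	unsigned int offset;
	unsigned int size;
};

int main(void)
{
	/* right leaf after receiving two pushed items at the front */
	struct toy_item right[4] = {
		{ 0, 100 }, { 0, 60 },	/* pushed in, offsets stale  */
		{ 0, 50 },  { 0, 30 },	/* pre-existing, offsets stale */
	};
	unsigned int push_space = LEAF_DATA_SIZE;
	int i;

	for (i = 0; i < 4; i++) {
		push_space -= right[i].size;
		right[i].offset = push_space;	/* pack from the back */
	}
	for (i = 0; i < 4; i++)
		printf("item %d: offset %u size %u\n",
		       i, right[i].offset, right[i].size);
	return 0;
}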
2330 | |||
2331 | /* | ||
2332 | * push some data in the path leaf to the left, trying to free up at | ||
2333 | * least data_size bytes. returns zero if the push worked, nonzero otherwise | ||
2334 | */ | ||
2335 | static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root | ||
2336 | *root, struct btrfs_path *path, int data_size, | ||
2337 | int empty) | ||
2338 | { | ||
2339 | struct btrfs_disk_key disk_key; | ||
2340 | struct extent_buffer *right = path->nodes[0]; | ||
2341 | struct extent_buffer *left; | ||
2342 | int slot; | ||
2343 | int i; | ||
2344 | int free_space; | ||
2345 | int push_space = 0; | ||
2346 | int push_items = 0; | ||
2347 | struct btrfs_item *item; | ||
2348 | u32 old_left_nritems; | ||
2349 | u32 right_nritems; | ||
2350 | u32 nr; | ||
2351 | int ret = 0; | ||
2352 | int wret; | ||
2353 | u32 this_item_size; | ||
2354 | u32 old_left_item_size; | ||
2355 | |||
2356 | slot = path->slots[1]; | ||
2357 | if (slot == 0) | ||
2358 | return 1; | ||
2359 | if (!path->nodes[1]) | ||
2360 | return 1; | ||
2361 | |||
2362 | right_nritems = btrfs_header_nritems(right); | ||
2363 | if (right_nritems == 0) | ||
2364 | return 1; | ||
2365 | |||
2366 | WARN_ON(!btrfs_tree_locked(path->nodes[1])); | ||
2367 | |||
2368 | left = read_node_slot(root, path->nodes[1], slot - 1); | ||
2369 | btrfs_tree_lock(left); | ||
2370 | free_space = btrfs_leaf_free_space(root, left); | ||
2371 | if (free_space < data_size) { | ||
2372 | ret = 1; | ||
2373 | goto out; | ||
2374 | } | ||
2375 | |||
2376 | /* cow and double check */ | ||
2377 | ret = btrfs_cow_block(trans, root, left, | ||
2378 | path->nodes[1], slot - 1, &left, 0); | ||
2379 | if (ret) { | ||
2380 | /* we hit -ENOSPC, but it isn't fatal here */ | ||
2381 | ret = 1; | ||
2382 | goto out; | ||
2383 | } | ||
2384 | |||
2385 | free_space = btrfs_leaf_free_space(root, left); | ||
2386 | if (free_space < data_size) { | ||
2387 | ret = 1; | ||
2388 | goto out; | ||
2389 | } | ||
2390 | |||
2391 | if (empty) | ||
2392 | nr = right_nritems; | ||
2393 | else | ||
2394 | nr = right_nritems - 1; | ||
2395 | |||
2396 | for (i = 0; i < nr; i++) { | ||
2397 | item = btrfs_item_nr(right, i); | ||
2398 | if (!right->map_token) { | ||
2399 | map_extent_buffer(right, (unsigned long)item, | ||
2400 | sizeof(struct btrfs_item), | ||
2401 | &right->map_token, &right->kaddr, | ||
2402 | &right->map_start, &right->map_len, | ||
2403 | KM_USER1); | ||
2404 | } | ||
2405 | |||
2406 | if (!empty && push_items > 0) { | ||
2407 | if (path->slots[0] < i) | ||
2408 | break; | ||
2409 | if (path->slots[0] == i) { | ||
2410 | int space = btrfs_leaf_free_space(root, right); | ||
2411 | if (space + push_space * 2 > free_space) | ||
2412 | break; | ||
2413 | } | ||
2414 | } | ||
2415 | |||
2416 | if (path->slots[0] == i) | ||
2417 | push_space += data_size; | ||
2418 | |||
2419 | this_item_size = btrfs_item_size(right, item); | ||
2420 | if (this_item_size + sizeof(*item) + push_space > free_space) | ||
2421 | break; | ||
2422 | |||
2423 | push_items++; | ||
2424 | push_space += this_item_size + sizeof(*item); | ||
2425 | } | ||
2426 | |||
2427 | if (right->map_token) { | ||
2428 | unmap_extent_buffer(right, right->map_token, KM_USER1); | ||
2429 | right->map_token = NULL; | ||
2430 | } | ||
2431 | |||
2432 | if (push_items == 0) { | ||
2433 | ret = 1; | ||
2434 | goto out; | ||
2435 | } | ||
2436 | if (!empty && push_items == btrfs_header_nritems(right)) | ||
2437 | WARN_ON(1); | ||
2438 | |||
2439 | /* push data from right to left */ | ||
2440 | copy_extent_buffer(left, right, | ||
2441 | btrfs_item_nr_offset(btrfs_header_nritems(left)), | ||
2442 | btrfs_item_nr_offset(0), | ||
2443 | push_items * sizeof(struct btrfs_item)); | ||
2444 | |||
2445 | push_space = BTRFS_LEAF_DATA_SIZE(root) - | ||
2446 | btrfs_item_offset_nr(right, push_items - 1); | ||
2447 | |||
2448 | copy_extent_buffer(left, right, btrfs_leaf_data(left) + | ||
2449 | leaf_data_end(root, left) - push_space, | ||
2450 | btrfs_leaf_data(right) + | ||
2451 | btrfs_item_offset_nr(right, push_items - 1), | ||
2452 | push_space); | ||
2453 | old_left_nritems = btrfs_header_nritems(left); | ||
2454 | BUG_ON(old_left_nritems <= 0); | ||
2455 | |||
2456 | old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); | ||
2457 | for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { | ||
2458 | u32 ioff; | ||
2459 | |||
2460 | item = btrfs_item_nr(left, i); | ||
2461 | if (!left->map_token) { | ||
2462 | map_extent_buffer(left, (unsigned long)item, | ||
2463 | sizeof(struct btrfs_item), | ||
2464 | &left->map_token, &left->kaddr, | ||
2465 | &left->map_start, &left->map_len, | ||
2466 | KM_USER1); | ||
2467 | } | ||
2468 | |||
2469 | ioff = btrfs_item_offset(left, item); | ||
2470 | btrfs_set_item_offset(left, item, | ||
2471 | ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); | ||
2472 | } | ||
2473 | btrfs_set_header_nritems(left, old_left_nritems + push_items); | ||
2474 | if (left->map_token) { | ||
2475 | unmap_extent_buffer(left, left->map_token, KM_USER1); | ||
2476 | left->map_token = NULL; | ||
2477 | } | ||
2478 | |||
2479 | /* fixup right node */ | ||
2480 | if (push_items > right_nritems) { | ||
2481 | printk(KERN_CRIT "push items %d nr %u\n", push_items, | ||
2482 | right_nritems); | ||
2483 | WARN_ON(1); | ||
2484 | } | ||
2485 | |||
2486 | if (push_items < right_nritems) { | ||
2487 | push_space = btrfs_item_offset_nr(right, push_items - 1) - | ||
2488 | leaf_data_end(root, right); | ||
2489 | memmove_extent_buffer(right, btrfs_leaf_data(right) + | ||
2490 | BTRFS_LEAF_DATA_SIZE(root) - push_space, | ||
2491 | btrfs_leaf_data(right) + | ||
2492 | leaf_data_end(root, right), push_space); | ||
2493 | |||
2494 | memmove_extent_buffer(right, btrfs_item_nr_offset(0), | ||
2495 | btrfs_item_nr_offset(push_items), | ||
2496 | (btrfs_header_nritems(right) - push_items) * | ||
2497 | sizeof(struct btrfs_item)); | ||
2498 | } | ||
2499 | right_nritems -= push_items; | ||
2500 | btrfs_set_header_nritems(right, right_nritems); | ||
2501 | push_space = BTRFS_LEAF_DATA_SIZE(root); | ||
2502 | for (i = 0; i < right_nritems; i++) { | ||
2503 | item = btrfs_item_nr(right, i); | ||
2504 | |||
2505 | if (!right->map_token) { | ||
2506 | map_extent_buffer(right, (unsigned long)item, | ||
2507 | sizeof(struct btrfs_item), | ||
2508 | &right->map_token, &right->kaddr, | ||
2509 | &right->map_start, &right->map_len, | ||
2510 | KM_USER1); | ||
2511 | } | ||
2512 | |||
2513 | push_space = push_space - btrfs_item_size(right, item); | ||
2514 | btrfs_set_item_offset(right, item, push_space); | ||
2515 | } | ||
2516 | if (right->map_token) { | ||
2517 | unmap_extent_buffer(right, right->map_token, KM_USER1); | ||
2518 | right->map_token = NULL; | ||
2519 | } | ||
2520 | |||
2521 | btrfs_mark_buffer_dirty(left); | ||
2522 | if (right_nritems) | ||
2523 | btrfs_mark_buffer_dirty(right); | ||
2524 | |||
2525 | ret = btrfs_update_ref(trans, root, right, left, | ||
2526 | old_left_nritems, push_items); | ||
2527 | BUG_ON(ret); | ||
2528 | |||
2529 | btrfs_item_key(right, &disk_key, 0); | ||
2530 | wret = fixup_low_keys(trans, root, path, &disk_key, 1); | ||
2531 | if (wret) | ||
2532 | ret = wret; | ||
2533 | |||
2534 | /* then fixup the leaf pointer in the path */ | ||
2535 | if (path->slots[0] < push_items) { | ||
2536 | path->slots[0] += old_left_nritems; | ||
2537 | if (btrfs_header_nritems(path->nodes[0]) == 0) | ||
2538 | clean_tree_block(trans, root, path->nodes[0]); | ||
2539 | btrfs_tree_unlock(path->nodes[0]); | ||
2540 | free_extent_buffer(path->nodes[0]); | ||
2541 | path->nodes[0] = left; | ||
2542 | path->slots[1] -= 1; | ||
2543 | } else { | ||
2544 | btrfs_tree_unlock(left); | ||
2545 | free_extent_buffer(left); | ||
2546 | path->slots[0] -= push_items; | ||
2547 | } | ||
2548 | BUG_ON(path->slots[0] < 0); | ||
2549 | return ret; | ||
2550 | out: | ||
2551 | btrfs_tree_unlock(left); | ||
2552 | free_extent_buffer(left); | ||
2553 | return ret; | ||
2554 | } | ||
2555 | |||
2556 | /* | ||
2557 | * split the path's leaf in two, making sure there is at least data_size | ||
2558 | * available for the resulting leaf level of the path. | ||
2559 | * | ||
2560 | * returns 0 if all went well and < 0 on failure. | ||
2561 | */ | ||
2562 | static noinline int split_leaf(struct btrfs_trans_handle *trans, | ||
2563 | struct btrfs_root *root, | ||
2564 | struct btrfs_key *ins_key, | ||
2565 | struct btrfs_path *path, int data_size, | ||
2566 | int extend) | ||
2567 | { | ||
2568 | struct extent_buffer *l; | ||
2569 | u32 nritems; | ||
2570 | int mid; | ||
2571 | int slot; | ||
2572 | struct extent_buffer *right; | ||
2573 | int data_copy_size; | ||
2574 | int rt_data_off; | ||
2575 | int i; | ||
2576 | int ret = 0; | ||
2577 | int wret; | ||
2578 | int double_split; | ||
2579 | int num_doubles = 0; | ||
2580 | struct btrfs_disk_key disk_key; | ||
2581 | |||
2582 | /* first try to make some room by pushing left and right */ | ||
2583 | if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { | ||
2584 | wret = push_leaf_right(trans, root, path, data_size, 0); | ||
2585 | if (wret < 0) | ||
2586 | return wret; | ||
2587 | if (wret) { | ||
2588 | wret = push_leaf_left(trans, root, path, data_size, 0); | ||
2589 | if (wret < 0) | ||
2590 | return wret; | ||
2591 | } | ||
2592 | l = path->nodes[0]; | ||
2593 | |||
2594 | /* did the pushes work? */ | ||
2595 | if (btrfs_leaf_free_space(root, l) >= data_size) | ||
2596 | return 0; | ||
2597 | } | ||
2598 | |||
2599 | if (!path->nodes[1]) { | ||
2600 | ret = insert_new_root(trans, root, path, 1); | ||
2601 | if (ret) | ||
2602 | return ret; | ||
2603 | } | ||
2604 | again: | ||
2605 | double_split = 0; | ||
2606 | l = path->nodes[0]; | ||
2607 | slot = path->slots[0]; | ||
2608 | nritems = btrfs_header_nritems(l); | ||
2609 | mid = (nritems + 1) / 2; | ||
2610 | |||
2611 | right = btrfs_alloc_free_block(trans, root, root->leafsize, | ||
2612 | path->nodes[1]->start, | ||
2613 | root->root_key.objectid, | ||
2614 | trans->transid, 0, l->start, 0); | ||
2615 | if (IS_ERR(right)) { | ||
2616 | BUG_ON(1); | ||
2617 | return PTR_ERR(right); | ||
2618 | } | ||
2619 | |||
2620 | memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); | ||
2621 | btrfs_set_header_bytenr(right, right->start); | ||
2622 | btrfs_set_header_generation(right, trans->transid); | ||
2623 | btrfs_set_header_owner(right, root->root_key.objectid); | ||
2624 | btrfs_set_header_level(right, 0); | ||
2625 | write_extent_buffer(right, root->fs_info->fsid, | ||
2626 | (unsigned long)btrfs_header_fsid(right), | ||
2627 | BTRFS_FSID_SIZE); | ||
2628 | |||
2629 | write_extent_buffer(right, root->fs_info->chunk_tree_uuid, | ||
2630 | (unsigned long)btrfs_header_chunk_tree_uuid(right), | ||
2631 | BTRFS_UUID_SIZE); | ||
2632 | if (mid <= slot) { | ||
2633 | if (nritems == 1 || | ||
2634 | leaf_space_used(l, mid, nritems - mid) + data_size > | ||
2635 | BTRFS_LEAF_DATA_SIZE(root)) { | ||
2636 | if (slot >= nritems) { | ||
2637 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | ||
2638 | btrfs_set_header_nritems(right, 0); | ||
2639 | wret = insert_ptr(trans, root, path, | ||
2640 | &disk_key, right->start, | ||
2641 | path->slots[1] + 1, 1); | ||
2642 | if (wret) | ||
2643 | ret = wret; | ||
2644 | |||
2645 | btrfs_tree_unlock(path->nodes[0]); | ||
2646 | free_extent_buffer(path->nodes[0]); | ||
2647 | path->nodes[0] = right; | ||
2648 | path->slots[0] = 0; | ||
2649 | path->slots[1] += 1; | ||
2650 | btrfs_mark_buffer_dirty(right); | ||
2651 | return ret; | ||
2652 | } | ||
2653 | mid = slot; | ||
2654 | if (mid != nritems && | ||
2655 | leaf_space_used(l, mid, nritems - mid) + | ||
2656 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
2657 | double_split = 1; | ||
2658 | } | ||
2659 | } | ||
2660 | } else { | ||
2661 | if (leaf_space_used(l, 0, mid) + data_size > | ||
2662 | BTRFS_LEAF_DATA_SIZE(root)) { | ||
2663 | if (!extend && data_size && slot == 0) { | ||
2664 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | ||
2665 | btrfs_set_header_nritems(right, 0); | ||
2666 | wret = insert_ptr(trans, root, path, | ||
2667 | &disk_key, | ||
2668 | right->start, | ||
2669 | path->slots[1], 1); | ||
2670 | if (wret) | ||
2671 | ret = wret; | ||
2672 | btrfs_tree_unlock(path->nodes[0]); | ||
2673 | free_extent_buffer(path->nodes[0]); | ||
2674 | path->nodes[0] = right; | ||
2675 | path->slots[0] = 0; | ||
2676 | if (path->slots[1] == 0) { | ||
2677 | wret = fixup_low_keys(trans, root, | ||
2678 | path, &disk_key, 1); | ||
2679 | if (wret) | ||
2680 | ret = wret; | ||
2681 | } | ||
2682 | btrfs_mark_buffer_dirty(right); | ||
2683 | return ret; | ||
2684 | } else if ((extend || !data_size) && slot == 0) { | ||
2685 | mid = 1; | ||
2686 | } else { | ||
2687 | mid = slot; | ||
2688 | if (mid != nritems && | ||
2689 | leaf_space_used(l, mid, nritems - mid) + | ||
2690 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
2691 | double_split = 1; | ||
2692 | } | ||
2693 | } | ||
2694 | } | ||
2695 | } | ||
2696 | nritems = nritems - mid; | ||
2697 | btrfs_set_header_nritems(right, nritems); | ||
2698 | data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); | ||
2699 | |||
2700 | copy_extent_buffer(right, l, btrfs_item_nr_offset(0), | ||
2701 | btrfs_item_nr_offset(mid), | ||
2702 | nritems * sizeof(struct btrfs_item)); | ||
2703 | |||
2704 | copy_extent_buffer(right, l, | ||
2705 | btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - | ||
2706 | data_copy_size, btrfs_leaf_data(l) + | ||
2707 | leaf_data_end(root, l), data_copy_size); | ||
2708 | |||
2709 | rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - | ||
2710 | btrfs_item_end_nr(l, mid); | ||
2711 | |||
2712 | for (i = 0; i < nritems; i++) { | ||
2713 | struct btrfs_item *item = btrfs_item_nr(right, i); | ||
2714 | u32 ioff; | ||
2715 | |||
2716 | if (!right->map_token) { | ||
2717 | map_extent_buffer(right, (unsigned long)item, | ||
2718 | sizeof(struct btrfs_item), | ||
2719 | &right->map_token, &right->kaddr, | ||
2720 | &right->map_start, &right->map_len, | ||
2721 | KM_USER1); | ||
2722 | } | ||
2723 | |||
2724 | ioff = btrfs_item_offset(right, item); | ||
2725 | btrfs_set_item_offset(right, item, ioff + rt_data_off); | ||
2726 | } | ||
2727 | |||
2728 | if (right->map_token) { | ||
2729 | unmap_extent_buffer(right, right->map_token, KM_USER1); | ||
2730 | right->map_token = NULL; | ||
2731 | } | ||
2732 | |||
2733 | btrfs_set_header_nritems(l, mid); | ||
2734 | ret = 0; | ||
2735 | btrfs_item_key(right, &disk_key, 0); | ||
2736 | wret = insert_ptr(trans, root, path, &disk_key, right->start, | ||
2737 | path->slots[1] + 1, 1); | ||
2738 | if (wret) | ||
2739 | ret = wret; | ||
2740 | |||
2741 | btrfs_mark_buffer_dirty(right); | ||
2742 | btrfs_mark_buffer_dirty(l); | ||
2743 | BUG_ON(path->slots[0] != slot); | ||
2744 | |||
2745 | ret = btrfs_update_ref(trans, root, l, right, 0, nritems); | ||
2746 | BUG_ON(ret); | ||
2747 | |||
2748 | if (mid <= slot) { | ||
2749 | btrfs_tree_unlock(path->nodes[0]); | ||
2750 | free_extent_buffer(path->nodes[0]); | ||
2751 | path->nodes[0] = right; | ||
2752 | path->slots[0] -= mid; | ||
2753 | path->slots[1] += 1; | ||
2754 | } else { | ||
2755 | btrfs_tree_unlock(right); | ||
2756 | free_extent_buffer(right); | ||
2757 | } | ||
2758 | |||
2759 | BUG_ON(path->slots[0] < 0); | ||
2760 | |||
2761 | if (double_split) { | ||
2762 | BUG_ON(num_doubles != 0); | ||
2763 | num_doubles++; | ||
2764 | goto again; | ||
2765 | } | ||
2766 | return ret; | ||
2767 | } | ||
2768 | |||
2769 | /* | ||
2770 | * This function splits a single item into two items, | ||
2771 | * giving 'new_key' to the new item and splitting the | ||
2772 | * old one at split_offset (from the start of the item). | ||
2773 | * | ||
2774 | * The path may be released by this operation. After | ||
2775 | * the split, the path is pointing to the old item. The | ||
2776 | * new item is going to be in the same node as the old one. | ||
2777 | * | ||
2778 | * Note, the item being split must be small enough to live alone on | ||
2779 | * a tree block with room for one extra struct btrfs_item | ||
2780 | * | ||
2781 | * This allows us to split the item in place, keeping a lock on the | ||
2782 | * leaf the entire time. | ||
2783 | */ | ||
2784 | int btrfs_split_item(struct btrfs_trans_handle *trans, | ||
2785 | struct btrfs_root *root, | ||
2786 | struct btrfs_path *path, | ||
2787 | struct btrfs_key *new_key, | ||
2788 | unsigned long split_offset) | ||
2789 | { | ||
2790 | u32 item_size; | ||
2791 | struct extent_buffer *leaf; | ||
2792 | struct btrfs_key orig_key; | ||
2793 | struct btrfs_item *item; | ||
2794 | struct btrfs_item *new_item; | ||
2795 | int ret = 0; | ||
2796 | int slot; | ||
2797 | u32 nritems; | ||
2798 | u32 orig_offset; | ||
2799 | struct btrfs_disk_key disk_key; | ||
2800 | char *buf; | ||
2801 | |||
2802 | leaf = path->nodes[0]; | ||
2803 | btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); | ||
2804 | if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) | ||
2805 | goto split; | ||
2806 | |||
2807 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
2808 | btrfs_release_path(root, path); | ||
2809 | |||
2810 | path->search_for_split = 1; | ||
2811 | path->keep_locks = 1; | ||
2812 | |||
2813 | ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); | ||
2814 | path->search_for_split = 0; | ||
2815 | |||
2816 | /* if our item isn't there or got smaller, return now */ | ||
2817 | if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], | ||
2818 | path->slots[0])) { | ||
2819 | path->keep_locks = 0; | ||
2820 | return -EAGAIN; | ||
2821 | } | ||
2822 | |||
2823 | ret = split_leaf(trans, root, &orig_key, path, | ||
2824 | sizeof(struct btrfs_item), 1); | ||
2825 | path->keep_locks = 0; | ||
2826 | BUG_ON(ret); | ||
2827 | |||
2828 | leaf = path->nodes[0]; | ||
2829 | BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); | ||
2830 | |||
2831 | split: | ||
2832 | item = btrfs_item_nr(leaf, path->slots[0]); | ||
2833 | orig_offset = btrfs_item_offset(leaf, item); | ||
2834 | item_size = btrfs_item_size(leaf, item); | ||
2835 | |||
2837 | buf = kmalloc(item_size, GFP_NOFS); | ||
     | if (!buf) | ||
     | return -ENOMEM; | ||
2838 | read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, | ||
2839 | path->slots[0]), item_size); | ||
2840 | slot = path->slots[0] + 1; | ||
2841 | leaf = path->nodes[0]; | ||
2842 | |||
2843 | nritems = btrfs_header_nritems(leaf); | ||
2844 | |||
2845 | if (slot != nritems) { | ||
2846 | /* shift the items */ | ||
2847 | memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), | ||
2848 | btrfs_item_nr_offset(slot), | ||
2849 | (nritems - slot) * sizeof(struct btrfs_item)); | ||
2850 | |||
2851 | } | ||
2852 | |||
2853 | btrfs_cpu_key_to_disk(&disk_key, new_key); | ||
2854 | btrfs_set_item_key(leaf, &disk_key, slot); | ||
2855 | |||
2856 | new_item = btrfs_item_nr(leaf, slot); | ||
2857 | |||
2858 | btrfs_set_item_offset(leaf, new_item, orig_offset); | ||
2859 | btrfs_set_item_size(leaf, new_item, item_size - split_offset); | ||
2860 | |||
2861 | btrfs_set_item_offset(leaf, item, | ||
2862 | orig_offset + item_size - split_offset); | ||
2863 | btrfs_set_item_size(leaf, item, split_offset); | ||
2864 | |||
2865 | btrfs_set_header_nritems(leaf, nritems + 1); | ||
2866 | |||
2867 | /* write the data for the start of the original item */ | ||
2868 | write_extent_buffer(leaf, buf, | ||
2869 | btrfs_item_ptr_offset(leaf, path->slots[0]), | ||
2870 | split_offset); | ||
2871 | |||
2872 | /* write the data for the new item */ | ||
2873 | write_extent_buffer(leaf, buf + split_offset, | ||
2874 | btrfs_item_ptr_offset(leaf, slot), | ||
2875 | item_size - split_offset); | ||
2876 | btrfs_mark_buffer_dirty(leaf); | ||
2877 | |||
2878 | ret = 0; | ||
2879 | if (btrfs_leaf_free_space(root, leaf) < 0) { | ||
2880 | btrfs_print_leaf(root, leaf); | ||
2881 | BUG(); | ||
2882 | } | ||
2883 | kfree(buf); | ||
2884 | return ret; | ||
2885 | } | ||
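/*
 * Illustrative sketch, not kernel source, of the byte accounting in
 * btrfs_split_item(): an item of item_size bytes becomes a front piece
 * of split_offset bytes whose data offset shifts up, and a back piece
 * of item_size - split_offset bytes that keeps the original data
 * offset.  Plain memory and the offset value 1000 stand in for the
 * extent buffer and leaf data area.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[] = "frontpartbackpart";
	unsigned int item_size = (unsigned int)strlen(data);
	unsigned int split_offset = 9;		/* len of "frontpart" */
	unsigned int orig_offset = 1000;	/* hypothetical offset */

	/* new (second) item: keeps orig_offset, gets the tail bytes */
	unsigned int new_off = orig_offset;
	unsigned int new_size = item_size - split_offset;

	/* old item: moves past the tail, keeps the head bytes */
	unsigned int old_off = orig_offset + item_size - split_offset;
	unsigned int old_size = split_offset;

	printf("old: off %u size %u (%.*s)\n", old_off, old_size,
	       (int)old_size, data);
	printf("new: off %u size %u (%s)\n", new_off, new_size,
	       data + split_offset);
	return 0;
}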
2886 | |||
2887 | /* | ||
2888 | * make the item pointed to by the path smaller. new_size indicates | ||
2889 | * how small to make it, and from_end tells us if we just chop bytes | ||
2890 | * off the end of the item or if we shift the item to chop bytes off | ||
2891 | * the front. | ||
2892 | */ | ||
2893 | int btrfs_truncate_item(struct btrfs_trans_handle *trans, | ||
2894 | struct btrfs_root *root, | ||
2895 | struct btrfs_path *path, | ||
2896 | u32 new_size, int from_end) | ||
2897 | { | ||
2898 | int ret = 0; | ||
2899 | int slot; | ||
2900 | int slot_orig; | ||
2901 | struct extent_buffer *leaf; | ||
2902 | struct btrfs_item *item; | ||
2903 | u32 nritems; | ||
2904 | unsigned int data_end; | ||
2905 | unsigned int old_data_start; | ||
2906 | unsigned int old_size; | ||
2907 | unsigned int size_diff; | ||
2908 | int i; | ||
2909 | |||
2910 | slot_orig = path->slots[0]; | ||
2911 | leaf = path->nodes[0]; | ||
2912 | slot = path->slots[0]; | ||
2913 | |||
2914 | old_size = btrfs_item_size_nr(leaf, slot); | ||
2915 | if (old_size == new_size) | ||
2916 | return 0; | ||
2917 | |||
2918 | nritems = btrfs_header_nritems(leaf); | ||
2919 | data_end = leaf_data_end(root, leaf); | ||
2920 | |||
2921 | old_data_start = btrfs_item_offset_nr(leaf, slot); | ||
2922 | |||
2923 | size_diff = old_size - new_size; | ||
2924 | |||
2925 | BUG_ON(slot < 0); | ||
2926 | BUG_ON(slot >= nritems); | ||
2927 | |||
2928 | /* | ||
2929 | * item0..itemN ... dataN.offset..dataN.size .. data0.size | ||
2930 | */ | ||
2931 | /* first correct the data pointers */ | ||
2932 | for (i = slot; i < nritems; i++) { | ||
2933 | u32 ioff; | ||
2934 | item = btrfs_item_nr(leaf, i); | ||
2935 | |||
2936 | if (!leaf->map_token) { | ||
2937 | map_extent_buffer(leaf, (unsigned long)item, | ||
2938 | sizeof(struct btrfs_item), | ||
2939 | &leaf->map_token, &leaf->kaddr, | ||
2940 | &leaf->map_start, &leaf->map_len, | ||
2941 | KM_USER1); | ||
2942 | } | ||
2943 | |||
2944 | ioff = btrfs_item_offset(leaf, item); | ||
2945 | btrfs_set_item_offset(leaf, item, ioff + size_diff); | ||
2946 | } | ||
2947 | |||
2948 | if (leaf->map_token) { | ||
2949 | unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); | ||
2950 | leaf->map_token = NULL; | ||
2951 | } | ||
2952 | |||
2953 | /* shift the data */ | ||
2954 | if (from_end) { | ||
2955 | memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + | ||
2956 | data_end + size_diff, btrfs_leaf_data(leaf) + | ||
2957 | data_end, old_data_start + new_size - data_end); | ||
2958 | } else { | ||
2959 | struct btrfs_disk_key disk_key; | ||
2960 | u64 offset; | ||
2961 | |||
2962 | btrfs_item_key(leaf, &disk_key, slot); | ||
2963 | |||
2964 | if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { | ||
2965 | unsigned long ptr; | ||
2966 | struct btrfs_file_extent_item *fi; | ||
2967 | |||
2968 | fi = btrfs_item_ptr(leaf, slot, | ||
2969 | struct btrfs_file_extent_item); | ||
2970 | fi = (struct btrfs_file_extent_item *)( | ||
2971 | (unsigned long)fi - size_diff); | ||
2972 | |||
2973 | if (btrfs_file_extent_type(leaf, fi) == | ||
2974 | BTRFS_FILE_EXTENT_INLINE) { | ||
2975 | ptr = btrfs_item_ptr_offset(leaf, slot); | ||
2976 | memmove_extent_buffer(leaf, ptr, | ||
2977 | (unsigned long)fi, | ||
2978 | offsetof(struct btrfs_file_extent_item, | ||
2979 | disk_bytenr)); | ||
2980 | } | ||
2981 | } | ||
2982 | |||
2983 | memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + | ||
2984 | data_end + size_diff, btrfs_leaf_data(leaf) + | ||
2985 | data_end, old_data_start - data_end); | ||
2986 | |||
2987 | offset = btrfs_disk_key_offset(&disk_key); | ||
2988 | btrfs_set_disk_key_offset(&disk_key, offset + size_diff); | ||
2989 | btrfs_set_item_key(leaf, &disk_key, slot); | ||
2990 | if (slot == 0) | ||
2991 | fixup_low_keys(trans, root, path, &disk_key, 1); | ||
2992 | } | ||
2993 | |||
2994 | item = btrfs_item_nr(leaf, slot); | ||
2995 | btrfs_set_item_size(leaf, item, new_size); | ||
2996 | btrfs_mark_buffer_dirty(leaf); | ||
2997 | |||
2998 | ret = 0; | ||
2999 | if (btrfs_leaf_free_space(root, leaf) < 0) { | ||
3000 | btrfs_print_leaf(root, leaf); | ||
3001 | BUG(); | ||
3002 | } | ||
3003 | return ret; | ||
3004 | } | ||
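/*
 * Illustrative sketch, not kernel source, of truncating an item from
 * the front, as btrfs_truncate_item() does when from_end == 0: the data
 * slides toward the back of the leaf and the item key's offset field
 * grows by the number of bytes chopped, so the key still names the
 * first byte the item describes.  The sizes and file position below
 * are hypothetical.
 */
#include <stdio.h>

int main(void)
{
	unsigned int old_size = 4096, new_size = 1024;
	unsigned long long key_offset = 8192;	/* e.g. file position */
	unsigned int size_diff = old_size - new_size;

	key_offset += size_diff;	/* the front bytes are gone */
	printf("new key offset %llu, new size %u\n", key_offset, new_size);
	return 0;
}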
3005 | |||
3006 | /* | ||
3007 | * make the item pointed to by the path bigger, data_size is the new size. | ||
3008 | */ | ||
3009 | int btrfs_extend_item(struct btrfs_trans_handle *trans, | ||
3010 | struct btrfs_root *root, struct btrfs_path *path, | ||
3011 | u32 data_size) | ||
3012 | { | ||
3013 | int ret = 0; | ||
3014 | int slot; | ||
3015 | int slot_orig; | ||
3016 | struct extent_buffer *leaf; | ||
3017 | struct btrfs_item *item; | ||
3018 | u32 nritems; | ||
3019 | unsigned int data_end; | ||
3020 | unsigned int old_data; | ||
3021 | unsigned int old_size; | ||
3022 | int i; | ||
3023 | |||
3024 | slot_orig = path->slots[0]; | ||
3025 | leaf = path->nodes[0]; | ||
3026 | |||
3027 | nritems = btrfs_header_nritems(leaf); | ||
3028 | data_end = leaf_data_end(root, leaf); | ||
3029 | |||
3030 | if (btrfs_leaf_free_space(root, leaf) < data_size) { | ||
3031 | btrfs_print_leaf(root, leaf); | ||
3032 | BUG(); | ||
3033 | } | ||
3034 | slot = path->slots[0]; | ||
3035 | old_data = btrfs_item_end_nr(leaf, slot); | ||
3036 | |||
3037 | BUG_ON(slot < 0); | ||
3038 | if (slot >= nritems) { | ||
3039 | btrfs_print_leaf(root, leaf); | ||
3040 | printk(KERN_CRIT "slot %d too large, nritems %d\n", | ||
3041 | slot, nritems); | ||
3042 | BUG_ON(1); | ||
3043 | } | ||
3044 | |||
3045 | /* | ||
3046 | * item0..itemN ... dataN.offset..dataN.size .. data0.size | ||
3047 | */ | ||
3048 | /* first correct the data pointers */ | ||
3049 | for (i = slot; i < nritems; i++) { | ||
3050 | u32 ioff; | ||
3051 | item = btrfs_item_nr(leaf, i); | ||
3052 | |||
3053 | if (!leaf->map_token) { | ||
3054 | map_extent_buffer(leaf, (unsigned long)item, | ||
3055 | sizeof(struct btrfs_item), | ||
3056 | &leaf->map_token, &leaf->kaddr, | ||
3057 | &leaf->map_start, &leaf->map_len, | ||
3058 | KM_USER1); | ||
3059 | } | ||
3060 | ioff = btrfs_item_offset(leaf, item); | ||
3061 | btrfs_set_item_offset(leaf, item, ioff - data_size); | ||
3062 | } | ||
3063 | |||
3064 | if (leaf->map_token) { | ||
3065 | unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); | ||
3066 | leaf->map_token = NULL; | ||
3067 | } | ||
3068 | |||
3069 | /* shift the data */ | ||
3070 | memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + | ||
3071 | data_end - data_size, btrfs_leaf_data(leaf) + | ||
3072 | data_end, old_data - data_end); | ||
3073 | |||
3074 | data_end = old_data; | ||
3075 | old_size = btrfs_item_size_nr(leaf, slot); | ||
3076 | item = btrfs_item_nr(leaf, slot); | ||
3077 | btrfs_set_item_size(leaf, item, old_size + data_size); | ||
3078 | btrfs_mark_buffer_dirty(leaf); | ||
3079 | |||
3080 | ret = 0; | ||
3081 | if (btrfs_leaf_free_space(root, leaf) < 0) { | ||
3082 | btrfs_print_leaf(root, leaf); | ||
3083 | BUG(); | ||
3084 | } | ||
3085 | return ret; | ||
3086 | } | ||
3087 | |||
3088 | /* | ||
3089 | * Given a key and some data, insert items into the tree. | ||
3090 | * This does all the path init required, making room in the tree if needed. | ||
3091 | * Returns the number of keys that were inserted. | ||
3092 | */ | ||
3093 | int btrfs_insert_some_items(struct btrfs_trans_handle *trans, | ||
3094 | struct btrfs_root *root, | ||
3095 | struct btrfs_path *path, | ||
3096 | struct btrfs_key *cpu_key, u32 *data_size, | ||
3097 | int nr) | ||
3098 | { | ||
3099 | struct extent_buffer *leaf; | ||
3100 | struct btrfs_item *item; | ||
3101 | int ret = 0; | ||
3102 | int slot; | ||
3103 | int i; | ||
3104 | u32 nritems; | ||
3105 | u32 total_data = 0; | ||
3106 | u32 total_size = 0; | ||
3107 | unsigned int data_end; | ||
3108 | struct btrfs_disk_key disk_key; | ||
3109 | struct btrfs_key found_key; | ||
3110 | |||
3111 | for (i = 0; i < nr; i++) { | ||
3112 | if (total_size + data_size[i] + sizeof(struct btrfs_item) > | ||
3113 | BTRFS_LEAF_DATA_SIZE(root)) { | ||
3114 | nr = i; | ||
3115 | break; | ||
3116 | } | ||
3117 | total_data += data_size[i]; | ||
3118 | total_size += data_size[i] + sizeof(struct btrfs_item); | ||
3119 | } | ||
3120 | BUG_ON(nr == 0); | ||
3121 | |||
3122 | ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); | ||
3123 | if (ret == 0) | ||
3124 | return -EEXIST; | ||
3125 | if (ret < 0) | ||
3126 | goto out; | ||
3127 | |||
3128 | leaf = path->nodes[0]; | ||
3129 | |||
3130 | nritems = btrfs_header_nritems(leaf); | ||
3131 | data_end = leaf_data_end(root, leaf); | ||
3132 | |||
3133 | if (btrfs_leaf_free_space(root, leaf) < total_size) { | ||
3134 | for (i = nr - 1; i >= 0; i--) { | ||
3135 | total_data -= data_size[i]; | ||
3136 | total_size -= data_size[i] + sizeof(struct btrfs_item); | ||
3137 | if (total_size < btrfs_leaf_free_space(root, leaf)) | ||
3138 | break; | ||
3139 | } | ||
3140 | nr = i; | ||
3141 | } | ||
3142 | |||
3143 | slot = path->slots[0]; | ||
3144 | BUG_ON(slot < 0); | ||
3145 | |||
3146 | if (slot != nritems) { | ||
3147 | unsigned int old_data = btrfs_item_end_nr(leaf, slot); | ||
3148 | |||
3149 | item = btrfs_item_nr(leaf, slot); | ||
3150 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
3151 | |||
3152 | /* figure out how many keys we can insert in here */ | ||
3153 | total_data = data_size[0]; | ||
3154 | for (i = 1; i < nr; i++) { | ||
3155 | if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) | ||
3156 | break; | ||
3157 | total_data += data_size[i]; | ||
3158 | } | ||
3159 | nr = i; | ||
3160 | |||
3161 | if (old_data < data_end) { | ||
3162 | btrfs_print_leaf(root, leaf); | ||
3163 | printk(KERN_CRIT "slot %d old_data %d data_end %d\n", | ||
3164 | slot, old_data, data_end); | ||
3165 | BUG_ON(1); | ||
3166 | } | ||
3167 | /* | ||
3168 | * item0..itemN ... dataN.offset..dataN.size .. data0.size | ||
3169 | */ | ||
3170 | /* first correct the data pointers */ | ||
3171 | WARN_ON(leaf->map_token); | ||
3172 | for (i = slot; i < nritems; i++) { | ||
3173 | u32 ioff; | ||
3174 | |||
3175 | item = btrfs_item_nr(leaf, i); | ||
3176 | if (!leaf->map_token) { | ||
3177 | map_extent_buffer(leaf, (unsigned long)item, | ||
3178 | sizeof(struct btrfs_item), | ||
3179 | &leaf->map_token, &leaf->kaddr, | ||
3180 | &leaf->map_start, &leaf->map_len, | ||
3181 | KM_USER1); | ||
3182 | } | ||
3183 | |||
3184 | ioff = btrfs_item_offset(leaf, item); | ||
3185 | btrfs_set_item_offset(leaf, item, ioff - total_data); | ||
3186 | } | ||
3187 | if (leaf->map_token) { | ||
3188 | unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); | ||
3189 | leaf->map_token = NULL; | ||
3190 | } | ||
3191 | |||
3192 | /* shift the items */ | ||
3193 | memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), | ||
3194 | btrfs_item_nr_offset(slot), | ||
3195 | (nritems - slot) * sizeof(struct btrfs_item)); | ||
3196 | |||
3197 | /* shift the data */ | ||
3198 | memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + | ||
3199 | data_end - total_data, btrfs_leaf_data(leaf) + | ||
3200 | data_end, old_data - data_end); | ||
3201 | data_end = old_data; | ||
3202 | } else { | ||
3203 | /* | ||
3204 | * this sucks but it has to be done: if we are inserting at | ||
3205 | * the end of the leaf, only insert 1 of the items, since we | ||
3206 | * have no way of knowing what's on the next leaf and we'd have | ||
3207 | * to drop our current locks to figure it out | ||
3208 | */ | ||
3209 | nr = 1; | ||
3210 | } | ||
3211 | |||
3212 | /* setup the item for the new data */ | ||
3213 | for (i = 0; i < nr; i++) { | ||
3214 | btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); | ||
3215 | btrfs_set_item_key(leaf, &disk_key, slot + i); | ||
3216 | item = btrfs_item_nr(leaf, slot + i); | ||
3217 | btrfs_set_item_offset(leaf, item, data_end - data_size[i]); | ||
3218 | data_end -= data_size[i]; | ||
3219 | btrfs_set_item_size(leaf, item, data_size[i]); | ||
3220 | } | ||
3221 | btrfs_set_header_nritems(leaf, nritems + nr); | ||
3222 | btrfs_mark_buffer_dirty(leaf); | ||
3223 | |||
3224 | ret = 0; | ||
3225 | if (slot == 0) { | ||
3226 | btrfs_cpu_key_to_disk(&disk_key, cpu_key); | ||
3227 | ret = fixup_low_keys(trans, root, path, &disk_key, 1); | ||
3228 | } | ||
3229 | |||
3230 | if (btrfs_leaf_free_space(root, leaf) < 0) { | ||
3231 | btrfs_print_leaf(root, leaf); | ||
3232 | BUG(); | ||
3233 | } | ||
3234 | out: | ||
3235 | if (!ret) | ||
3236 | ret = nr; | ||
3237 | return ret; | ||
3238 | } | ||
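Since a positive return here is the number of keys actually inserted, which may be less than nr, a caller has to loop over the remainder. A minimal, hedged sketch of that calling convention follows; example_insert_all is an illustrative name, not part of this patch, and a real caller would also fill in the freshly reserved item data before releasing the path.

static int example_insert_all(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_path *path,
			      struct btrfs_key *keys, u32 *sizes, int nr)
{
	int done = 0;
	int ret;

	while (done < nr) {
		/* ret is how many of the remaining keys went in */
		ret = btrfs_insert_some_items(trans, root, path,
					      keys + done, sizes + done,
					      nr - done);
		if (ret < 0)
			return ret;
		/* ... write the data for those 'ret' new items here ... */
		done += ret;
		/* drop the path before searching again for the rest */
		btrfs_release_path(root, path);
	}
	return 0;
}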
3239 | |||
3240 | /* | ||
3241 | * Given a key and some data, insert items into the tree. | ||
3242 | * This does all the path init required, making room in the tree if needed. | ||
3243 | */ | ||
3244 | int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, | ||
3245 | struct btrfs_root *root, | ||
3246 | struct btrfs_path *path, | ||
3247 | struct btrfs_key *cpu_key, u32 *data_size, | ||
3248 | int nr) | ||
3249 | { | ||
3250 | struct extent_buffer *leaf; | ||
3251 | struct btrfs_item *item; | ||
3252 | int ret = 0; | ||
3253 | int slot; | ||
3254 | int slot_orig; | ||
3255 | int i; | ||
3256 | u32 nritems; | ||
3257 | u32 total_size = 0; | ||
3258 | u32 total_data = 0; | ||
3259 | unsigned int data_end; | ||
3260 | struct btrfs_disk_key disk_key; | ||
3261 | |||
3262 | for (i = 0; i < nr; i++) | ||
3263 | total_data += data_size[i]; | ||
3264 | |||
3265 | total_size = total_data + (nr * sizeof(struct btrfs_item)); | ||
3266 | ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); | ||
3267 | if (ret == 0) | ||
3268 | return -EEXIST; | ||
3269 | if (ret < 0) | ||
3270 | goto out; | ||
3271 | |||
3272 | slot_orig = path->slots[0]; | ||
3273 | leaf = path->nodes[0]; | ||
3274 | |||
3275 | nritems = btrfs_header_nritems(leaf); | ||
3276 | data_end = leaf_data_end(root, leaf); | ||
3277 | |||
3278 | if (btrfs_leaf_free_space(root, leaf) < total_size) { | ||
3279 | btrfs_print_leaf(root, leaf); | ||
3280 | printk(KERN_CRIT "not enough freespace need %u have %d\n", | ||
3281 | total_size, btrfs_leaf_free_space(root, leaf)); | ||
3282 | BUG(); | ||
3283 | } | ||
3284 | |||
3285 | slot = path->slots[0]; | ||
3286 | BUG_ON(slot < 0); | ||
3287 | |||
3288 | if (slot != nritems) { | ||
3289 | unsigned int old_data = btrfs_item_end_nr(leaf, slot); | ||
3290 | |||
3291 | if (old_data < data_end) { | ||
3292 | btrfs_print_leaf(root, leaf); | ||
3293 | printk(KERN_CRIT "slot %d old_data %d data_end %d\n", | ||
3294 | slot, old_data, data_end); | ||
3295 | BUG_ON(1); | ||
3296 | } | ||
3297 | /* | ||
3298 | * item0..itemN ... dataN.offset..dataN.size .. data0.size | ||
3299 | */ | ||
3300 | /* first correct the data pointers */ | ||
3301 | WARN_ON(leaf->map_token); | ||
3302 | for (i = slot; i < nritems; i++) { | ||
3303 | u32 ioff; | ||
3304 | |||
3305 | item = btrfs_item_nr(leaf, i); | ||
3306 | if (!leaf->map_token) { | ||
3307 | map_extent_buffer(leaf, (unsigned long)item, | ||
3308 | sizeof(struct btrfs_item), | ||
3309 | &leaf->map_token, &leaf->kaddr, | ||
3310 | &leaf->map_start, &leaf->map_len, | ||
3311 | KM_USER1); | ||
3312 | } | ||
3313 | |||
3314 | ioff = btrfs_item_offset(leaf, item); | ||
3315 | btrfs_set_item_offset(leaf, item, ioff - total_data); | ||
3316 | } | ||
3317 | if (leaf->map_token) { | ||
3318 | unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); | ||
3319 | leaf->map_token = NULL; | ||
3320 | } | ||
3321 | |||
3322 | /* shift the items */ | ||
3323 | memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), | ||
3324 | btrfs_item_nr_offset(slot), | ||
3325 | (nritems - slot) * sizeof(struct btrfs_item)); | ||
3326 | |||
3327 | /* shift the data */ | ||
3328 | memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + | ||
3329 | data_end - total_data, btrfs_leaf_data(leaf) + | ||
3330 | data_end, old_data - data_end); | ||
3331 | data_end = old_data; | ||
3332 | } | ||
3333 | |||
3334 | /* setup the item for the new data */ | ||
3335 | for (i = 0; i < nr; i++) { | ||
3336 | btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); | ||
3337 | btrfs_set_item_key(leaf, &disk_key, slot + i); | ||
3338 | item = btrfs_item_nr(leaf, slot + i); | ||
3339 | btrfs_set_item_offset(leaf, item, data_end - data_size[i]); | ||
3340 | data_end -= data_size[i]; | ||
3341 | btrfs_set_item_size(leaf, item, data_size[i]); | ||
3342 | } | ||
3343 | btrfs_set_header_nritems(leaf, nritems + nr); | ||
3344 | btrfs_mark_buffer_dirty(leaf); | ||
3345 | |||
3346 | ret = 0; | ||
3347 | if (slot == 0) { | ||
3348 | btrfs_cpu_key_to_disk(&disk_key, cpu_key); | ||
3349 | ret = fixup_low_keys(trans, root, path, &disk_key, 1); | ||
3350 | } | ||
3351 | |||
3352 | if (btrfs_leaf_free_space(root, leaf) < 0) { | ||
3353 | btrfs_print_leaf(root, leaf); | ||
3354 | BUG(); | ||
3355 | } | ||
3356 | out: | ||
3357 | return ret; | ||
3358 | } | ||
3359 | |||
3360 | /* | ||
3361 | * Given a key and some data, insert an item into the tree. | ||
3362 | * This does all the path init required, making room in the tree if needed. | ||
3363 | */ | ||
3364 | int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root | ||
3365 | *root, struct btrfs_key *cpu_key, void *data, u32 | ||
3366 | data_size) | ||
3367 | { | ||
3368 | int ret = 0; | ||
3369 | struct btrfs_path *path; | ||
3370 | struct extent_buffer *leaf; | ||
3371 | unsigned long ptr; | ||
3372 | |||
3373 | path = btrfs_alloc_path(); | ||
3374 | BUG_ON(!path); | ||
3375 | ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); | ||
3376 | if (!ret) { | ||
3377 | leaf = path->nodes[0]; | ||
3378 | ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); | ||
3379 | write_extent_buffer(leaf, data, ptr, data_size); | ||
3380 | btrfs_mark_buffer_dirty(leaf); | ||
3381 | } | ||
3382 | btrfs_free_path(path); | ||
3383 | return ret; | ||
3384 | } | ||
3385 | |||
3386 | /* | ||
3387 | * delete the pointer from a given node. | ||
3388 | * | ||
3389 | * the tree should have been previously balanced so the deletion does not | ||
3390 | * empty a node. | ||
3391 | */ | ||
3392 | static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
3393 | struct btrfs_path *path, int level, int slot) | ||
3394 | { | ||
3395 | struct extent_buffer *parent = path->nodes[level]; | ||
3396 | u32 nritems; | ||
3397 | int ret = 0; | ||
3398 | int wret; | ||
3399 | |||
3400 | nritems = btrfs_header_nritems(parent); | ||
3401 | if (slot != nritems - 1) { | ||
3402 | memmove_extent_buffer(parent, | ||
3403 | btrfs_node_key_ptr_offset(slot), | ||
3404 | btrfs_node_key_ptr_offset(slot + 1), | ||
3405 | sizeof(struct btrfs_key_ptr) * | ||
3406 | (nritems - slot - 1)); | ||
3407 | } | ||
3408 | nritems--; | ||
3409 | btrfs_set_header_nritems(parent, nritems); | ||
3410 | if (nritems == 0 && parent == root->node) { | ||
3411 | BUG_ON(btrfs_header_level(root->node) != 1); | ||
3412 | /* just turn the root into a leaf */ | ||
3413 | btrfs_set_header_level(root->node, 0); | ||
3414 | } else if (slot == 0) { | ||
3415 | struct btrfs_disk_key disk_key; | ||
3416 | |||
3417 | btrfs_node_key(parent, &disk_key, 0); | ||
3418 | wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); | ||
3419 | if (wret) | ||
3420 | ret = wret; | ||
3421 | } | ||
3422 | btrfs_mark_buffer_dirty(parent); | ||
3423 | return ret; | ||
3424 | } | ||
3425 | |||
3426 | /* | ||
3427 | * a helper function to delete the leaf pointed to by path->slots[1] and | ||
3428 | * path->nodes[1]. bytenr is the node block pointer, but since the callers | ||
3429 | * already know it, it is faster to have them pass it down than to | ||
3430 | * read it out of the node again. | ||
3431 | * | ||
3432 | * This deletes the pointer in path->nodes[1] and frees the leaf | ||
3433 | * block extent. zero is returned if it all worked out, < 0 otherwise. | ||
3434 | * | ||
3435 | * The path must have already been set up for deleting the leaf, including | ||
3436 | * all the proper balancing. path->nodes[1] must be locked. | ||
3437 | */ | ||
3438 | noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, | ||
3439 | struct btrfs_root *root, | ||
3440 | struct btrfs_path *path, u64 bytenr) | ||
3441 | { | ||
3442 | int ret; | ||
3443 | u64 root_gen = btrfs_header_generation(path->nodes[1]); | ||
3444 | |||
3445 | ret = del_ptr(trans, root, path, 1, path->slots[1]); | ||
3446 | if (ret) | ||
3447 | return ret; | ||
3448 | |||
3449 | ret = btrfs_free_extent(trans, root, bytenr, | ||
3450 | btrfs_level_size(root, 0), | ||
3451 | path->nodes[1]->start, | ||
3452 | btrfs_header_owner(path->nodes[1]), | ||
3453 | root_gen, 0, 1); | ||
3454 | return ret; | ||
3455 | } | ||
3456 | /* | ||
3457 | * delete the item at the leaf level in path. If that empties | ||
3458 | * the leaf, remove it from the tree | ||
3459 | */ | ||
3460 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
3461 | struct btrfs_path *path, int slot, int nr) | ||
3462 | { | ||
3463 | struct extent_buffer *leaf; | ||
3464 | struct btrfs_item *item; | ||
3465 | int last_off; | ||
3466 | int dsize = 0; | ||
3467 | int ret = 0; | ||
3468 | int wret; | ||
3469 | int i; | ||
3470 | u32 nritems; | ||
3471 | |||
3472 | leaf = path->nodes[0]; | ||
3473 | last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); | ||
3474 | |||
3475 | for (i = 0; i < nr; i++) | ||
3476 | dsize += btrfs_item_size_nr(leaf, slot + i); | ||
3477 | |||
3478 | nritems = btrfs_header_nritems(leaf); | ||
3479 | |||
3480 | if (slot + nr != nritems) { | ||
3481 | int data_end = leaf_data_end(root, leaf); | ||
3482 | |||
3483 | memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + | ||
3484 | data_end + dsize, | ||
3485 | btrfs_leaf_data(leaf) + data_end, | ||
3486 | last_off - data_end); | ||
3487 | |||
3488 | for (i = slot + nr; i < nritems; i++) { | ||
3489 | u32 ioff; | ||
3490 | |||
3491 | item = btrfs_item_nr(leaf, i); | ||
3492 | if (!leaf->map_token) { | ||
3493 | map_extent_buffer(leaf, (unsigned long)item, | ||
3494 | sizeof(struct btrfs_item), | ||
3495 | &leaf->map_token, &leaf->kaddr, | ||
3496 | &leaf->map_start, &leaf->map_len, | ||
3497 | KM_USER1); | ||
3498 | } | ||
3499 | ioff = btrfs_item_offset(leaf, item); | ||
3500 | btrfs_set_item_offset(leaf, item, ioff + dsize); | ||
3501 | } | ||
3502 | |||
3503 | if (leaf->map_token) { | ||
3504 | unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); | ||
3505 | leaf->map_token = NULL; | ||
3506 | } | ||
3507 | |||
3508 | memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), | ||
3509 | btrfs_item_nr_offset(slot + nr), | ||
3510 | sizeof(struct btrfs_item) * | ||
3511 | (nritems - slot - nr)); | ||
3512 | } | ||
3513 | btrfs_set_header_nritems(leaf, nritems - nr); | ||
3514 | nritems -= nr; | ||
3515 | |||
3516 | /* delete the leaf if we've emptied it */ | ||
3517 | if (nritems == 0) { | ||
3518 | if (leaf == root->node) { | ||
3519 | btrfs_set_header_level(leaf, 0); | ||
3520 | } else { | ||
3521 | ret = btrfs_del_leaf(trans, root, path, leaf->start); | ||
3522 | BUG_ON(ret); | ||
3523 | } | ||
3524 | } else { | ||
3525 | int used = leaf_space_used(leaf, 0, nritems); | ||
3526 | if (slot == 0) { | ||
3527 | struct btrfs_disk_key disk_key; | ||
3528 | |||
3529 | btrfs_item_key(leaf, &disk_key, 0); | ||
3530 | wret = fixup_low_keys(trans, root, path, | ||
3531 | &disk_key, 1); | ||
3532 | if (wret) | ||
3533 | ret = wret; | ||
3534 | } | ||
3535 | |||
3536 | /* delete the leaf if it is mostly empty */ | ||
3537 | if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { | ||
3538 | /* push_leaf_left fixes the path. | ||
3539 | * make sure the path still points to our leaf | ||
3540 | * for possible call to del_ptr below | ||
3541 | */ | ||
3542 | slot = path->slots[1]; | ||
3543 | extent_buffer_get(leaf); | ||
3544 | |||
3545 | wret = push_leaf_left(trans, root, path, 1, 1); | ||
3546 | if (wret < 0 && wret != -ENOSPC) | ||
3547 | ret = wret; | ||
3548 | |||
3549 | if (path->nodes[0] == leaf && | ||
3550 | btrfs_header_nritems(leaf)) { | ||
3551 | wret = push_leaf_right(trans, root, path, 1, 1); | ||
3552 | if (wret < 0 && wret != -ENOSPC) | ||
3553 | ret = wret; | ||
3554 | } | ||
3555 | |||
3556 | if (btrfs_header_nritems(leaf) == 0) { | ||
3557 | path->slots[1] = slot; | ||
3558 | ret = btrfs_del_leaf(trans, root, path, | ||
3559 | leaf->start); | ||
3560 | BUG_ON(ret); | ||
3561 | free_extent_buffer(leaf); | ||
3562 | } else { | ||
3563 | /* if we're still in the path, make sure | ||
3564 | * we're dirty. Otherwise, one of the | ||
3565 | * push_leaf functions must have already | ||
3566 | * dirtied this buffer | ||
3567 | */ | ||
3568 | if (path->nodes[0] == leaf) | ||
3569 | btrfs_mark_buffer_dirty(leaf); | ||
3570 | free_extent_buffer(leaf); | ||
3571 | } | ||
3572 | } else { | ||
3573 | btrfs_mark_buffer_dirty(leaf); | ||
3574 | } | ||
3575 | } | ||
3576 | return ret; | ||
3577 | } | ||
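For context, a hedged sketch of a typical caller: search for the exact key with an ins_len of -1, which tells btrfs_search_slot to do the balancing a delete needs, then remove a single item. The function name is illustrative only.

static int example_delete_one(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* ins_len of -1 asks for the balancing needed by a delete */
	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret == 0)
		ret = btrfs_del_items(trans, root, path,
				      path->slots[0], 1);
	else if (ret > 0)
		ret = -ENOENT;	/* the exact key was not found */

	btrfs_free_path(path);
	return ret;
}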
3578 | |||
3579 | /* | ||
3580 | * search the tree again to find a leaf with lesser keys | ||
3581 | * returns 0 if it found something or 1 if there are no lesser leaves. | ||
3582 | * returns < 0 on io errors. | ||
3583 | * | ||
3584 | * This may release the path, and so you may lose any locks held at the | ||
3585 | * time you call it. | ||
3586 | */ | ||
3587 | int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) | ||
3588 | { | ||
3589 | struct btrfs_key key; | ||
3590 | struct btrfs_disk_key found_key; | ||
3591 | int ret; | ||
3592 | |||
3593 | btrfs_item_key_to_cpu(path->nodes[0], &key, 0); | ||
3594 | |||
3595 | if (key.offset > 0) | ||
3596 | key.offset--; | ||
3597 | else if (key.type > 0) | ||
3598 | key.type--; | ||
3599 | else if (key.objectid > 0) | ||
3600 | key.objectid--; | ||
3601 | else | ||
3602 | return 1; | ||
3603 | |||
3604 | btrfs_release_path(root, path); | ||
3605 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
3606 | if (ret < 0) | ||
3607 | return ret; | ||
3608 | btrfs_item_key(path->nodes[0], &found_key, 0); | ||
3609 | ret = comp_keys(&found_key, &key); | ||
3610 | if (ret < 0) | ||
3611 | return 0; | ||
3612 | return 1; | ||
3613 | } | ||
3614 | |||
3615 | /* | ||
3616 | * A helper function to walk down the tree starting at min_key, and looking | ||
3617 | * for nodes or leaves that are either in cache or have a minimum | ||
3618 | * transaction id. This is used by the btree defrag code, and tree logging | ||
3619 | * | ||
3620 | * This does not cow, but it does stuff the starting key it finds back | ||
3621 | * into min_key, so you can call btrfs_search_slot with cow=1 on the | ||
3622 | * key and get a writable path. | ||
3623 | * | ||
3624 | * This does lock as it descends, and path->keep_locks should be set | ||
3625 | * to 1 by the caller. | ||
3626 | * | ||
3627 | * This honors path->lowest_level to prevent descent past a given level | ||
3628 | * of the tree. | ||
3629 | * | ||
3630 | * min_trans indicates the oldest transaction that you are interested | ||
3631 | * in walking through. Any nodes or leaves older than min_trans are | ||
3632 | * skipped over (without reading them). | ||
3633 | * | ||
3634 | * returns zero if something useful was found, < 0 on error and 1 if there | ||
3635 | * was nothing in the tree that matched the search criteria. | ||
3636 | */ | ||
3637 | int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, | ||
3638 | struct btrfs_key *max_key, | ||
3639 | struct btrfs_path *path, int cache_only, | ||
3640 | u64 min_trans) | ||
3641 | { | ||
3642 | struct extent_buffer *cur; | ||
3643 | struct btrfs_key found_key; | ||
3644 | int slot; | ||
3645 | int sret; | ||
3646 | u32 nritems; | ||
3647 | int level; | ||
3648 | int ret = 1; | ||
3649 | |||
3650 | WARN_ON(!path->keep_locks); | ||
3651 | again: | ||
3652 | cur = btrfs_lock_root_node(root); | ||
3653 | level = btrfs_header_level(cur); | ||
3654 | WARN_ON(path->nodes[level]); | ||
3655 | path->nodes[level] = cur; | ||
3656 | path->locks[level] = 1; | ||
3657 | |||
3658 | if (btrfs_header_generation(cur) < min_trans) { | ||
3659 | ret = 1; | ||
3660 | goto out; | ||
3661 | } | ||
3662 | while (1) { | ||
3663 | nritems = btrfs_header_nritems(cur); | ||
3664 | level = btrfs_header_level(cur); | ||
3665 | sret = bin_search(cur, min_key, level, &slot); | ||
3666 | |||
3667 | /* at the lowest level, we're done, setup the path and exit */ | ||
3668 | if (level == path->lowest_level) { | ||
3669 | if (slot >= nritems) | ||
3670 | goto find_next_key; | ||
3671 | ret = 0; | ||
3672 | path->slots[level] = slot; | ||
3673 | btrfs_item_key_to_cpu(cur, &found_key, slot); | ||
3674 | goto out; | ||
3675 | } | ||
3676 | if (sret && slot > 0) | ||
3677 | slot--; | ||
3678 | /* | ||
3679 | * check this node pointer against the cache_only and | ||
3680 | * min_trans parameters. If it isn't in cache or is too | ||
3681 | * old, skip to the next one. | ||
3682 | */ | ||
3683 | while (slot < nritems) { | ||
3684 | u64 blockptr; | ||
3685 | u64 gen; | ||
3686 | struct extent_buffer *tmp; | ||
3687 | struct btrfs_disk_key disk_key; | ||
3688 | |||
3689 | blockptr = btrfs_node_blockptr(cur, slot); | ||
3690 | gen = btrfs_node_ptr_generation(cur, slot); | ||
3691 | if (gen < min_trans) { | ||
3692 | slot++; | ||
3693 | continue; | ||
3694 | } | ||
3695 | if (!cache_only) | ||
3696 | break; | ||
3697 | |||
3698 | if (max_key) { | ||
3699 | btrfs_node_key(cur, &disk_key, slot); | ||
3700 | if (comp_keys(&disk_key, max_key) >= 0) { | ||
3701 | ret = 1; | ||
3702 | goto out; | ||
3703 | } | ||
3704 | } | ||
3705 | |||
3706 | tmp = btrfs_find_tree_block(root, blockptr, | ||
3707 | btrfs_level_size(root, level - 1)); | ||
3708 | |||
3709 | if (tmp && btrfs_buffer_uptodate(tmp, gen)) { | ||
3710 | free_extent_buffer(tmp); | ||
3711 | break; | ||
3712 | } | ||
3713 | if (tmp) | ||
3714 | free_extent_buffer(tmp); | ||
3715 | slot++; | ||
3716 | } | ||
3717 | find_next_key: | ||
3718 | /* | ||
3719 | * we didn't find a candidate key in this node, walk forward | ||
3720 | * and find another one | ||
3721 | */ | ||
3722 | if (slot >= nritems) { | ||
3723 | path->slots[level] = slot; | ||
3724 | sret = btrfs_find_next_key(root, path, min_key, level, | ||
3725 | cache_only, min_trans); | ||
3726 | if (sret == 0) { | ||
3727 | btrfs_release_path(root, path); | ||
3728 | goto again; | ||
3729 | } else { | ||
3730 | goto out; | ||
3731 | } | ||
3732 | } | ||
3733 | /* save our key for returning back */ | ||
3734 | btrfs_node_key_to_cpu(cur, &found_key, slot); | ||
3735 | path->slots[level] = slot; | ||
3736 | if (level == path->lowest_level) { | ||
3737 | ret = 0; | ||
3738 | unlock_up(path, level, 1); | ||
3739 | goto out; | ||
3740 | } | ||
3741 | cur = read_node_slot(root, cur, slot); | ||
3742 | |||
3743 | btrfs_tree_lock(cur); | ||
3744 | path->locks[level - 1] = 1; | ||
3745 | path->nodes[level - 1] = cur; | ||
3746 | unlock_up(path, level, 1); | ||
3747 | } | ||
3748 | out: | ||
3749 | if (ret == 0) | ||
3750 | memcpy(min_key, &found_key, sizeof(found_key)); | ||
3751 | return ret; | ||
3752 | } | ||
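A hedged sketch of how a scanner such as the defrag code might drive this: keep_locks is set as required above, min_key is advanced past each hit, and the walk stops once 1 comes back. Names here are illustrative.

static int example_scan_newer_than(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->keep_locks = 1;

	min_key.objectid = 0;
	min_key.type = 0;
	min_key.offset = 0;

	while (1) {
		/* a NULL max_key means no upper bound on the walk */
		ret = btrfs_search_forward(root, &min_key, NULL, path,
					   0, min_trans);
		if (ret != 0)
			break;	/* < 0 is an error, 1 means nothing newer */

		/* min_key now holds the key that was found; process
		 * path->nodes[path->lowest_level] here, then step past it
		 */
		btrfs_release_path(root, path);
		if (min_key.offset == (u64)-1)
			break;
		min_key.offset++;
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

A real scanner would usually advance with btrfs_find_next_key (below) rather than bumping the offset, but the loop shape is the same.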
3753 | |||
3754 | /* | ||
3755 | * this is similar to btrfs_next_leaf, but does not try to preserve | ||
3756 | * and fixup the path. It looks for and returns the next key in the | ||
3757 | * tree based on the current path and the cache_only and min_trans | ||
3758 | * parameters. | ||
3759 | * | ||
3760 | * 0 is returned if another key is found, < 0 if there are any errors | ||
3761 | * and 1 is returned if there are no higher keys in the tree | ||
3762 | * | ||
3763 | * path->keep_locks should be set to 1 on the search made before | ||
3764 | * calling this function. | ||
3765 | */ | ||
3766 | int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, | ||
3767 | struct btrfs_key *key, int lowest_level, | ||
3768 | int cache_only, u64 min_trans) | ||
3769 | { | ||
3770 | int level = lowest_level; | ||
3771 | int slot; | ||
3772 | struct extent_buffer *c; | ||
3773 | |||
3774 | WARN_ON(!path->keep_locks); | ||
3775 | while (level < BTRFS_MAX_LEVEL) { | ||
3776 | if (!path->nodes[level]) | ||
3777 | return 1; | ||
3778 | |||
3779 | slot = path->slots[level] + 1; | ||
3780 | c = path->nodes[level]; | ||
3781 | next: | ||
3782 | if (slot >= btrfs_header_nritems(c)) { | ||
3783 | level++; | ||
3784 | if (level == BTRFS_MAX_LEVEL) | ||
3785 | return 1; | ||
3786 | continue; | ||
3787 | } | ||
3788 | if (level == 0) | ||
3789 | btrfs_item_key_to_cpu(c, key, slot); | ||
3790 | else { | ||
3791 | u64 blockptr = btrfs_node_blockptr(c, slot); | ||
3792 | u64 gen = btrfs_node_ptr_generation(c, slot); | ||
3793 | |||
3794 | if (cache_only) { | ||
3795 | struct extent_buffer *cur; | ||
3796 | cur = btrfs_find_tree_block(root, blockptr, | ||
3797 | btrfs_level_size(root, level - 1)); | ||
3798 | if (!cur || !btrfs_buffer_uptodate(cur, gen)) { | ||
3799 | slot++; | ||
3800 | if (cur) | ||
3801 | free_extent_buffer(cur); | ||
3802 | goto next; | ||
3803 | } | ||
3804 | free_extent_buffer(cur); | ||
3805 | } | ||
3806 | if (gen < min_trans) { | ||
3807 | slot++; | ||
3808 | goto next; | ||
3809 | } | ||
3810 | btrfs_node_key_to_cpu(c, key, slot); | ||
3811 | } | ||
3812 | return 0; | ||
3813 | } | ||
3814 | return 1; | ||
3815 | } | ||
3816 | |||
3817 | /* | ||
3818 | * search the tree again to find a leaf with greater keys | ||
3819 | * returns 0 if it found something or 1 if there are no greater leaves. | ||
3820 | * returns < 0 on io errors. | ||
3821 | */ | ||
3822 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) | ||
3823 | { | ||
3824 | int slot; | ||
3825 | int level = 1; | ||
3826 | struct extent_buffer *c; | ||
3827 | struct extent_buffer *next = NULL; | ||
3828 | struct btrfs_key key; | ||
3829 | u32 nritems; | ||
3830 | int ret; | ||
3831 | |||
3832 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
3833 | if (nritems == 0) | ||
3834 | return 1; | ||
3835 | |||
3836 | btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); | ||
3837 | |||
3838 | btrfs_release_path(root, path); | ||
3839 | path->keep_locks = 1; | ||
3840 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
3841 | path->keep_locks = 0; | ||
3842 | |||
3843 | if (ret < 0) | ||
3844 | return ret; | ||
3845 | |||
3846 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
3847 | /* | ||
3848 | * by releasing the path above we dropped all our locks. A balance | ||
3849 | * could have added more items next to the key that used to be | ||
3850 | * at the very end of the block. So, check again here and | ||
3851 | * advance the path if there are now more items available. | ||
3852 | */ | ||
3853 | if (nritems > 0 && path->slots[0] < nritems - 1) { | ||
3854 | path->slots[0]++; | ||
3855 | goto done; | ||
3856 | } | ||
3857 | |||
3858 | while (level < BTRFS_MAX_LEVEL) { | ||
3859 | if (!path->nodes[level]) | ||
3860 | return 1; | ||
3861 | |||
3862 | slot = path->slots[level] + 1; | ||
3863 | c = path->nodes[level]; | ||
3864 | if (slot >= btrfs_header_nritems(c)) { | ||
3865 | level++; | ||
3866 | if (level == BTRFS_MAX_LEVEL) | ||
3867 | return 1; | ||
3868 | continue; | ||
3869 | } | ||
3870 | |||
3871 | if (next) { | ||
3872 | btrfs_tree_unlock(next); | ||
3873 | free_extent_buffer(next); | ||
3874 | } | ||
3875 | |||
3876 | if (level == 1 && (path->locks[1] || path->skip_locking) && | ||
3877 | path->reada) | ||
3878 | reada_for_search(root, path, level, slot, 0); | ||
3879 | |||
3880 | next = read_node_slot(root, c, slot); | ||
3881 | if (!path->skip_locking) { | ||
3882 | WARN_ON(!btrfs_tree_locked(c)); | ||
3883 | btrfs_tree_lock(next); | ||
3884 | } | ||
3885 | break; | ||
3886 | } | ||
3887 | path->slots[level] = slot; | ||
3888 | while (1) { | ||
3889 | level--; | ||
3890 | c = path->nodes[level]; | ||
3891 | if (path->locks[level]) | ||
3892 | btrfs_tree_unlock(c); | ||
3893 | free_extent_buffer(c); | ||
3894 | path->nodes[level] = next; | ||
3895 | path->slots[level] = 0; | ||
3896 | if (!path->skip_locking) | ||
3897 | path->locks[level] = 1; | ||
3898 | if (!level) | ||
3899 | break; | ||
3900 | if (level == 1 && path->locks[1] && path->reada) | ||
3901 | reada_for_search(root, path, level, slot, 0); | ||
3902 | next = read_node_slot(root, next, 0); | ||
3903 | if (!path->skip_locking) { | ||
3904 | WARN_ON(!btrfs_tree_locked(path->nodes[level])); | ||
3905 | btrfs_tree_lock(next); | ||
3906 | } | ||
3907 | } | ||
3908 | done: | ||
3909 | unlock_up(path, 0, 1); | ||
3910 | return 0; | ||
3911 | } | ||
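btrfs_next_leaf is normally paired with btrfs_search_slot in the standard whole-tree iteration loop; a hedged sketch of that pattern (example_walk_tree is an illustrative name):

static int example_walk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* start at the smallest possible key */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;	/* no more leaves */
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* ... examine the item at path->slots[0] here ... */
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}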
3912 | |||
3913 | /* | ||
3914 | * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps | ||
3915 | * searching until it gets past min_objectid or finds an item of 'type' | ||
3916 | * | ||
3917 | * returns 0 if something is found, 1 if nothing was found and < 0 on error | ||
3918 | */ | ||
3919 | int btrfs_previous_item(struct btrfs_root *root, | ||
3920 | struct btrfs_path *path, u64 min_objectid, | ||
3921 | int type) | ||
3922 | { | ||
3923 | struct btrfs_key found_key; | ||
3924 | struct extent_buffer *leaf; | ||
3925 | u32 nritems; | ||
3926 | int ret; | ||
3927 | |||
3928 | while (1) { | ||
3929 | if (path->slots[0] == 0) { | ||
3930 | ret = btrfs_prev_leaf(root, path); | ||
3931 | if (ret != 0) | ||
3932 | return ret; | ||
3933 | } else { | ||
3934 | path->slots[0]--; | ||
3935 | } | ||
3936 | leaf = path->nodes[0]; | ||
3937 | nritems = btrfs_header_nritems(leaf); | ||
3938 | if (nritems == 0) | ||
3939 | return 1; | ||
3940 | if (path->slots[0] == nritems) | ||
3941 | path->slots[0]--; | ||
3942 | |||
3943 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
3944 | if (found_key.type == type) | ||
3945 | return 0; | ||
3946 | if (found_key.objectid < min_objectid) | ||
3947 | break; | ||
3948 | if (found_key.objectid == min_objectid && | ||
3949 | found_key.type < type) | ||
3950 | break; | ||
3951 | } | ||
3952 | return 1; | ||
3953 | } | ||
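A hedged sketch of the intended use: position with a search, then back up to the closest preceding item of the wanted type. The helper name is illustrative, and BTRFS_INODE_ITEM_KEY is used only as an example type.

static int example_find_prev_inode_item(struct btrfs_root *root,
					struct btrfs_key *search_key,
					struct btrfs_key *found_key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* 0 means found_key is valid, 1 means nothing suitable */
	ret = btrfs_previous_item(root, path, search_key->objectid,
				  BTRFS_INODE_ITEM_KEY);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found_key,
				      path->slots[0]);
out:
	btrfs_free_path(path);
	return ret;
}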
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 000000000000..eee060f88113 --- /dev/null +++ b/fs/btrfs/ctree.h | |||
@@ -0,0 +1,2129 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_CTREE__ | ||
20 | #define __BTRFS_CTREE__ | ||
21 | |||
22 | #include <linux/version.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/highmem.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/completion.h> | ||
27 | #include <linux/backing-dev.h> | ||
28 | #include <linux/wait.h> | ||
29 | #include <asm/kmap_types.h> | ||
30 | #include "extent_io.h" | ||
31 | #include "extent_map.h" | ||
32 | #include "async-thread.h" | ||
33 | |||
34 | struct btrfs_trans_handle; | ||
35 | struct btrfs_transaction; | ||
36 | extern struct kmem_cache *btrfs_trans_handle_cachep; | ||
37 | extern struct kmem_cache *btrfs_transaction_cachep; | ||
38 | extern struct kmem_cache *btrfs_bit_radix_cachep; | ||
39 | extern struct kmem_cache *btrfs_path_cachep; | ||
40 | struct btrfs_ordered_sum; | ||
41 | |||
42 | #define BTRFS_MAGIC "_BHRfS_M" | ||
43 | |||
44 | #define BTRFS_ACL_NOT_CACHED ((void *)-1) | ||
45 | |||
46 | #ifdef CONFIG_LOCKDEP | ||
47 | # define BTRFS_MAX_LEVEL 7 | ||
48 | #else | ||
49 | # define BTRFS_MAX_LEVEL 8 | ||
50 | #endif | ||
51 | |||
52 | /* holds pointers to all of the tree roots */ | ||
53 | #define BTRFS_ROOT_TREE_OBJECTID 1ULL | ||
54 | |||
55 | /* stores information about which extents are in use, and reference counts */ | ||
56 | #define BTRFS_EXTENT_TREE_OBJECTID 2ULL | ||
57 | |||
58 | /* | ||
59 | * chunk tree stores translations from logical -> physical block numbering | ||
60 | * the super block points to the chunk tree | ||
61 | */ | ||
62 | #define BTRFS_CHUNK_TREE_OBJECTID 3ULL | ||
63 | |||
64 | /* | ||
65 | * stores information about which areas of a given device are in use. | ||
66 | * one per device. The tree of tree roots points to the device tree | ||
67 | */ | ||
68 | #define BTRFS_DEV_TREE_OBJECTID 4ULL | ||
69 | |||
70 | /* one per subvolume, storing files and directories */ | ||
71 | #define BTRFS_FS_TREE_OBJECTID 5ULL | ||
72 | |||
73 | /* directory objectid inside the root tree */ | ||
74 | #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL | ||
75 | |||
76 | /* holds checksums of all the data extents */ | ||
77 | #define BTRFS_CSUM_TREE_OBJECTID 7ULL | ||
78 | |||
79 | /* orphan objectid for tracking unlinked/truncated files */ | ||
80 | #define BTRFS_ORPHAN_OBJECTID -5ULL | ||
81 | |||
82 | /* does write ahead logging to speed up fsyncs */ | ||
83 | #define BTRFS_TREE_LOG_OBJECTID -6ULL | ||
84 | #define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL | ||
85 | |||
86 | /* for space balancing */ | ||
87 | #define BTRFS_TREE_RELOC_OBJECTID -8ULL | ||
88 | #define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL | ||
89 | |||
90 | /* | ||
91 | * extent checksums all have this objectid | ||
92 | * this allows them to share the logging tree | ||
93 | * for fsyncs | ||
94 | */ | ||
95 | #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL | ||
96 | |||
97 | /* dummy objectid represents multiple objectids */ | ||
98 | #define BTRFS_MULTIPLE_OBJECTIDS -255ULL | ||
99 | |||
100 | /* | ||
101 | * All files have objectids in this range. | ||
102 | */ | ||
103 | #define BTRFS_FIRST_FREE_OBJECTID 256ULL | ||
104 | #define BTRFS_LAST_FREE_OBJECTID -256ULL | ||
105 | #define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL | ||
106 | |||
107 | |||
108 | /* | ||
109 | * the device items go into the chunk tree. The key is in the form | ||
110 | * [ 1 BTRFS_DEV_ITEM_KEY device_id ] | ||
111 | */ | ||
112 | #define BTRFS_DEV_ITEMS_OBJECTID 1ULL | ||
113 | |||
114 | /* | ||
115 | * we can actually store much bigger names, but let's not confuse the rest | ||
116 | * of Linux | ||
117 | */ | ||
118 | #define BTRFS_NAME_LEN 255 | ||
119 | |||
120 | /* 32 bytes in various csum fields */ | ||
121 | #define BTRFS_CSUM_SIZE 32 | ||
122 | |||
123 | /* csum types */ | ||
124 | #define BTRFS_CSUM_TYPE_CRC32 0 | ||
125 | |||
126 | static int btrfs_csum_sizes[] = { 4, 0 }; | ||
127 | |||
128 | /* four bytes for CRC32 */ | ||
129 | #define BTRFS_EMPTY_DIR_SIZE 0 | ||
130 | |||
131 | #define BTRFS_FT_UNKNOWN 0 | ||
132 | #define BTRFS_FT_REG_FILE 1 | ||
133 | #define BTRFS_FT_DIR 2 | ||
134 | #define BTRFS_FT_CHRDEV 3 | ||
135 | #define BTRFS_FT_BLKDEV 4 | ||
136 | #define BTRFS_FT_FIFO 5 | ||
137 | #define BTRFS_FT_SOCK 6 | ||
138 | #define BTRFS_FT_SYMLINK 7 | ||
139 | #define BTRFS_FT_XATTR 8 | ||
140 | #define BTRFS_FT_MAX 9 | ||
141 | |||
142 | /* | ||
143 | * the key defines the order in the tree, and so it also defines (optimal) | ||
144 | * block layout. objectid corresponds to the inode number. The type | ||
145 | * field tells us things about the object, and is a kind of stream selector. | ||
146 | * So for a given inode, keys with a type of 1 might refer to the inode | ||
147 | * data, type 2 may point to file data in the btree, and type 3 | ||
148 | * may point to extents. | ||
149 | * | ||
150 | * offset is the starting byte offset for this key in the stream. | ||
151 | * | ||
152 | * btrfs_disk_key is in disk byte order. struct btrfs_key is always | ||
153 | * in cpu native order. Otherwise they are identical and their sizes | ||
154 | * should be the same (i.e. both packed) | ||
155 | */ | ||
156 | struct btrfs_disk_key { | ||
157 | __le64 objectid; | ||
158 | u8 type; | ||
159 | __le64 offset; | ||
160 | } __attribute__ ((__packed__)); | ||
161 | |||
162 | struct btrfs_key { | ||
163 | u64 objectid; | ||
164 | u8 type; | ||
165 | u64 offset; | ||
166 | } __attribute__ ((__packed__)); | ||
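The ordering the comment above describes is plain lexicographic comparison of (objectid, type, offset). A minimal sketch of that rule, mirroring the static comp_keys/comp_cpu_keys helpers in ctree.c (the function name here is illustrative only):

static int example_comp_keys(const struct btrfs_key *k1,
			     const struct btrfs_key *k2)
{
	/* objectid is the most significant part of the key... */
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	/* ...then the item type... */
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	/* ...and offset breaks the remaining ties */
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}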
167 | |||
168 | struct btrfs_mapping_tree { | ||
169 | struct extent_map_tree map_tree; | ||
170 | }; | ||
171 | |||
172 | #define BTRFS_UUID_SIZE 16 | ||
173 | struct btrfs_dev_item { | ||
174 | /* the internal btrfs device id */ | ||
175 | __le64 devid; | ||
176 | |||
177 | /* size of the device */ | ||
178 | __le64 total_bytes; | ||
179 | |||
180 | /* bytes used */ | ||
181 | __le64 bytes_used; | ||
182 | |||
183 | /* optimal io alignment for this device */ | ||
184 | __le32 io_align; | ||
185 | |||
186 | /* optimal io width for this device */ | ||
187 | __le32 io_width; | ||
188 | |||
189 | /* minimal io size for this device */ | ||
190 | __le32 sector_size; | ||
191 | |||
192 | /* type and info about this device */ | ||
193 | __le64 type; | ||
194 | |||
195 | /* expected generation for this device */ | ||
196 | __le64 generation; | ||
197 | |||
198 | /* | ||
199 | * starting byte of this partition on the device, | ||
200 | * to allow for stripe alignment in the future | ||
201 | */ | ||
202 | __le64 start_offset; | ||
203 | |||
204 | /* grouping information for allocation decisions */ | ||
205 | __le32 dev_group; | ||
206 | |||
207 | /* seek speed 0-100 where 100 is fastest */ | ||
208 | u8 seek_speed; | ||
209 | |||
210 | /* bandwidth 0-100 where 100 is fastest */ | ||
211 | u8 bandwidth; | ||
212 | |||
213 | /* btrfs generated uuid for this device */ | ||
214 | u8 uuid[BTRFS_UUID_SIZE]; | ||
215 | |||
216 | /* uuid of FS who owns this device */ | ||
217 | u8 fsid[BTRFS_UUID_SIZE]; | ||
218 | } __attribute__ ((__packed__)); | ||
219 | |||
220 | struct btrfs_stripe { | ||
221 | __le64 devid; | ||
222 | __le64 offset; | ||
223 | u8 dev_uuid[BTRFS_UUID_SIZE]; | ||
224 | } __attribute__ ((__packed__)); | ||
225 | |||
226 | struct btrfs_chunk { | ||
227 | /* size of this chunk in bytes */ | ||
228 | __le64 length; | ||
229 | |||
230 | /* objectid of the root referencing this chunk */ | ||
231 | __le64 owner; | ||
232 | |||
233 | __le64 stripe_len; | ||
234 | __le64 type; | ||
235 | |||
236 | /* optimal io alignment for this chunk */ | ||
237 | __le32 io_align; | ||
238 | |||
239 | /* optimal io width for this chunk */ | ||
240 | __le32 io_width; | ||
241 | |||
242 | /* minimal io size for this chunk */ | ||
243 | __le32 sector_size; | ||
244 | |||
245 | /* 2^16 stripes is quite a lot; a second limit is the size of a single | ||
246 | * item in the btree | ||
247 | */ | ||
248 | __le16 num_stripes; | ||
249 | |||
250 | /* sub stripes only matter for raid10 */ | ||
251 | __le16 sub_stripes; | ||
252 | struct btrfs_stripe stripe; | ||
253 | /* additional stripes go here */ | ||
254 | } __attribute__ ((__packed__)); | ||
255 | |||
256 | static inline unsigned long btrfs_chunk_item_size(int num_stripes) | ||
257 | { | ||
258 | BUG_ON(num_stripes == 0); | ||
259 | return sizeof(struct btrfs_chunk) + | ||
260 | sizeof(struct btrfs_stripe) * (num_stripes - 1); | ||
261 | } | ||
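The subtraction of one stripe works because struct btrfs_chunk already embeds the first struct btrfs_stripe. A hedged check of the arithmetic (the helper is illustrative only):

static inline void example_chunk_item_sizes(void)
{
	/* one stripe costs only the base structure */
	BUG_ON(btrfs_chunk_item_size(1) != sizeof(struct btrfs_chunk));

	/* a 4-stripe (e.g. raid10) chunk adds three extra stripes */
	BUG_ON(btrfs_chunk_item_size(4) !=
	       sizeof(struct btrfs_chunk) +
	       3 * sizeof(struct btrfs_stripe));
}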
262 | |||
263 | #define BTRFS_FSID_SIZE 16 | ||
264 | #define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) | ||
265 | |||
266 | /* | ||
267 | * every tree block (leaf or node) starts with this header. | ||
268 | */ | ||
269 | struct btrfs_header { | ||
270 | /* these first four must match the super block */ | ||
271 | u8 csum[BTRFS_CSUM_SIZE]; | ||
272 | u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ | ||
273 | __le64 bytenr; /* which block this node is supposed to live in */ | ||
274 | __le64 flags; | ||
275 | |||
276 | /* allowed to be different from the super from here on down */ | ||
277 | u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; | ||
278 | __le64 generation; | ||
279 | __le64 owner; | ||
280 | __le32 nritems; | ||
281 | u8 level; | ||
282 | } __attribute__ ((__packed__)); | ||
283 | |||
284 | #define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ | ||
285 | sizeof(struct btrfs_header)) / \ | ||
286 | sizeof(struct btrfs_key_ptr)) | ||
287 | #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) | ||
288 | #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) | ||
289 | #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ | ||
290 | sizeof(struct btrfs_item) - \ | ||
291 | sizeof(struct btrfs_file_extent_item)) | ||
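As a hedged worked example, assume 4K nodes and leaves: the packed struct btrfs_header is 101 bytes and a struct btrfs_key_ptr is 33 bytes, so BTRFS_NODEPTRS_PER_BLOCK comes out to (4096 - 101) / 33 = 121 pointers per node, while __BTRFS_LEAF_DATA_SIZE(4096) leaves 3995 bytes to be split between struct btrfs_item headers (25 bytes each) and their data.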
292 | |||
293 | #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) | ||
294 | |||
295 | /* | ||
296 | * this is a very generous portion of the super block, giving us | ||
297 | * room to translate 14 chunks with 3 stripes each. | ||
298 | */ | ||
299 | #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 | ||
300 | #define BTRFS_LABEL_SIZE 256 | ||
301 | |||
302 | /* | ||
303 | * the super block basically lists the main trees of the FS | ||
304 | * it currently lacks any block count etc etc | ||
305 | */ | ||
306 | struct btrfs_super_block { | ||
307 | u8 csum[BTRFS_CSUM_SIZE]; | ||
308 | /* the first 4 fields must match struct btrfs_header */ | ||
309 | u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ | ||
310 | __le64 bytenr; /* this block number */ | ||
311 | __le64 flags; | ||
312 | |||
313 | * allowed to be different from the btrfs_header from here on down | ||
314 | __le64 magic; | ||
315 | __le64 generation; | ||
316 | __le64 root; | ||
317 | __le64 chunk_root; | ||
318 | __le64 log_root; | ||
319 | |||
320 | /* this will help find the new super based on the log root */ | ||
321 | __le64 log_root_transid; | ||
322 | __le64 total_bytes; | ||
323 | __le64 bytes_used; | ||
324 | __le64 root_dir_objectid; | ||
325 | __le64 num_devices; | ||
326 | __le32 sectorsize; | ||
327 | __le32 nodesize; | ||
328 | __le32 leafsize; | ||
329 | __le32 stripesize; | ||
330 | __le32 sys_chunk_array_size; | ||
331 | __le64 chunk_root_generation; | ||
332 | __le64 compat_flags; | ||
333 | __le64 compat_ro_flags; | ||
334 | __le64 incompat_flags; | ||
335 | __le16 csum_type; | ||
336 | u8 root_level; | ||
337 | u8 chunk_root_level; | ||
338 | u8 log_root_level; | ||
339 | struct btrfs_dev_item dev_item; | ||
340 | |||
341 | char label[BTRFS_LABEL_SIZE]; | ||
342 | |||
343 | /* future expansion */ | ||
344 | __le64 reserved[32]; | ||
345 | u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; | ||
346 | } __attribute__ ((__packed__)); | ||
347 | |||
348 | /* | ||
349 | * Feature flags that we support. If any incompat flags are set other than | ||
350 | * the ones specified below, we will fail to mount. | ||
351 | */ | ||
352 | #define BTRFS_FEATURE_COMPAT_SUPP 0x0 | ||
353 | #define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 | ||
354 | #define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 | ||
355 | |||
356 | /* | ||
357 | * A leaf is full of items. offset and size tell us where to find | ||
358 | * the item in the leaf (relative to the start of the data area) | ||
359 | */ | ||
360 | struct btrfs_item { | ||
361 | struct btrfs_disk_key key; | ||
362 | __le32 offset; | ||
363 | __le32 size; | ||
364 | } __attribute__ ((__packed__)); | ||
365 | |||
366 | /* | ||
367 | * leaves have an item area and a data area: | ||
368 | * [item0, item1....itemN] [free space] [dataN...data1, data0] | ||
369 | * | ||
370 | * The data is separate from the items to get the keys closer together | ||
371 | * during searches. | ||
372 | */ | ||
373 | struct btrfs_leaf { | ||
374 | struct btrfs_header header; | ||
375 | struct btrfs_item items[]; | ||
376 | } __attribute__ ((__packed__)); | ||
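A hedged sketch of how offset/size in struct btrfs_item address that data area, matching what leaf_data_end and btrfs_item_end_nr in ctree.c compute: item i's data occupies bytes [offset, offset + size) relative to the start of the data area, and the lowest offset among the items marks the data end. The helper names are illustrative only.

/* byte range of item i's data, relative to btrfs_leaf_data(leaf) */
static inline u32 example_item_data_start(struct btrfs_leaf *l, int i)
{
	return le32_to_cpu(l->items[i].offset);
}

static inline u32 example_item_data_end(struct btrfs_leaf *l, int i)
{
	return le32_to_cpu(l->items[i].offset) +
	       le32_to_cpu(l->items[i].size);
}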
377 | |||
378 | /* | ||
379 | * all non-leaf blocks are nodes, they hold only keys and pointers to | ||
380 | * other blocks | ||
381 | */ | ||
382 | struct btrfs_key_ptr { | ||
383 | struct btrfs_disk_key key; | ||
384 | __le64 blockptr; | ||
385 | __le64 generation; | ||
386 | } __attribute__ ((__packed__)); | ||
387 | |||
388 | struct btrfs_node { | ||
389 | struct btrfs_header header; | ||
390 | struct btrfs_key_ptr ptrs[]; | ||
391 | } __attribute__ ((__packed__)); | ||
392 | |||
393 | /* | ||
394 | * btrfs_paths remember the path taken from the root down to the leaf. | ||
395 | * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point | ||
396 | * to any other levels that are present. | ||
397 | * | ||
398 | * The slots array records the index of the item or block pointer | ||
399 | * used while walking the tree. | ||
400 | */ | ||
401 | struct btrfs_path { | ||
402 | struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; | ||
403 | int slots[BTRFS_MAX_LEVEL]; | ||
404 | /* if there is real range locking, this locks field will change */ | ||
405 | int locks[BTRFS_MAX_LEVEL]; | ||
406 | int reada; | ||
407 | /* keep some upper locks as we walk down */ | ||
408 | int keep_locks; | ||
409 | int skip_locking; | ||
410 | int lowest_level; | ||
411 | |||
412 | /* | ||
413 | * set by btrfs_split_item, tells search_slot to keep all locks | ||
414 | * and to force calls to keep space in the nodes | ||
415 | */ | ||
416 | int search_for_split; | ||
417 | }; | ||
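A hedged sketch of the life cycle this structure implies: allocate, let btrfs_search_slot populate nodes[]/slots[]/locks[], read what is needed, then free (which also drops references and locks). This is the same read-side pattern btrfs_insert_item in ctree.c uses; the function name is illustrative.

static int example_lookup(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* no trans, no insertion reserve, no cow: a read-only search */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		leaf = path->nodes[0];	/* level 0 is always the leaf */
		/* the item lives at path->slots[0] inside that leaf */
	}
	btrfs_free_path(path);	/* releases the buffers and locks too */
	return ret;
}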
418 | |||
419 | /* | ||
420 | * items in the extent btree are used to record the objectid of the | ||
421 | * owner of the block and the number of references | ||
422 | */ | ||
423 | struct btrfs_extent_item { | ||
424 | __le32 refs; | ||
425 | } __attribute__ ((__packed__)); | ||
426 | |||
427 | struct btrfs_extent_ref { | ||
428 | __le64 root; | ||
429 | __le64 generation; | ||
430 | __le64 objectid; | ||
431 | __le32 num_refs; | ||
432 | } __attribute__ ((__packed__)); | ||
433 | |||
434 | /* dev extents record free space on individual devices. The owner | ||
435 | * field points back to the chunk allocation mapping tree that allocated | ||
436 | * the extent. The chunk tree uuid field is a way to double check the owner | ||
437 | */ | ||
438 | struct btrfs_dev_extent { | ||
439 | __le64 chunk_tree; | ||
440 | __le64 chunk_objectid; | ||
441 | __le64 chunk_offset; | ||
442 | __le64 length; | ||
443 | u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; | ||
444 | } __attribute__ ((__packed__)); | ||
445 | |||
446 | struct btrfs_inode_ref { | ||
447 | __le64 index; | ||
448 | __le16 name_len; | ||
449 | /* name goes here */ | ||
450 | } __attribute__ ((__packed__)); | ||
451 | |||
452 | struct btrfs_timespec { | ||
453 | __le64 sec; | ||
454 | __le32 nsec; | ||
455 | } __attribute__ ((__packed__)); | ||
456 | |||
457 | typedef enum { | ||
458 | BTRFS_COMPRESS_NONE = 0, | ||
459 | BTRFS_COMPRESS_ZLIB = 1, | ||
460 | BTRFS_COMPRESS_LAST = 2, | ||
461 | } btrfs_compression_type; | ||
462 | |||
463 | /* we don't understand any encryption methods right now */ | ||
464 | typedef enum { | ||
465 | BTRFS_ENCRYPTION_NONE = 0, | ||
466 | BTRFS_ENCRYPTION_LAST = 1, | ||
467 | } btrfs_encryption_type; | ||
468 | |||
469 | struct btrfs_inode_item { | ||
470 | /* nfs style generation number */ | ||
471 | __le64 generation; | ||
472 | /* transid that last touched this inode */ | ||
473 | __le64 transid; | ||
474 | __le64 size; | ||
475 | __le64 nbytes; | ||
476 | __le64 block_group; | ||
477 | __le32 nlink; | ||
478 | __le32 uid; | ||
479 | __le32 gid; | ||
480 | __le32 mode; | ||
481 | __le64 rdev; | ||
482 | __le64 flags; | ||
483 | |||
484 | /* modification sequence number for NFS */ | ||
485 | __le64 sequence; | ||
486 | |||
487 | /* | ||
488 | * a little future expansion, for more than this we can | ||
489 | * just grow the inode item and version it | ||
490 | */ | ||
491 | __le64 reserved[4]; | ||
492 | struct btrfs_timespec atime; | ||
493 | struct btrfs_timespec ctime; | ||
494 | struct btrfs_timespec mtime; | ||
495 | struct btrfs_timespec otime; | ||
496 | } __attribute__ ((__packed__)); | ||
497 | |||
498 | struct btrfs_dir_log_item { | ||
499 | __le64 end; | ||
500 | } __attribute__ ((__packed__)); | ||
501 | |||
502 | struct btrfs_dir_item { | ||
503 | struct btrfs_disk_key location; | ||
504 | __le64 transid; | ||
505 | __le16 data_len; | ||
506 | __le16 name_len; | ||
507 | u8 type; | ||
508 | } __attribute__ ((__packed__)); | ||
509 | |||
510 | struct btrfs_root_item { | ||
511 | struct btrfs_inode_item inode; | ||
512 | __le64 generation; | ||
513 | __le64 root_dirid; | ||
514 | __le64 bytenr; | ||
515 | __le64 byte_limit; | ||
516 | __le64 bytes_used; | ||
517 | __le64 last_snapshot; | ||
518 | __le64 flags; | ||
519 | __le32 refs; | ||
520 | struct btrfs_disk_key drop_progress; | ||
521 | u8 drop_level; | ||
522 | u8 level; | ||
523 | } __attribute__ ((__packed__)); | ||
524 | |||
525 | /* | ||
526 | * this is used for both forward and backward root refs | ||
527 | */ | ||
528 | struct btrfs_root_ref { | ||
529 | __le64 dirid; | ||
530 | __le64 sequence; | ||
531 | __le16 name_len; | ||
532 | } __attribute__ ((__packed__)); | ||
533 | |||
534 | #define BTRFS_FILE_EXTENT_INLINE 0 | ||
535 | #define BTRFS_FILE_EXTENT_REG 1 | ||
536 | #define BTRFS_FILE_EXTENT_PREALLOC 2 | ||
537 | |||
538 | struct btrfs_file_extent_item { | ||
539 | /* | ||
540 | * transaction id that created this extent | ||
541 | */ | ||
542 | __le64 generation; | ||
543 | /* | ||
544 | * max number of bytes to hold this extent in ram. | ||
545 | * When we split a compressed extent we can't know how big | ||
546 | * each of the resulting pieces will be, so this is | ||
547 | * an upper limit on the size of the extent in ram instead of | ||
548 | * an exact limit. | ||
549 | */ | ||
550 | __le64 ram_bytes; | ||
551 | |||
552 | /* | ||
553 | * 32 bits for the various ways we might encode the data, | ||
554 | * including compression and encryption. If any of these | ||
555 | * are set to something a given disk format doesn't understand | ||
556 | * it is treated like an incompat flag for reading and writing, | ||
557 | * but not for stat. | ||
558 | */ | ||
559 | u8 compression; | ||
560 | u8 encryption; | ||
561 | __le16 other_encoding; /* spare for later use */ | ||
562 | |||
563 | /* are we inline data or a real extent? */ | ||
564 | u8 type; | ||
565 | |||
566 | /* | ||
567 | * disk space consumed by the extent, checksum blocks are included | ||
568 | * in these numbers | ||
569 | */ | ||
570 | __le64 disk_bytenr; | ||
571 | __le64 disk_num_bytes; | ||
572 | /* | ||
573 | * the logical offset in file blocks (no csums) | ||
574 | * this extent record is for. This allows a file extent to point | ||
575 | * into the middle of an existing extent on disk, sharing it | ||
576 | * between two snapshots (useful if some bytes in the middle of the | ||
577 | * extent have changed) | ||
578 | */ | ||
579 | __le64 offset; | ||
580 | /* | ||
581 | * the logical number of file blocks (no csums included). This | ||
582 | * always reflects the size uncompressed and without encoding. | ||
583 | */ | ||
584 | __le64 num_bytes; | ||
585 | |||
586 | } __attribute__ ((__packed__)); | ||
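A hedged illustration of the offset/num_bytes pair above: two snapshots can reference the same on-disk extent while exposing different windows of it. The values below are made up purely for illustration.

static inline void example_shared_extent_windows(void)
{
	struct btrfs_file_extent_item a = {0}, b = {0};

	/* both items point at the same 4MB extent on disk */
	a.disk_bytenr = b.disk_bytenr = cpu_to_le64(1024 * 1024 * 1024);
	a.disk_num_bytes = b.disk_num_bytes = cpu_to_le64(4 * 1024 * 1024);

	/* snapshot A sees the first megabyte of it */
	a.offset = cpu_to_le64(0);
	a.num_bytes = cpu_to_le64(1 * 1024 * 1024);

	/* snapshot B, after a rewrite elsewhere, keeps bytes 2MB..3MB */
	b.offset = cpu_to_le64(2 * 1024 * 1024);
	b.num_bytes = cpu_to_le64(1 * 1024 * 1024);
}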
587 | |||
588 | struct btrfs_csum_item { | ||
589 | u8 csum; | ||
590 | } __attribute__ ((__packed__)); | ||
591 | |||
592 | /* different types of block groups (and chunks) */ | ||
593 | #define BTRFS_BLOCK_GROUP_DATA (1 << 0) | ||
594 | #define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) | ||
595 | #define BTRFS_BLOCK_GROUP_METADATA (1 << 2) | ||
596 | #define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) | ||
597 | #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) | ||
598 | #define BTRFS_BLOCK_GROUP_DUP (1 << 5) | ||
599 | #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) | ||
600 | |||
601 | struct btrfs_block_group_item { | ||
602 | __le64 used; | ||
603 | __le64 chunk_objectid; | ||
604 | __le64 flags; | ||
605 | } __attribute__ ((__packed__)); | ||
606 | |||
607 | struct btrfs_space_info { | ||
608 | u64 flags; | ||
609 | u64 total_bytes; | ||
610 | u64 bytes_used; | ||
611 | u64 bytes_pinned; | ||
612 | u64 bytes_reserved; | ||
613 | u64 bytes_readonly; | ||
614 | int full; | ||
615 | int force_alloc; | ||
616 | struct list_head list; | ||
617 | |||
618 | /* for block groups in our same type */ | ||
619 | struct list_head block_groups; | ||
620 | spinlock_t lock; | ||
621 | struct rw_semaphore groups_sem; | ||
622 | }; | ||
623 | |||
624 | struct btrfs_free_space { | ||
625 | struct rb_node bytes_index; | ||
626 | struct rb_node offset_index; | ||
627 | u64 offset; | ||
628 | u64 bytes; | ||
629 | }; | ||
630 | |||
631 | struct btrfs_block_group_cache { | ||
632 | struct btrfs_key key; | ||
633 | struct btrfs_block_group_item item; | ||
634 | spinlock_t lock; | ||
635 | struct mutex alloc_mutex; | ||
636 | struct mutex cache_mutex; | ||
637 | u64 pinned; | ||
638 | u64 reserved; | ||
639 | u64 flags; | ||
640 | int cached; | ||
641 | int ro; | ||
642 | int dirty; | ||
643 | |||
644 | struct btrfs_space_info *space_info; | ||
645 | |||
646 | /* free space cache stuff */ | ||
647 | struct rb_root free_space_bytes; | ||
648 | struct rb_root free_space_offset; | ||
649 | |||
650 | /* block group cache stuff */ | ||
651 | struct rb_node cache_node; | ||
652 | |||
653 | /* for block groups in the same raid type */ | ||
654 | struct list_head list; | ||
655 | |||
656 | /* usage count */ | ||
657 | atomic_t count; | ||
658 | }; | ||
659 | |||
660 | struct btrfs_leaf_ref_tree { | ||
661 | struct rb_root root; | ||
662 | struct list_head list; | ||
663 | spinlock_t lock; | ||
664 | }; | ||
665 | |||
666 | struct btrfs_device; | ||
667 | struct btrfs_fs_devices; | ||
668 | struct btrfs_fs_info { | ||
669 | u8 fsid[BTRFS_FSID_SIZE]; | ||
670 | u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; | ||
671 | struct btrfs_root *extent_root; | ||
672 | struct btrfs_root *tree_root; | ||
673 | struct btrfs_root *chunk_root; | ||
674 | struct btrfs_root *dev_root; | ||
675 | struct btrfs_root *fs_root; | ||
676 | struct btrfs_root *csum_root; | ||
677 | |||
678 | /* the log root tree is a directory of all the other log roots */ | ||
679 | struct btrfs_root *log_root_tree; | ||
680 | struct radix_tree_root fs_roots_radix; | ||
681 | |||
682 | /* block group cache stuff */ | ||
683 | spinlock_t block_group_cache_lock; | ||
684 | struct rb_root block_group_cache_tree; | ||
685 | |||
686 | struct extent_io_tree pinned_extents; | ||
687 | struct extent_io_tree pending_del; | ||
688 | struct extent_io_tree extent_ins; | ||
689 | |||
690 | /* logical->physical extent mapping */ | ||
691 | struct btrfs_mapping_tree mapping_tree; | ||
692 | |||
693 | u64 generation; | ||
694 | u64 last_trans_committed; | ||
695 | u64 last_trans_new_blockgroup; | ||
696 | u64 open_ioctl_trans; | ||
697 | unsigned long mount_opt; | ||
698 | u64 max_extent; | ||
699 | u64 max_inline; | ||
700 | u64 alloc_start; | ||
701 | struct btrfs_transaction *running_transaction; | ||
702 | wait_queue_head_t transaction_throttle; | ||
703 | wait_queue_head_t transaction_wait; | ||
704 | |||
705 | wait_queue_head_t async_submit_wait; | ||
706 | wait_queue_head_t tree_log_wait; | ||
707 | |||
708 | struct btrfs_super_block super_copy; | ||
709 | struct btrfs_super_block super_for_commit; | ||
710 | struct block_device *__bdev; | ||
711 | struct super_block *sb; | ||
712 | struct inode *btree_inode; | ||
713 | struct backing_dev_info bdi; | ||
714 | spinlock_t hash_lock; | ||
715 | struct mutex trans_mutex; | ||
716 | struct mutex tree_log_mutex; | ||
717 | struct mutex transaction_kthread_mutex; | ||
718 | struct mutex cleaner_mutex; | ||
719 | struct mutex extent_ins_mutex; | ||
720 | struct mutex pinned_mutex; | ||
721 | struct mutex chunk_mutex; | ||
722 | struct mutex drop_mutex; | ||
723 | struct mutex volume_mutex; | ||
724 | struct mutex tree_reloc_mutex; | ||
725 | struct list_head trans_list; | ||
726 | struct list_head hashers; | ||
727 | struct list_head dead_roots; | ||
728 | |||
729 | atomic_t nr_async_submits; | ||
730 | atomic_t async_submit_draining; | ||
731 | atomic_t nr_async_bios; | ||
732 | atomic_t async_delalloc_pages; | ||
733 | atomic_t tree_log_writers; | ||
734 | atomic_t tree_log_commit; | ||
735 | unsigned long tree_log_batch; | ||
736 | u64 tree_log_transid; | ||
737 | |||
738 | /* | ||
739 | * this is used by the balancing code to wait for all the pending | ||
740 | * ordered extents | ||
741 | */ | ||
742 | spinlock_t ordered_extent_lock; | ||
743 | struct list_head ordered_extents; | ||
744 | struct list_head delalloc_inodes; | ||
745 | |||
746 | /* | ||
747 | * there is a pool of worker threads for checksumming during writes | ||
748 | * and a pool for checksumming after reads. This is because readers | ||
749 | * can run with FS locks held, and the writers may be waiting for | ||
750 | * those locks. We don't want ordering in the pending list to cause | ||
751 | * deadlocks, and so the two are serviced separately. | ||
752 | * | ||
753 | * A third pool does submit_bio to avoid deadlocking with the other | ||
754 | * two | ||
755 | */ | ||
756 | struct btrfs_workers workers; | ||
757 | struct btrfs_workers delalloc_workers; | ||
758 | struct btrfs_workers endio_workers; | ||
759 | struct btrfs_workers endio_meta_workers; | ||
760 | struct btrfs_workers endio_meta_write_workers; | ||
761 | struct btrfs_workers endio_write_workers; | ||
762 | struct btrfs_workers submit_workers; | ||
763 | /* | ||
764 | * fixup workers take dirty pages that didn't properly go through | ||
765 | * the cow mechanism and make them safe to write. It happens | ||
766 | * for the sys_munmap function call path | ||
767 | */ | ||
768 | struct btrfs_workers fixup_workers; | ||
769 | struct task_struct *transaction_kthread; | ||
770 | struct task_struct *cleaner_kthread; | ||
771 | int thread_pool_size; | ||
772 | |||
773 | /* tree relocation relocated fields */ | ||
774 | struct list_head dead_reloc_roots; | ||
775 | struct btrfs_leaf_ref_tree reloc_ref_tree; | ||
776 | struct btrfs_leaf_ref_tree shared_ref_tree; | ||
777 | |||
778 | struct kobject super_kobj; | ||
779 | struct completion kobj_unregister; | ||
780 | int do_barriers; | ||
781 | int closing; | ||
782 | int log_root_recovering; | ||
783 | atomic_t throttles; | ||
784 | atomic_t throttle_gen; | ||
785 | |||
786 | u64 total_pinned; | ||
787 | struct list_head dirty_cowonly_roots; | ||
788 | |||
789 | struct btrfs_fs_devices *fs_devices; | ||
790 | struct list_head space_info; | ||
791 | spinlock_t delalloc_lock; | ||
792 | spinlock_t new_trans_lock; | ||
793 | u64 delalloc_bytes; | ||
794 | u64 last_alloc; | ||
795 | u64 last_data_alloc; | ||
796 | |||
797 | spinlock_t ref_cache_lock; | ||
798 | u64 total_ref_cache_size; | ||
799 | |||
800 | u64 avail_data_alloc_bits; | ||
801 | u64 avail_metadata_alloc_bits; | ||
802 | u64 avail_system_alloc_bits; | ||
803 | u64 data_alloc_profile; | ||
804 | u64 metadata_alloc_profile; | ||
805 | u64 system_alloc_profile; | ||
806 | |||
807 | void *bdev_holder; | ||
808 | }; | ||
809 | |||
810 | /* | ||
811 | * in-RAM representation of the tree. extent_root is used for all | ||
812 | * allocations and for the root of the extent tree itself. | ||
813 | */ | ||
814 | struct btrfs_dirty_root; | ||
815 | struct btrfs_root { | ||
816 | struct extent_buffer *node; | ||
817 | |||
818 | /* the node lock is held while changing the node pointer */ | ||
819 | spinlock_t node_lock; | ||
820 | |||
821 | struct extent_buffer *commit_root; | ||
822 | struct btrfs_leaf_ref_tree *ref_tree; | ||
823 | struct btrfs_leaf_ref_tree ref_tree_struct; | ||
824 | struct btrfs_dirty_root *dirty_root; | ||
825 | struct btrfs_root *log_root; | ||
826 | struct btrfs_root *reloc_root; | ||
827 | |||
828 | struct btrfs_root_item root_item; | ||
829 | struct btrfs_key root_key; | ||
830 | struct btrfs_fs_info *fs_info; | ||
831 | struct extent_io_tree dirty_log_pages; | ||
832 | |||
833 | struct kobject root_kobj; | ||
834 | struct completion kobj_unregister; | ||
835 | struct mutex objectid_mutex; | ||
836 | struct mutex log_mutex; | ||
837 | |||
838 | u64 objectid; | ||
839 | u64 last_trans; | ||
840 | |||
841 | /* data allocations are done in sectorsize units */ | ||
842 | u32 sectorsize; | ||
843 | |||
844 | /* node allocations are done in nodesize units */ | ||
845 | u32 nodesize; | ||
846 | |||
847 | /* leaf allocations are done in leafsize units */ | ||
848 | u32 leafsize; | ||
849 | |||
850 | u32 stripesize; | ||
851 | |||
852 | u32 type; | ||
853 | u64 highest_inode; | ||
854 | u64 last_inode_alloc; | ||
855 | int ref_cows; | ||
856 | int track_dirty; | ||
857 | u64 defrag_trans_start; | ||
858 | struct btrfs_key defrag_progress; | ||
859 | struct btrfs_key defrag_max; | ||
860 | int defrag_running; | ||
861 | int defrag_level; | ||
862 | char *name; | ||
863 | int in_sysfs; | ||
864 | |||
865 | /* the dirty list is only used by non-reference counted roots */ | ||
866 | struct list_head dirty_list; | ||
867 | |||
868 | spinlock_t list_lock; | ||
869 | struct list_head dead_list; | ||
870 | struct list_head orphan_list; | ||
871 | |||
872 | /* | ||
873 | * right now this just gets used so that a root has its own devid | ||
874 | * for stat. It may be used for more later | ||
875 | */ | ||
876 | struct super_block anon_super; | ||
877 | }; | ||
878 | |||
879 | /* | ||
881 | * inode items have the data typically returned from stat and store other | ||
882 | * info about object characteristics. There is one for every file and dir in | ||
883 | * the FS | ||
884 | */ | ||
885 | #define BTRFS_INODE_ITEM_KEY 1 | ||
886 | #define BTRFS_INODE_REF_KEY 12 | ||
887 | #define BTRFS_XATTR_ITEM_KEY 24 | ||
888 | #define BTRFS_ORPHAN_ITEM_KEY 48 | ||
889 | /* reserve 2-15 close to the inode for later flexibility */ | ||
890 | |||
891 | /* | ||
892 | * dir items are the name -> inode pointers in a directory. There is one | ||
893 | * for every name in a directory. | ||
894 | */ | ||
895 | #define BTRFS_DIR_LOG_ITEM_KEY 60 | ||
896 | #define BTRFS_DIR_LOG_INDEX_KEY 72 | ||
897 | #define BTRFS_DIR_ITEM_KEY 84 | ||
898 | #define BTRFS_DIR_INDEX_KEY 96 | ||
899 | /* | ||
900 | * extent data is for file data | ||
901 | */ | ||
902 | #define BTRFS_EXTENT_DATA_KEY 108 | ||
903 | |||
904 | /* | ||
905 | * extent csums are stored in a separate tree and hold csums for | ||
906 | * an entire extent on disk. | ||
907 | */ | ||
908 | #define BTRFS_EXTENT_CSUM_KEY 128 | ||
909 | |||
910 | /* | ||
911 | * root items point to tree roots. They are typically in the root | ||
912 | * tree, used by the super block to find all the other trees. | ||
913 | */ | ||
914 | #define BTRFS_ROOT_ITEM_KEY 132 | ||
915 | |||
916 | /* | ||
917 | * root backrefs tie subvols and snapshots to the directory entries that | ||
918 | * reference them | ||
919 | */ | ||
920 | #define BTRFS_ROOT_BACKREF_KEY 144 | ||
921 | |||
922 | /* | ||
923 | * root refs make a fast index for listing all of the snapshots and | ||
924 | * subvolumes referenced by a given root. They point directly to the | ||
925 | * directory item in the root that references the subvol | ||
926 | */ | ||
927 | #define BTRFS_ROOT_REF_KEY 156 | ||
928 | |||
929 | /* | ||
930 | * extent items are in the extent map tree. These record which blocks | ||
931 | * are used, and how many references there are to each block | ||
932 | */ | ||
933 | #define BTRFS_EXTENT_ITEM_KEY 168 | ||
934 | #define BTRFS_EXTENT_REF_KEY 180 | ||
935 | |||
936 | /* | ||
937 | * block groups give us hints into the extent allocation trees: which | ||
938 | * blocks are free, and so on | ||
939 | */ | ||
940 | #define BTRFS_BLOCK_GROUP_ITEM_KEY 192 | ||
941 | |||
942 | #define BTRFS_DEV_EXTENT_KEY 204 | ||
943 | #define BTRFS_DEV_ITEM_KEY 216 | ||
944 | #define BTRFS_CHUNK_ITEM_KEY 228 | ||
945 | |||
946 | /* | ||
947 | * string items are for debugging. They just store a short string of | ||
948 | * data in the FS | ||
949 | */ | ||
950 | #define BTRFS_STRING_ITEM_KEY 253 | ||
951 | |||
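/*
 * A sketch, for illustration only: the btrfs_key a caller would fill
 * in before handing it to btrfs_search_slot() to find the inode item
 * for a given objectid. example_inode_key is hypothetical; a
 * btrfs_set_key_type() helper also appears further down this header.
 */
static inline void example_inode_key(struct btrfs_key *key, u64 objectid)
{
	key->objectid = objectid;
	key->type = BTRFS_INODE_ITEM_KEY;
	key->offset = 0;
}
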
952 | #define BTRFS_MOUNT_NODATASUM (1 << 0) | ||
953 | #define BTRFS_MOUNT_NODATACOW (1 << 1) | ||
954 | #define BTRFS_MOUNT_NOBARRIER (1 << 2) | ||
955 | #define BTRFS_MOUNT_SSD (1 << 3) | ||
956 | #define BTRFS_MOUNT_DEGRADED (1 << 4) | ||
957 | #define BTRFS_MOUNT_COMPRESS (1 << 5) | ||
958 | |||
959 | #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) | ||
960 | #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) | ||
961 | #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ | ||
962 | BTRFS_MOUNT_##opt) | ||
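
/*
 * Usage sketch (example_apply_ssd_opt is hypothetical): mount option
 * parsing flips these bits in fs_info->mount_opt, and the rest of the
 * code tests them through the root.
 */
static inline void example_apply_ssd_opt(struct btrfs_root *root, int on)
{
	if (on)
		btrfs_set_opt(root->fs_info->mount_opt, SSD);
	else
		btrfs_clear_opt(root->fs_info->mount_opt, SSD);

	if (btrfs_test_opt(root, SSD))
		; /* allocator may prefer SSD-friendly clustering */
}
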
963 | /* | ||
964 | * Inode flags | ||
965 | */ | ||
966 | #define BTRFS_INODE_NODATASUM (1 << 0) | ||
967 | #define BTRFS_INODE_NODATACOW (1 << 1) | ||
968 | #define BTRFS_INODE_READONLY (1 << 2) | ||
969 | #define BTRFS_INODE_NOCOMPRESS (1 << 3) | ||
970 | #define BTRFS_INODE_PREALLOC (1 << 4) | ||
971 | #define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ | ||
972 | ~BTRFS_INODE_##flag) | ||
973 | #define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ | ||
974 | BTRFS_INODE_##flag) | ||
975 | #define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ | ||
976 | BTRFS_INODE_##flag) | ||
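
/*
 * A minimal sketch (example_can_nocow is hypothetical; BTRFS_I() comes
 * from btrfs_inode.h): per-inode flags are tested the same way, e.g.
 * before deciding whether a write may skip cow.
 */
static inline int example_can_nocow(struct inode *inode)
{
	return btrfs_test_flag(inode, NODATACOW);
}
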
977 | /* some macros to generate set/get funcs for the struct fields. This | ||
978 | * assumes there is a lefoo_to_cpu for every type, so let's make a simple | ||
979 | * one for u8: | ||
980 | */ | ||
981 | #define le8_to_cpu(v) (v) | ||
982 | #define cpu_to_le8(v) (v) | ||
983 | #define __le8 u8 | ||
984 | |||
985 | #define read_eb_member(eb, ptr, type, member, result) ( \ | ||
986 | read_extent_buffer(eb, (char *)(result), \ | ||
987 | ((unsigned long)(ptr)) + \ | ||
988 | offsetof(type, member), \ | ||
989 | sizeof(((type *)0)->member))) | ||
990 | |||
991 | #define write_eb_member(eb, ptr, type, member, result) ( \ | ||
992 | write_extent_buffer(eb, (char *)(result), \ | ||
993 | ((unsigned long)(ptr)) + \ | ||
994 | offsetof(type, member), \ | ||
995 | sizeof(((type *)0)->member))) | ||
996 | |||
997 | #ifndef BTRFS_SETGET_FUNCS | ||
998 | #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ | ||
999 | u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ | ||
1000 | void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); | ||
1001 | #endif | ||
1002 | |||
1003 | #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ | ||
1004 | static inline u##bits btrfs_##name(struct extent_buffer *eb) \ | ||
1005 | { \ | ||
1006 | type *p = kmap_atomic(eb->first_page, KM_USER0); \ | ||
1007 | u##bits res = le##bits##_to_cpu(p->member); \ | ||
1008 | kunmap_atomic(p, KM_USER0); \ | ||
1009 | return res; \ | ||
1010 | } \ | ||
1011 | static inline void btrfs_set_##name(struct extent_buffer *eb, \ | ||
1012 | u##bits val) \ | ||
1013 | { \ | ||
1014 | type *p = kmap_atomic(eb->first_page, KM_USER0); \ | ||
1015 | p->member = cpu_to_le##bits(val); \ | ||
1016 | kunmap_atomic(p, KM_USER0); \ | ||
1017 | } | ||
1018 | |||
1019 | #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ | ||
1020 | static inline u##bits btrfs_##name(type *s) \ | ||
1021 | { \ | ||
1022 | return le##bits##_to_cpu(s->member); \ | ||
1023 | } \ | ||
1024 | static inline void btrfs_set_##name(type *s, u##bits val) \ | ||
1025 | { \ | ||
1026 | s->member = cpu_to_le##bits(val); \ | ||
1027 | } | ||
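
/*
 * For illustration, BTRFS_SETGET_STACK_FUNCS(stack_device_type,
 * struct btrfs_dev_item, type, 64) expands to roughly:
 *
 *	static inline u64 btrfs_stack_device_type(struct btrfs_dev_item *s)
 *	{
 *		return le64_to_cpu(s->type);
 *	}
 *	static inline void btrfs_set_stack_device_type(
 *				struct btrfs_dev_item *s, u64 val)
 *	{
 *		s->type = cpu_to_le64(val);
 *	}
 *
 * i.e. the stack variants work on structs in ordinary memory, while
 * plain BTRFS_SETGET_FUNCS read and write through an extent_buffer's
 * pages.
 */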
1028 | |||
1029 | BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); | ||
1030 | BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); | ||
1031 | BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); | ||
1032 | BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); | ||
1033 | BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); | ||
1034 | BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, | ||
1035 | start_offset, 64); | ||
1036 | BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); | ||
1037 | BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); | ||
1038 | BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); | ||
1039 | BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); | ||
1040 | BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); | ||
1041 | BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); | ||
1042 | |||
1043 | BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); | ||
1044 | BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, | ||
1045 | total_bytes, 64); | ||
1046 | BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, | ||
1047 | bytes_used, 64); | ||
1048 | BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, | ||
1049 | io_align, 32); | ||
1050 | BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, | ||
1051 | io_width, 32); | ||
1052 | BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, | ||
1053 | sector_size, 32); | ||
1054 | BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); | ||
1055 | BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, | ||
1056 | dev_group, 32); | ||
1057 | BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, | ||
1058 | seek_speed, 8); | ||
1059 | BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, | ||
1060 | bandwidth, 8); | ||
1061 | BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, | ||
1062 | generation, 64); | ||
1063 | |||
1064 | static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) | ||
1065 | { | ||
1066 | return (char *)d + offsetof(struct btrfs_dev_item, uuid); | ||
1067 | } | ||
1068 | |||
1069 | static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) | ||
1070 | { | ||
1071 | return (char *)d + offsetof(struct btrfs_dev_item, fsid); | ||
1072 | } | ||
1073 | |||
1074 | BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); | ||
1075 | BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); | ||
1076 | BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); | ||
1077 | BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); | ||
1078 | BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); | ||
1079 | BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); | ||
1080 | BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); | ||
1081 | BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); | ||
1082 | BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); | ||
1083 | BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); | ||
1084 | BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); | ||
1085 | |||
1086 | static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) | ||
1087 | { | ||
1088 | return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); | ||
1089 | } | ||
1090 | |||
1091 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); | ||
1092 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); | ||
1093 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, | ||
1094 | stripe_len, 64); | ||
1095 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, | ||
1096 | io_align, 32); | ||
1097 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, | ||
1098 | io_width, 32); | ||
1099 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, | ||
1100 | sector_size, 32); | ||
1101 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); | ||
1102 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, | ||
1103 | num_stripes, 16); | ||
1104 | BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, | ||
1105 | sub_stripes, 16); | ||
1106 | BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); | ||
1107 | BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); | ||
1108 | |||
1109 | static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, | ||
1110 | int nr) | ||
1111 | { | ||
1112 | unsigned long offset = (unsigned long)c; | ||
1113 | offset += offsetof(struct btrfs_chunk, stripe); | ||
1114 | offset += nr * sizeof(struct btrfs_stripe); | ||
1115 | return (struct btrfs_stripe *)offset; | ||
1116 | } | ||
1117 | |||
1118 | static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) | ||
1119 | { | ||
1120 | return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); | ||
1121 | } | ||
1122 | |||
1123 | static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, | ||
1124 | struct btrfs_chunk *c, int nr) | ||
1125 | { | ||
1126 | return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); | ||
1127 | } | ||
1128 | |||
1129 | static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, | ||
1130 | struct btrfs_chunk *c, int nr, | ||
1131 | u64 val) | ||
1132 | { | ||
1133 | btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); | ||
1134 | } | ||
1135 | |||
1136 | static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, | ||
1137 | struct btrfs_chunk *c, int nr) | ||
1138 | { | ||
1139 | return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); | ||
1140 | } | ||
1141 | |||
1142 | static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, | ||
1143 | struct btrfs_chunk *c, int nr, | ||
1144 | u64 val) | ||
1145 | { | ||
1146 | btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); | ||
1147 | } | ||
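
/*
 * Usage sketch (example_for_each_stripe is hypothetical): walking the
 * variable-length stripe array at the end of a chunk item that lives
 * inside an extent buffer.
 */
static inline void example_for_each_stripe(struct extent_buffer *eb,
					   struct btrfs_chunk *chunk)
{
	int i;
	int num = btrfs_chunk_num_stripes(eb, chunk);

	for (i = 0; i < num; i++) {
		u64 devid = btrfs_stripe_devid_nr(eb, chunk, i);
		u64 physical = btrfs_stripe_offset_nr(eb, chunk, i);
		/* one mirror of this chunk lives at (devid, physical) */
		(void)devid;
		(void)physical;
	}
}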
1148 | |||
1149 | /* struct btrfs_block_group_item */ | ||
1150 | BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, | ||
1151 | used, 64); | ||
1152 | BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, | ||
1153 | used, 64); | ||
1154 | BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, | ||
1155 | struct btrfs_block_group_item, chunk_objectid, 64); | ||
1156 | |||
1157 | BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, | ||
1158 | struct btrfs_block_group_item, chunk_objectid, 64); | ||
1159 | BTRFS_SETGET_FUNCS(disk_block_group_flags, | ||
1160 | struct btrfs_block_group_item, flags, 64); | ||
1161 | BTRFS_SETGET_STACK_FUNCS(block_group_flags, | ||
1162 | struct btrfs_block_group_item, flags, 64); | ||
1163 | |||
1164 | /* struct btrfs_inode_ref */ | ||
1165 | BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); | ||
1166 | BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); | ||
1167 | |||
1168 | /* struct btrfs_inode_item */ | ||
1169 | BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); | ||
1170 | BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); | ||
1171 | BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); | ||
1172 | BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); | ||
1173 | BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); | ||
1174 | BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); | ||
1175 | BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); | ||
1176 | BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); | ||
1177 | BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); | ||
1178 | BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); | ||
1179 | BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); | ||
1180 | BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); | ||
1181 | |||
1182 | static inline struct btrfs_timespec * | ||
1183 | btrfs_inode_atime(struct btrfs_inode_item *inode_item) | ||
1184 | { | ||
1185 | unsigned long ptr = (unsigned long)inode_item; | ||
1186 | ptr += offsetof(struct btrfs_inode_item, atime); | ||
1187 | return (struct btrfs_timespec *)ptr; | ||
1188 | } | ||
1189 | |||
1190 | static inline struct btrfs_timespec * | ||
1191 | btrfs_inode_mtime(struct btrfs_inode_item *inode_item) | ||
1192 | { | ||
1193 | unsigned long ptr = (unsigned long)inode_item; | ||
1194 | ptr += offsetof(struct btrfs_inode_item, mtime); | ||
1195 | return (struct btrfs_timespec *)ptr; | ||
1196 | } | ||
1197 | |||
1198 | static inline struct btrfs_timespec * | ||
1199 | btrfs_inode_ctime(struct btrfs_inode_item *inode_item) | ||
1200 | { | ||
1201 | unsigned long ptr = (unsigned long)inode_item; | ||
1202 | ptr += offsetof(struct btrfs_inode_item, ctime); | ||
1203 | return (struct btrfs_timespec *)ptr; | ||
1204 | } | ||
1205 | |||
1206 | static inline struct btrfs_timespec * | ||
1207 | btrfs_inode_otime(struct btrfs_inode_item *inode_item) | ||
1208 | { | ||
1209 | unsigned long ptr = (unsigned long)inode_item; | ||
1210 | ptr += offsetof(struct btrfs_inode_item, otime); | ||
1211 | return (struct btrfs_timespec *)ptr; | ||
1212 | } | ||
1213 | |||
1214 | BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); | ||
1215 | BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); | ||
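
/*
 * A sketch under the same conventions (example_stamp_mtime is
 * hypothetical): the accessors above return pointers into the item,
 * which the timespec setget helpers then write through the extent
 * buffer.
 */
static inline void example_stamp_mtime(struct extent_buffer *leaf,
				       struct btrfs_inode_item *item,
				       u64 sec, u32 nsec)
{
	struct btrfs_timespec *ts = btrfs_inode_mtime(item);

	btrfs_set_timespec_sec(leaf, ts, sec);
	btrfs_set_timespec_nsec(leaf, ts, nsec);
}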
1216 | |||
1217 | /* struct btrfs_dev_extent */ | ||
1218 | BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, | ||
1219 | chunk_tree, 64); | ||
1220 | BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, | ||
1221 | chunk_objectid, 64); | ||
1222 | BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, | ||
1223 | chunk_offset, 64); | ||
1224 | BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); | ||
1225 | |||
1226 | static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) | ||
1227 | { | ||
1228 | unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); | ||
1229 | return (u8 *)((unsigned long)dev + ptr); | ||
1230 | } | ||
1231 | |||
1232 | /* struct btrfs_extent_ref */ | ||
1233 | BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); | ||
1234 | BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); | ||
1235 | BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); | ||
1236 | BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); | ||
1237 | |||
1238 | BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); | ||
1239 | BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, | ||
1240 | generation, 64); | ||
1241 | BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, | ||
1242 | objectid, 64); | ||
1243 | BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, | ||
1244 | num_refs, 32); | ||
1245 | |||
1246 | /* struct btrfs_extent_item */ | ||
1247 | BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); | ||
1248 | BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, | ||
1249 | refs, 32); | ||
1250 | |||
1251 | /* struct btrfs_node */ | ||
1252 | BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); | ||
1253 | BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); | ||
1254 | |||
1255 | static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) | ||
1256 | { | ||
1257 | unsigned long ptr; | ||
1258 | ptr = offsetof(struct btrfs_node, ptrs) + | ||
1259 | sizeof(struct btrfs_key_ptr) * nr; | ||
1260 | return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); | ||
1261 | } | ||
1262 | |||
1263 | static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, | ||
1264 | int nr, u64 val) | ||
1265 | { | ||
1266 | unsigned long ptr; | ||
1267 | ptr = offsetof(struct btrfs_node, ptrs) + | ||
1268 | sizeof(struct btrfs_key_ptr) * nr; | ||
1269 | btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); | ||
1270 | } | ||
1271 | |||
1272 | static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) | ||
1273 | { | ||
1274 | unsigned long ptr; | ||
1275 | ptr = offsetof(struct btrfs_node, ptrs) + | ||
1276 | sizeof(struct btrfs_key_ptr) * nr; | ||
1277 | return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); | ||
1278 | } | ||
1279 | |||
1280 | static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, | ||
1281 | int nr, u64 val) | ||
1282 | { | ||
1283 | unsigned long ptr; | ||
1284 | ptr = offsetof(struct btrfs_node, ptrs) + | ||
1285 | sizeof(struct btrfs_key_ptr) * nr; | ||
1286 | btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); | ||
1287 | } | ||
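
/*
 * Reading sketch (example_read_child is hypothetical): interior nodes
 * are arrays of key_ptrs, and these helpers fetch one child's disk
 * address plus the generation it was written with.
 */
static inline void example_read_child(struct extent_buffer *node, int nr,
				      u64 *bytenr, u64 *gen)
{
	*bytenr = btrfs_node_blockptr(node, nr);
	*gen = btrfs_node_ptr_generation(node, nr);
}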
1288 | |||
1289 | static inline unsigned long btrfs_node_key_ptr_offset(int nr) | ||
1290 | { | ||
1291 | return offsetof(struct btrfs_node, ptrs) + | ||
1292 | sizeof(struct btrfs_key_ptr) * nr; | ||
1293 | } | ||
1294 | |||
1295 | void btrfs_node_key(struct extent_buffer *eb, | ||
1296 | struct btrfs_disk_key *disk_key, int nr); | ||
1297 | |||
1298 | static inline void btrfs_set_node_key(struct extent_buffer *eb, | ||
1299 | struct btrfs_disk_key *disk_key, int nr) | ||
1300 | { | ||
1301 | unsigned long ptr; | ||
1302 | ptr = btrfs_node_key_ptr_offset(nr); | ||
1303 | write_eb_member(eb, (struct btrfs_key_ptr *)ptr, | ||
1304 | struct btrfs_key_ptr, key, disk_key); | ||
1305 | } | ||
1306 | |||
1307 | /* struct btrfs_item */ | ||
1308 | BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); | ||
1309 | BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); | ||
1310 | |||
1311 | static inline unsigned long btrfs_item_nr_offset(int nr) | ||
1312 | { | ||
1313 | return offsetof(struct btrfs_leaf, items) + | ||
1314 | sizeof(struct btrfs_item) * nr; | ||
1315 | } | ||
1316 | |||
1317 | static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, | ||
1318 | int nr) | ||
1319 | { | ||
1320 | return (struct btrfs_item *)btrfs_item_nr_offset(nr); | ||
1321 | } | ||
1322 | |||
1323 | static inline u32 btrfs_item_end(struct extent_buffer *eb, | ||
1324 | struct btrfs_item *item) | ||
1325 | { | ||
1326 | return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); | ||
1327 | } | ||
1328 | |||
1329 | static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) | ||
1330 | { | ||
1331 | return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); | ||
1332 | } | ||
1333 | |||
1334 | static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) | ||
1335 | { | ||
1336 | return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); | ||
1337 | } | ||
1338 | |||
1339 | static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) | ||
1340 | { | ||
1341 | return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); | ||
1342 | } | ||
1343 | |||
1344 | static inline void btrfs_item_key(struct extent_buffer *eb, | ||
1345 | struct btrfs_disk_key *disk_key, int nr) | ||
1346 | { | ||
1347 | struct btrfs_item *item = btrfs_item_nr(eb, nr); | ||
1348 | read_eb_member(eb, item, struct btrfs_item, key, disk_key); | ||
1349 | } | ||
1350 | |||
1351 | static inline void btrfs_set_item_key(struct extent_buffer *eb, | ||
1352 | struct btrfs_disk_key *disk_key, int nr) | ||
1353 | { | ||
1354 | struct btrfs_item *item = btrfs_item_nr(eb, nr); | ||
1355 | write_eb_member(eb, item, struct btrfs_item, key, disk_key); | ||
1356 | } | ||
1357 | |||
1358 | BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); | ||
1359 | |||
1360 | /* | ||
1361 | * struct btrfs_root_ref | ||
1362 | */ | ||
1363 | BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); | ||
1364 | BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); | ||
1365 | BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); | ||
1366 | |||
1367 | /* struct btrfs_dir_item */ | ||
1368 | BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); | ||
1369 | BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); | ||
1370 | BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); | ||
1371 | BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); | ||
1372 | |||
1373 | static inline void btrfs_dir_item_key(struct extent_buffer *eb, | ||
1374 | struct btrfs_dir_item *item, | ||
1375 | struct btrfs_disk_key *key) | ||
1376 | { | ||
1377 | read_eb_member(eb, item, struct btrfs_dir_item, location, key); | ||
1378 | } | ||
1379 | |||
1380 | static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, | ||
1381 | struct btrfs_dir_item *item, | ||
1382 | struct btrfs_disk_key *key) | ||
1383 | { | ||
1384 | write_eb_member(eb, item, struct btrfs_dir_item, location, key); | ||
1385 | } | ||
1386 | |||
1387 | /* struct btrfs_disk_key */ | ||
1388 | BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, | ||
1389 | objectid, 64); | ||
1390 | BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); | ||
1391 | BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); | ||
1392 | |||
1393 | static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, | ||
1394 | struct btrfs_disk_key *disk) | ||
1395 | { | ||
1396 | cpu->offset = le64_to_cpu(disk->offset); | ||
1397 | cpu->type = disk->type; | ||
1398 | cpu->objectid = le64_to_cpu(disk->objectid); | ||
1399 | } | ||
1400 | |||
1401 | static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, | ||
1402 | struct btrfs_key *cpu) | ||
1403 | { | ||
1404 | disk->offset = cpu_to_le64(cpu->offset); | ||
1405 | disk->type = cpu->type; | ||
1406 | disk->objectid = cpu_to_le64(cpu->objectid); | ||
1407 | } | ||
1408 | |||
1409 | static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, | ||
1410 | struct btrfs_key *key, int nr) | ||
1411 | { | ||
1412 | struct btrfs_disk_key disk_key; | ||
1413 | btrfs_node_key(eb, &disk_key, nr); | ||
1414 | btrfs_disk_key_to_cpu(key, &disk_key); | ||
1415 | } | ||
1416 | |||
1417 | static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, | ||
1418 | struct btrfs_key *key, int nr) | ||
1419 | { | ||
1420 | struct btrfs_disk_key disk_key; | ||
1421 | btrfs_item_key(eb, &disk_key, nr); | ||
1422 | btrfs_disk_key_to_cpu(key, &disk_key); | ||
1423 | } | ||
1424 | |||
1425 | static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, | ||
1426 | struct btrfs_dir_item *item, | ||
1427 | struct btrfs_key *key) | ||
1428 | { | ||
1429 | struct btrfs_disk_key disk_key; | ||
1430 | btrfs_dir_item_key(eb, item, &disk_key); | ||
1431 | btrfs_disk_key_to_cpu(key, &disk_key); | ||
1432 | } | ||
1433 | |||
1435 | static inline u8 btrfs_key_type(struct btrfs_key *key) | ||
1436 | { | ||
1437 | return key->type; | ||
1438 | } | ||
1439 | |||
1440 | static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) | ||
1441 | { | ||
1442 | key->type = val; | ||
1443 | } | ||
1444 | |||
1445 | /* struct btrfs_header */ | ||
1446 | BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); | ||
1447 | BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, | ||
1448 | generation, 64); | ||
1449 | BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); | ||
1450 | BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); | ||
1451 | BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); | ||
1452 | BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); | ||
1453 | |||
1454 | static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) | ||
1455 | { | ||
1456 | return (btrfs_header_flags(eb) & flag) == flag; | ||
1457 | } | ||
1458 | |||
1459 | static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) | ||
1460 | { | ||
1461 | u64 flags = btrfs_header_flags(eb); | ||
1462 | btrfs_set_header_flags(eb, flags | flag); | ||
1463 | return (flags & flag) == flag; | ||
1464 | } | ||
1465 | |||
1466 | static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) | ||
1467 | { | ||
1468 | u64 flags = btrfs_header_flags(eb); | ||
1469 | btrfs_set_header_flags(eb, flags & ~flag); | ||
1470 | return (flags & flag) == flag; | ||
1471 | } | ||
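
/*
 * Note for illustration: set/clear return whether the flag was already
 * set before the call, so a caller can detect the first transition.
 * example_first_write is hypothetical and assumes the
 * BTRFS_HEADER_FLAG_WRITTEN bit defined earlier in this header.
 */
static inline int example_first_write(struct extent_buffer *eb)
{
	/* true only the first time the written bit goes from 0 to 1 */
	return !btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
}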
1472 | |||
1473 | static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) | ||
1474 | { | ||
1475 | unsigned long ptr = offsetof(struct btrfs_header, fsid); | ||
1476 | return (u8 *)ptr; | ||
1477 | } | ||
1478 | |||
1479 | static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) | ||
1480 | { | ||
1481 | unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); | ||
1482 | return (u8 *)ptr; | ||
1483 | } | ||
1484 | |||
1485 | static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) | ||
1486 | { | ||
1487 | unsigned long ptr = offsetof(struct btrfs_super_block, fsid); | ||
1488 | return (u8 *)ptr; | ||
1489 | } | ||
1490 | |||
1491 | static inline u8 *btrfs_header_csum(struct extent_buffer *eb) | ||
1492 | { | ||
1493 | unsigned long ptr = offsetof(struct btrfs_header, csum); | ||
1494 | return (u8 *)ptr; | ||
1495 | } | ||
1496 | |||
1497 | static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) | ||
1498 | { | ||
1499 | return NULL; | ||
1500 | } | ||
1501 | |||
1502 | static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) | ||
1503 | { | ||
1504 | return NULL; | ||
1505 | } | ||
1506 | |||
1507 | static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) | ||
1508 | { | ||
1509 | return NULL; | ||
1510 | } | ||
1511 | |||
1512 | static inline int btrfs_is_leaf(struct extent_buffer *eb) | ||
1513 | { | ||
1514 | return btrfs_header_level(eb) == 0; | ||
1515 | } | ||
1516 | |||
1517 | /* struct btrfs_root_item */ | ||
1518 | BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, | ||
1519 | generation, 64); | ||
1520 | BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); | ||
1521 | BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); | ||
1522 | BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); | ||
1523 | |||
1524 | BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, | ||
1525 | generation, 64); | ||
1526 | BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); | ||
1527 | BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); | ||
1528 | BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); | ||
1529 | BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); | ||
1530 | BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); | ||
1531 | BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); | ||
1532 | BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); | ||
1533 | BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, | ||
1534 | last_snapshot, 64); | ||
1535 | |||
1536 | /* struct btrfs_super_block */ | ||
1537 | |||
1538 | BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); | ||
1539 | BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); | ||
1540 | BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, | ||
1541 | generation, 64); | ||
1542 | BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); | ||
1543 | BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, | ||
1544 | struct btrfs_super_block, sys_chunk_array_size, 32); | ||
1545 | BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, | ||
1546 | struct btrfs_super_block, chunk_root_generation, 64); | ||
1547 | BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, | ||
1548 | root_level, 8); | ||
1549 | BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, | ||
1550 | chunk_root, 64); | ||
1551 | BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, | ||
1552 | chunk_root_level, 8); | ||
1553 | BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, | ||
1554 | log_root, 64); | ||
1555 | BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, | ||
1556 | log_root_transid, 64); | ||
1557 | BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, | ||
1558 | log_root_level, 8); | ||
1559 | BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, | ||
1560 | total_bytes, 64); | ||
1561 | BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, | ||
1562 | bytes_used, 64); | ||
1563 | BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, | ||
1564 | sectorsize, 32); | ||
1565 | BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, | ||
1566 | nodesize, 32); | ||
1567 | BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, | ||
1568 | leafsize, 32); | ||
1569 | BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, | ||
1570 | stripesize, 32); | ||
1571 | BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, | ||
1572 | root_dir_objectid, 64); | ||
1573 | BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, | ||
1574 | num_devices, 64); | ||
1575 | BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, | ||
1576 | compat_flags, 64); | ||
1577 | BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, | ||
1578 | compat_ro_flags, 64); | ||
1579 | BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, | ||
1580 | incompat_flags, 64); | ||
1581 | BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, | ||
1582 | csum_type, 16); | ||
1583 | |||
1584 | static inline int btrfs_super_csum_size(struct btrfs_super_block *s) | ||
1585 | { | ||
1586 | int t = btrfs_super_csum_type(s); | ||
1587 | BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); | ||
1588 | return btrfs_csum_sizes[t]; | ||
1589 | } | ||
1590 | |||
1591 | static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) | ||
1592 | { | ||
1593 | return offsetof(struct btrfs_leaf, items); | ||
1594 | } | ||
1595 | |||
1596 | /* struct btrfs_file_extent_item */ | ||
1597 | BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); | ||
1598 | |||
1599 | static inline unsigned long | ||
1600 | btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) | ||
1601 | { | ||
1602 | unsigned long offset = (unsigned long)e; | ||
1603 | offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); | ||
1604 | return offset; | ||
1605 | } | ||
1606 | |||
1607 | static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) | ||
1608 | { | ||
1609 | return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; | ||
1610 | } | ||
1611 | |||
1612 | BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, | ||
1613 | disk_bytenr, 64); | ||
1614 | BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, | ||
1615 | generation, 64); | ||
1616 | BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, | ||
1617 | disk_num_bytes, 64); | ||
1618 | BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, | ||
1619 | offset, 64); | ||
1620 | BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, | ||
1621 | num_bytes, 64); | ||
1622 | BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, | ||
1623 | ram_bytes, 64); | ||
1624 | BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, | ||
1625 | compression, 8); | ||
1626 | BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, | ||
1627 | encryption, 8); | ||
1628 | BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, | ||
1629 | other_encoding, 16); | ||
1630 | |||
1631 | /* this returns the number of file bytes represented by the inline item. | ||
1632 | * If an item is compressed, this is the uncompressed size | ||
1633 | */ | ||
1634 | static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, | ||
1635 | struct btrfs_file_extent_item *e) | ||
1636 | { | ||
1637 | return btrfs_file_extent_ram_bytes(eb, e); | ||
1638 | } | ||
1639 | |||
1640 | /* | ||
1641 | * this returns the number of bytes used by the item on disk, minus the | ||
1642 | * size of any extent headers. If a file is compressed on disk, this is | ||
1643 | * the compressed size | ||
1644 | */ | ||
1645 | static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, | ||
1646 | struct btrfs_item *e) | ||
1647 | { | ||
1648 | unsigned long offset; | ||
1649 | offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); | ||
1650 | return btrfs_item_size(eb, e) - offset; | ||
1651 | } | ||
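
/*
 * Illustration (example_inline_sizes is hypothetical): for a
 * compressed inline extent the two helpers above disagree, the item
 * len giving the stored (compressed) bytes and ram bytes giving the
 * logical file bytes they decompress to.
 */
static inline void example_inline_sizes(struct extent_buffer *leaf,
					struct btrfs_item *item,
					struct btrfs_file_extent_item *fi,
					u32 *disk_len, u32 *file_len)
{
	*disk_len = btrfs_file_extent_inline_item_len(leaf, item);
	*file_len = btrfs_file_extent_inline_len(leaf, fi);
}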
1652 | |||
1653 | static inline struct btrfs_root *btrfs_sb(struct super_block *sb) | ||
1654 | { | ||
1655 | return sb->s_fs_info; | ||
1656 | } | ||
1657 | |||
1658 | static inline int btrfs_set_root_name(struct btrfs_root *root, | ||
1659 | const char *name, int len) | ||
1660 | { | ||
1661 | /* if we already have a name just free it */ | ||
1662 | kfree(root->name); | ||
1663 | |||
1664 | root->name = kmalloc(len+1, GFP_KERNEL); | ||
1665 | if (!root->name) | ||
1666 | return -ENOMEM; | ||
1667 | |||
1668 | memcpy(root->name, name, len); | ||
1669 | root->name[len] = '\0'; | ||
1670 | |||
1671 | return 0; | ||
1672 | } | ||
1673 | |||
1674 | static inline u32 btrfs_level_size(struct btrfs_root *root, int level) | ||
1675 | { | ||
1676 | if (level == 0) | ||
1677 | return root->leafsize; | ||
1678 | return root->nodesize; | ||
1679 | } | ||
1680 | |||
1681 | /* helper function to cast into the data area of the leaf. */ | ||
1682 | #define btrfs_item_ptr(leaf, slot, type) \ | ||
1683 | ((type *)(btrfs_leaf_data(leaf) + \ | ||
1684 | btrfs_item_offset_nr(leaf, slot))) | ||
1685 | |||
1686 | #define btrfs_item_ptr_offset(leaf, slot) \ | ||
1687 | ((unsigned long)(btrfs_leaf_data(leaf) + \ | ||
1688 | btrfs_item_offset_nr(leaf, slot))) | ||
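
/*
 * Usage sketch (example_read_inode_size is hypothetical): btrfs_item_ptr
 * yields a typed "pointer" that is really an offset into the extent
 * buffer, which the setget helpers then dereference through its pages.
 */
static inline u64 example_read_inode_size(struct extent_buffer *leaf, int slot)
{
	struct btrfs_inode_item *ii;

	ii = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
	return btrfs_inode_size(leaf, ii);
}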
1689 | |||
1690 | static inline struct dentry *fdentry(struct file *file) | ||
1691 | { | ||
1692 | return file->f_path.dentry; | ||
1693 | } | ||
1694 | |||
1695 | /* extent-tree.c */ | ||
1696 | int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); | ||
1697 | int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, | ||
1698 | struct btrfs_root *root, u64 bytenr, | ||
1699 | u64 num_bytes, u32 *refs); | ||
1700 | int btrfs_update_pinned_extents(struct btrfs_root *root, | ||
1701 | u64 bytenr, u64 num, int pin); | ||
1702 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | ||
1703 | struct btrfs_root *root, struct extent_buffer *leaf); | ||
1704 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | ||
1705 | struct btrfs_root *root, u64 objectid, u64 bytenr); | ||
1706 | int btrfs_extent_post_op(struct btrfs_trans_handle *trans, | ||
1707 | struct btrfs_root *root); | ||
1708 | int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); | ||
1709 | struct btrfs_block_group_cache *btrfs_lookup_block_group( | ||
1710 | struct btrfs_fs_info *info, | ||
1711 | u64 bytenr); | ||
1712 | u64 btrfs_find_block_group(struct btrfs_root *root, | ||
1713 | u64 search_start, u64 search_hint, int owner); | ||
1714 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | ||
1715 | struct btrfs_root *root, | ||
1716 | u32 blocksize, u64 parent, | ||
1717 | u64 root_objectid, | ||
1718 | u64 ref_generation, | ||
1719 | int level, | ||
1720 | u64 hint, | ||
1721 | u64 empty_size); | ||
1722 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, | ||
1723 | struct btrfs_root *root, | ||
1724 | u64 bytenr, u32 blocksize); | ||
1725 | int btrfs_alloc_extent(struct btrfs_trans_handle *trans, | ||
1726 | struct btrfs_root *root, | ||
1727 | u64 num_bytes, u64 parent, u64 min_bytes, | ||
1728 | u64 root_objectid, u64 ref_generation, | ||
1729 | u64 owner, u64 empty_size, u64 hint_byte, | ||
1730 | u64 search_end, struct btrfs_key *ins, u64 data); | ||
1731 | int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | ||
1732 | struct btrfs_root *root, u64 parent, | ||
1733 | u64 root_objectid, u64 ref_generation, | ||
1734 | u64 owner, struct btrfs_key *ins); | ||
1735 | int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | ||
1736 | struct btrfs_root *root, u64 parent, | ||
1737 | u64 root_objectid, u64 ref_generation, | ||
1738 | u64 owner, struct btrfs_key *ins); | ||
1739 | int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | ||
1740 | struct btrfs_root *root, | ||
1741 | u64 num_bytes, u64 min_alloc_size, | ||
1742 | u64 empty_size, u64 hint_byte, | ||
1743 | u64 search_end, struct btrfs_key *ins, | ||
1744 | u64 data); | ||
1745 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
1746 | struct extent_buffer *orig_buf, struct extent_buffer *buf, | ||
1747 | u32 *nr_extents); | ||
1748 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
1749 | struct extent_buffer *buf, u32 nr_extents); | ||
1750 | int btrfs_update_ref(struct btrfs_trans_handle *trans, | ||
1751 | struct btrfs_root *root, struct extent_buffer *orig_buf, | ||
1752 | struct extent_buffer *buf, int start_slot, int nr); | ||
1753 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | ||
1754 | struct btrfs_root *root, | ||
1755 | u64 bytenr, u64 num_bytes, u64 parent, | ||
1756 | u64 root_objectid, u64 ref_generation, | ||
1757 | u64 owner_objectid, int pin); | ||
1758 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); | ||
1759 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | ||
1760 | struct btrfs_root *root, | ||
1761 | struct extent_io_tree *unpin); | ||
1762 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | ||
1763 | struct btrfs_root *root, | ||
1764 | u64 bytenr, u64 num_bytes, u64 parent, | ||
1765 | u64 root_objectid, u64 ref_generation, | ||
1766 | u64 owner_objectid); | ||
1767 | int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | ||
1768 | struct btrfs_root *root, u64 bytenr, | ||
1769 | u64 orig_parent, u64 parent, | ||
1770 | u64 root_objectid, u64 ref_generation, | ||
1771 | u64 owner_objectid); | ||
1772 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | ||
1773 | struct btrfs_root *root); | ||
1774 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); | ||
1775 | int btrfs_free_block_groups(struct btrfs_fs_info *info); | ||
1776 | int btrfs_read_block_groups(struct btrfs_root *root); | ||
1777 | int btrfs_make_block_group(struct btrfs_trans_handle *trans, | ||
1778 | struct btrfs_root *root, u64 bytes_used, | ||
1779 | u64 type, u64 chunk_objectid, u64 chunk_offset, | ||
1780 | u64 size); | ||
1781 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | ||
1782 | struct btrfs_root *root, u64 group_start); | ||
1783 | int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); | ||
1784 | int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, | ||
1785 | struct btrfs_root *root); | ||
1786 | int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); | ||
1787 | int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, | ||
1788 | struct btrfs_root *root, | ||
1789 | struct extent_buffer *buf, u64 orig_start); | ||
1790 | int btrfs_add_dead_reloc_root(struct btrfs_root *root); | ||
1791 | int btrfs_cleanup_reloc_trees(struct btrfs_root *root); | ||
1792 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); | ||
1793 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); | ||
1794 | /* ctree.c */ | ||
1795 | int btrfs_previous_item(struct btrfs_root *root, | ||
1796 | struct btrfs_path *path, u64 min_objectid, | ||
1797 | int type); | ||
1798 | int btrfs_merge_path(struct btrfs_trans_handle *trans, | ||
1799 | struct btrfs_root *root, | ||
1800 | struct btrfs_key *node_keys, | ||
1801 | u64 *nodes, int lowest_level); | ||
1802 | int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, | ||
1803 | struct btrfs_root *root, struct btrfs_path *path, | ||
1804 | struct btrfs_key *new_key); | ||
1805 | struct extent_buffer *btrfs_root_node(struct btrfs_root *root); | ||
1806 | struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); | ||
1807 | int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, | ||
1808 | struct btrfs_key *key, int lowest_level, | ||
1809 | int cache_only, u64 min_trans); | ||
1810 | int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, | ||
1811 | struct btrfs_key *max_key, | ||
1812 | struct btrfs_path *path, int cache_only, | ||
1813 | u64 min_trans); | ||
1814 | int btrfs_cow_block(struct btrfs_trans_handle *trans, | ||
1815 | struct btrfs_root *root, struct extent_buffer *buf, | ||
1816 | struct extent_buffer *parent, int parent_slot, | ||
1817 | struct extent_buffer **cow_ret, u64 prealloc_dest); | ||
1818 | int btrfs_copy_root(struct btrfs_trans_handle *trans, | ||
1819 | struct btrfs_root *root, | ||
1820 | struct extent_buffer *buf, | ||
1821 | struct extent_buffer **cow_ret, u64 new_root_objectid); | ||
1822 | int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1823 | *root, struct btrfs_path *path, u32 data_size); | ||
1824 | int btrfs_truncate_item(struct btrfs_trans_handle *trans, | ||
1825 | struct btrfs_root *root, | ||
1826 | struct btrfs_path *path, | ||
1827 | u32 new_size, int from_end); | ||
1828 | int btrfs_split_item(struct btrfs_trans_handle *trans, | ||
1829 | struct btrfs_root *root, | ||
1830 | struct btrfs_path *path, | ||
1831 | struct btrfs_key *new_key, | ||
1832 | unsigned long split_offset); | ||
1833 | int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1834 | *root, struct btrfs_key *key, struct btrfs_path *p, int | ||
1835 | ins_len, int cow); | ||
1836 | int btrfs_realloc_node(struct btrfs_trans_handle *trans, | ||
1837 | struct btrfs_root *root, struct extent_buffer *parent, | ||
1838 | int start_slot, int cache_only, u64 *last_ret, | ||
1839 | struct btrfs_key *progress); | ||
1840 | void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); | ||
1841 | struct btrfs_path *btrfs_alloc_path(void); | ||
1842 | void btrfs_free_path(struct btrfs_path *p); | ||
1843 | void btrfs_init_path(struct btrfs_path *p); | ||
1844 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
1845 | struct btrfs_path *path, int slot, int nr); | ||
1846 | int btrfs_del_leaf(struct btrfs_trans_handle *trans, | ||
1847 | struct btrfs_root *root, | ||
1848 | struct btrfs_path *path, u64 bytenr); | ||
1849 | static inline int btrfs_del_item(struct btrfs_trans_handle *trans, | ||
1850 | struct btrfs_root *root, | ||
1851 | struct btrfs_path *path) | ||
1852 | { | ||
1853 | return btrfs_del_items(trans, root, path, path->slots[0], 1); | ||
1854 | } | ||
1855 | |||
1856 | int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1857 | *root, struct btrfs_key *key, void *data, u32 data_size); | ||
1858 | int btrfs_insert_some_items(struct btrfs_trans_handle *trans, | ||
1859 | struct btrfs_root *root, | ||
1860 | struct btrfs_path *path, | ||
1861 | struct btrfs_key *cpu_key, u32 *data_size, | ||
1862 | int nr); | ||
1863 | int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, | ||
1864 | struct btrfs_root *root, | ||
1865 | struct btrfs_path *path, | ||
1866 | struct btrfs_key *cpu_key, u32 *data_size, int nr); | ||
1867 | |||
1868 | static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, | ||
1869 | struct btrfs_root *root, | ||
1870 | struct btrfs_path *path, | ||
1871 | struct btrfs_key *key, | ||
1872 | u32 data_size) | ||
1873 | { | ||
1874 | return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); | ||
1875 | } | ||
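
/*
 * A sketch of the usual insertion pattern (example_insert_u64 is
 * hypothetical; btrfs_mark_buffer_dirty is declared in disk-io.h):
 * reserve room for one item, then write the payload through the leaf.
 */
static inline int example_insert_u64(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_key *key, u64 value)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(value));
	if (ret == 0) {
		struct extent_buffer *leaf = path->nodes[0];

		write_extent_buffer(leaf, &value,
				    btrfs_item_ptr_offset(leaf,
							  path->slots[0]),
				    sizeof(value));
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;
}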
1876 | |||
1877 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); | ||
1878 | int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); | ||
1879 | int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); | ||
1880 | int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1881 | *root); | ||
1882 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | ||
1883 | struct btrfs_root *root, | ||
1884 | struct extent_buffer *node, | ||
1885 | struct extent_buffer *parent); | ||
1886 | /* root-item.c */ | ||
1887 | int btrfs_find_root_ref(struct btrfs_root *tree_root, | ||
1888 | struct btrfs_path *path, | ||
1889 | u64 root_id, u64 ref_id); | ||
1890 | int btrfs_add_root_ref(struct btrfs_trans_handle *trans, | ||
1891 | struct btrfs_root *tree_root, | ||
1892 | u64 root_id, u8 type, u64 ref_id, | ||
1893 | u64 dirid, u64 sequence, | ||
1894 | const char *name, int name_len); | ||
1895 | int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
1896 | struct btrfs_key *key); | ||
1897 | int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1898 | *root, struct btrfs_key *key, struct btrfs_root_item | ||
1899 | *item); | ||
1900 | int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1901 | *root, struct btrfs_key *key, struct btrfs_root_item | ||
1902 | *item); | ||
1903 | int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct | ||
1904 | btrfs_root_item *item, struct btrfs_key *key); | ||
1905 | int btrfs_search_root(struct btrfs_root *root, u64 search_start, | ||
1906 | u64 *found_objectid); | ||
1907 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, | ||
1908 | struct btrfs_root *latest_root); | ||
1909 | /* dir-item.c */ | ||
1910 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, | ||
1911 | struct btrfs_root *root, const char *name, | ||
1912 | int name_len, u64 dir, | ||
1913 | struct btrfs_key *location, u8 type, u64 index); | ||
1914 | struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, | ||
1915 | struct btrfs_root *root, | ||
1916 | struct btrfs_path *path, u64 dir, | ||
1917 | const char *name, int name_len, | ||
1918 | int mod); | ||
1919 | struct btrfs_dir_item * | ||
1920 | btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, | ||
1921 | struct btrfs_root *root, | ||
1922 | struct btrfs_path *path, u64 dir, | ||
1923 | u64 objectid, const char *name, int name_len, | ||
1924 | int mod); | ||
1925 | struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, | ||
1926 | struct btrfs_path *path, | ||
1927 | const char *name, int name_len); | ||
1928 | int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, | ||
1929 | struct btrfs_root *root, | ||
1930 | struct btrfs_path *path, | ||
1931 | struct btrfs_dir_item *di); | ||
1932 | int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, | ||
1933 | struct btrfs_root *root, const char *name, | ||
1934 | u16 name_len, const void *data, u16 data_len, | ||
1935 | u64 dir); | ||
1936 | struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, | ||
1937 | struct btrfs_root *root, | ||
1938 | struct btrfs_path *path, u64 dir, | ||
1939 | const char *name, u16 name_len, | ||
1940 | int mod); | ||
1941 | |||
1942 | /* orphan.c */ | ||
1943 | int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, | ||
1944 | struct btrfs_root *root, u64 offset); | ||
1945 | int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, | ||
1946 | struct btrfs_root *root, u64 offset); | ||
1947 | |||
1948 | /* inode-map.c */ | ||
1949 | int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, | ||
1950 | struct btrfs_root *fs_root, | ||
1951 | u64 dirid, u64 *objectid); | ||
1952 | int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); | ||
1953 | |||
1954 | /* inode-item.c */ | ||
1955 | int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, | ||
1956 | struct btrfs_root *root, | ||
1957 | const char *name, int name_len, | ||
1958 | u64 inode_objectid, u64 ref_objectid, u64 index); | ||
1959 | int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, | ||
1960 | struct btrfs_root *root, | ||
1961 | const char *name, int name_len, | ||
1962 | u64 inode_objectid, u64 ref_objectid, u64 *index); | ||
1963 | int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, | ||
1964 | struct btrfs_root *root, | ||
1965 | struct btrfs_path *path, u64 objectid); | ||
1966 | int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root | ||
1967 | *root, struct btrfs_path *path, | ||
1968 | struct btrfs_key *location, int mod); | ||
1969 | |||
1970 | /* file-item.c */ | ||
1971 | int btrfs_del_csums(struct btrfs_trans_handle *trans, | ||
1972 | struct btrfs_root *root, u64 bytenr, u64 len); | ||
1973 | int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, | ||
1974 | struct bio *bio, u32 *dst); | ||
1975 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, | ||
1976 | struct btrfs_root *root, | ||
1977 | u64 objectid, u64 pos, | ||
1978 | u64 disk_offset, u64 disk_num_bytes, | ||
1979 | u64 num_bytes, u64 offset, u64 ram_bytes, | ||
1980 | u8 compression, u8 encryption, u16 other_encoding); | ||
1981 | int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, | ||
1982 | struct btrfs_root *root, | ||
1983 | struct btrfs_path *path, u64 objectid, | ||
1984 | u64 bytenr, int mod); | ||
1985 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, | ||
1986 | struct btrfs_root *root, | ||
1987 | struct btrfs_ordered_sum *sums); | ||
1988 | int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, | ||
1989 | struct bio *bio, u64 file_start, int contig); | ||
1990 | int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, | ||
1991 | u64 start, unsigned long len); | ||
1992 | struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, | ||
1993 | struct btrfs_root *root, | ||
1994 | struct btrfs_path *path, | ||
1995 | u64 bytenr, int cow); | ||
1996 | int btrfs_csum_truncate(struct btrfs_trans_handle *trans, | ||
1997 | struct btrfs_root *root, struct btrfs_path *path, | ||
1998 | u64 isize); | ||
1999 | int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, | ||
2000 | u64 end, struct list_head *list); | ||
2001 | /* inode.c */ | ||
2002 | |||
2003 | /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ | ||
2004 | #if defined(ClearPageFsMisc) && !defined(ClearPageChecked) | ||
2005 | #define ClearPageChecked ClearPageFsMisc | ||
2006 | #define SetPageChecked SetPageFsMisc | ||
2007 | #define PageChecked PageFsMisc | ||
2008 | #endif | ||
2009 | |||
2010 | struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); | ||
2011 | int btrfs_set_inode_index(struct inode *dir, u64 *index); | ||
2012 | int btrfs_unlink_inode(struct btrfs_trans_handle *trans, | ||
2013 | struct btrfs_root *root, | ||
2014 | struct inode *dir, struct inode *inode, | ||
2015 | const char *name, int name_len); | ||
2016 | int btrfs_add_link(struct btrfs_trans_handle *trans, | ||
2017 | struct inode *parent_inode, struct inode *inode, | ||
2018 | const char *name, int name_len, int add_backref, u64 index); | ||
2019 | int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | ||
2020 | struct btrfs_root *root, | ||
2021 | struct inode *inode, u64 new_size, | ||
2022 | u32 min_type); | ||
2023 | |||
2024 | int btrfs_start_delalloc_inodes(struct btrfs_root *root); | ||
2025 | int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); | ||
2026 | int btrfs_writepages(struct address_space *mapping, | ||
2027 | struct writeback_control *wbc); | ||
2028 | int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, | ||
2029 | struct btrfs_root *new_root, struct dentry *dentry, | ||
2030 | u64 new_dirid, u64 alloc_hint); | ||
2031 | int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | ||
2032 | size_t size, struct bio *bio, unsigned long bio_flags); | ||
2033 | |||
2034 | unsigned long btrfs_force_ra(struct address_space *mapping, | ||
2035 | struct file_ra_state *ra, struct file *file, | ||
2036 | pgoff_t offset, pgoff_t last_index); | ||
2037 | int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, | ||
2038 | int for_del); | ||
2039 | int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); | ||
2040 | int btrfs_readpage(struct file *file, struct page *page); | ||
2041 | void btrfs_delete_inode(struct inode *inode); | ||
2042 | void btrfs_put_inode(struct inode *inode); | ||
2043 | void btrfs_read_locked_inode(struct inode *inode); | ||
2044 | int btrfs_write_inode(struct inode *inode, int wait); | ||
2045 | void btrfs_dirty_inode(struct inode *inode); | ||
2046 | struct inode *btrfs_alloc_inode(struct super_block *sb); | ||
2047 | void btrfs_destroy_inode(struct inode *inode); | ||
2048 | int btrfs_init_cachep(void); | ||
2049 | void btrfs_destroy_cachep(void); | ||
2050 | long btrfs_ioctl_trans_end(struct file *file); | ||
2051 | struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, | ||
2052 | struct btrfs_root *root, int wait); | ||
2053 | struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | ||
2054 | struct btrfs_root *root); | ||
2055 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | ||
2056 | struct btrfs_root *root, int *is_new); | ||
2057 | int btrfs_commit_write(struct file *file, struct page *page, | ||
2058 | unsigned from, unsigned to); | ||
2059 | struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, | ||
2060 | size_t page_offset, u64 start, u64 end, | ||
2061 | int create); | ||
2062 | int btrfs_update_inode(struct btrfs_trans_handle *trans, | ||
2063 | struct btrfs_root *root, | ||
2064 | struct inode *inode); | ||
2065 | int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); | ||
2066 | int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); | ||
2067 | void btrfs_orphan_cleanup(struct btrfs_root *root); | ||
2068 | int btrfs_cont_expand(struct inode *inode, loff_t size); | ||
2069 | |||
2070 | /* ioctl.c */ | ||
2071 | long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); | ||
2072 | |||
2073 | /* file.c */ | ||
2074 | int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); | ||
2075 | int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | ||
2076 | int skip_pinned); | ||
2077 | int btrfs_check_file(struct btrfs_root *root, struct inode *inode); | ||
2078 | extern struct file_operations btrfs_file_operations; | ||
2079 | int btrfs_drop_extents(struct btrfs_trans_handle *trans, | ||
2080 | struct btrfs_root *root, struct inode *inode, | ||
2081 | u64 start, u64 end, u64 inline_limit, u64 *hint_block); | ||
2082 | int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, | ||
2083 | struct btrfs_root *root, | ||
2084 | struct inode *inode, u64 start, u64 end); | ||
2085 | int btrfs_release_file(struct inode *inode, struct file *file); | ||
2086 | |||
2087 | /* tree-defrag.c */ | ||
2088 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | ||
2089 | struct btrfs_root *root, int cache_only); | ||
2090 | |||
2091 | /* sysfs.c */ | ||
2092 | int btrfs_init_sysfs(void); | ||
2093 | void btrfs_exit_sysfs(void); | ||
2094 | int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); | ||
2095 | int btrfs_sysfs_add_root(struct btrfs_root *root); | ||
2096 | void btrfs_sysfs_del_root(struct btrfs_root *root); | ||
2097 | void btrfs_sysfs_del_super(struct btrfs_fs_info *root); | ||
2098 | |||
2099 | /* xattr.c */ | ||
2100 | ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); | ||
2101 | |||
2102 | /* super.c */ | ||
2103 | u64 btrfs_parse_size(char *str); | ||
2104 | int btrfs_parse_options(struct btrfs_root *root, char *options); | ||
2105 | int btrfs_sync_fs(struct super_block *sb, int wait); | ||
2106 | |||
2107 | /* acl.c */ | ||
2108 | int btrfs_check_acl(struct inode *inode, int mask); | ||
2109 | int btrfs_init_acl(struct inode *inode, struct inode *dir); | ||
2110 | int btrfs_acl_chmod(struct inode *inode); | ||
2111 | |||
2112 | /* free-space-cache.c */ | ||
2113 | int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, | ||
2114 | u64 bytenr, u64 size); | ||
2115 | int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, | ||
2116 | u64 offset, u64 bytes); | ||
2117 | int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, | ||
2118 | u64 bytenr, u64 size); | ||
2119 | int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, | ||
2120 | u64 offset, u64 bytes); | ||
2121 | void btrfs_remove_free_space_cache(struct btrfs_block_group_cache | ||
2122 | *block_group); | ||
2123 | struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache | ||
2124 | *block_group, u64 offset, | ||
2125 | u64 bytes); | ||
2126 | void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, | ||
2127 | u64 bytes); | ||
2128 | u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); | ||
2129 | #endif | ||
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 000000000000..926a0b287a7d --- /dev/null +++ b/fs/btrfs/dir-item.c | |||
@@ -0,0 +1,386 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include "ctree.h" | ||
20 | #include "disk-io.h" | ||
21 | #include "hash.h" | ||
22 | #include "transaction.h" | ||
23 | |||
24 | /* | ||
25 | * insert a name into a directory, extending the existing item when there | ||
26 | * is a hash collision. data_size indicates how big the new entry should be. On | ||
27 | * success a struct btrfs_dir_item pointer is returned, otherwise it is | ||
28 | * an ERR_PTR. | ||
29 | * | ||
30 | * The name is not copied into the dir item; the caller must copy it in. | ||
31 | */ | ||
32 | static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle | ||
33 | *trans, | ||
34 | struct btrfs_root *root, | ||
35 | struct btrfs_path *path, | ||
36 | struct btrfs_key *cpu_key, | ||
37 | u32 data_size, | ||
38 | const char *name, | ||
39 | int name_len) | ||
40 | { | ||
41 | int ret; | ||
42 | char *ptr; | ||
43 | struct btrfs_item *item; | ||
44 | struct extent_buffer *leaf; | ||
45 | |||
46 | ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); | ||
47 | if (ret == -EEXIST) { | ||
48 | struct btrfs_dir_item *di; | ||
49 | di = btrfs_match_dir_item_name(root, path, name, name_len); | ||
50 | if (di) | ||
51 | return ERR_PTR(-EEXIST); | ||
52 | ret = btrfs_extend_item(trans, root, path, data_size); | ||
53 | WARN_ON(ret > 0); | ||
54 | } | ||
55 | if (ret < 0) | ||
56 | return ERR_PTR(ret); | ||
57 | WARN_ON(ret > 0); | ||
58 | leaf = path->nodes[0]; | ||
59 | item = btrfs_item_nr(leaf, path->slots[0]); | ||
60 | ptr = btrfs_item_ptr(leaf, path->slots[0], char); | ||
61 | BUG_ON(data_size > btrfs_item_size(leaf, item)); | ||
62 | ptr += btrfs_item_size(leaf, item) - data_size; | ||
63 | return (struct btrfs_dir_item *)ptr; | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * xattrs work a lot like directories, this inserts an xattr item | ||
68 | * into the tree | ||
69 | */ | ||
70 | int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, | ||
71 | struct btrfs_root *root, const char *name, | ||
72 | u16 name_len, const void *data, u16 data_len, | ||
73 | u64 dir) | ||
74 | { | ||
75 | int ret = 0; | ||
76 | struct btrfs_path *path; | ||
77 | struct btrfs_dir_item *dir_item; | ||
78 | unsigned long name_ptr, data_ptr; | ||
79 | struct btrfs_key key, location; | ||
80 | struct btrfs_disk_key disk_key; | ||
81 | struct extent_buffer *leaf; | ||
82 | u32 data_size; | ||
83 | |||
84 | key.objectid = dir; | ||
85 | btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); | ||
86 | key.offset = btrfs_name_hash(name, name_len); | ||
87 | if (name_len + data_len + sizeof(struct btrfs_dir_item) > | ||
88 | BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) | ||
89 | return -ENOSPC; | ||
90 | path = btrfs_alloc_path(); | ||
91 | if (!path) | ||
92 | return -ENOMEM; | ||
93 | |||
94 | data_size = sizeof(*dir_item) + name_len + data_len; | ||
95 | dir_item = insert_with_overflow(trans, root, path, &key, data_size, | ||
96 | name, name_len); | ||
97 | /* | ||
98 | * FIXME: at some point we should handle xattrs that are larger than | ||
99 | * what we can fit in our leaf. We zero out 'location' because we aren't | ||
100 | * pointing at anything else; that will change if we store the xattr | ||
101 | * data in a separate inode. | ||
102 | */ | ||
103 | BUG_ON(IS_ERR(dir_item)); | ||
104 | memset(&location, 0, sizeof(location)); | ||
105 | |||
106 | leaf = path->nodes[0]; | ||
107 | btrfs_cpu_key_to_disk(&disk_key, &location); | ||
108 | btrfs_set_dir_item_key(leaf, dir_item, &disk_key); | ||
109 | btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); | ||
110 | btrfs_set_dir_name_len(leaf, dir_item, name_len); | ||
111 | btrfs_set_dir_transid(leaf, dir_item, trans->transid); | ||
112 | btrfs_set_dir_data_len(leaf, dir_item, data_len); | ||
113 | name_ptr = (unsigned long)(dir_item + 1); | ||
114 | data_ptr = (unsigned long)((char *)name_ptr + name_len); | ||
115 | |||
116 | write_extent_buffer(leaf, name, name_ptr, name_len); | ||
117 | write_extent_buffer(leaf, data, data_ptr, data_len); | ||
118 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
119 | |||
120 | btrfs_free_path(path); | ||
121 | return ret; | ||
122 | } | ||
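The xattr item built above is a single btrfs_dir_item header followed immediately by the attribute name and then the value bytes, all inside one leaf item. A minimal userspace sketch of that offset arithmetic; the sizes here are illustrative assumptions, not the real struct sizes:

#include <stdio.h>

int main(void)
{
	/* assumed sizes, standing in for sizeof(struct btrfs_dir_item) etc. */
	unsigned int hdr = 30, name_len = 13, data_len = 32;
	unsigned int data_size = hdr + name_len + data_len;

	printf("leaf item payload = %u bytes\n", data_size);
	printf("name starts at +%u, value starts at +%u\n",
	       hdr, hdr + name_len);
	return 0;
}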
123 | |||
124 | /* | ||
125 | * insert a directory item in the tree, doing all the magic for | ||
126 | * both indexes. 'dir' indicates which objectid to insert it into, | ||
127 | * 'location' is the key to stuff into the directory item, 'type' is the | ||
128 | * type of the inode we're pointing to, and 'index' is the sequence number | ||
129 | * to use for the second index (if one is created). | ||
130 | */ | ||
131 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root | ||
132 | *root, const char *name, int name_len, u64 dir, | ||
133 | struct btrfs_key *location, u8 type, u64 index) | ||
134 | { | ||
135 | int ret = 0; | ||
136 | int ret2 = 0; | ||
137 | struct btrfs_path *path; | ||
138 | struct btrfs_dir_item *dir_item; | ||
139 | struct extent_buffer *leaf; | ||
140 | unsigned long name_ptr; | ||
141 | struct btrfs_key key; | ||
142 | struct btrfs_disk_key disk_key; | ||
143 | u32 data_size; | ||
144 | |||
145 | key.objectid = dir; | ||
146 | btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); | ||
147 | key.offset = btrfs_name_hash(name, name_len); | ||
148 | path = btrfs_alloc_path(); | ||
149 | data_size = sizeof(*dir_item) + name_len; | ||
150 | dir_item = insert_with_overflow(trans, root, path, &key, data_size, | ||
151 | name, name_len); | ||
152 | if (IS_ERR(dir_item)) { | ||
153 | ret = PTR_ERR(dir_item); | ||
154 | if (ret == -EEXIST) | ||
155 | goto second_insert; | ||
156 | goto out; | ||
157 | } | ||
158 | |||
159 | leaf = path->nodes[0]; | ||
160 | btrfs_cpu_key_to_disk(&disk_key, location); | ||
161 | btrfs_set_dir_item_key(leaf, dir_item, &disk_key); | ||
162 | btrfs_set_dir_type(leaf, dir_item, type); | ||
163 | btrfs_set_dir_data_len(leaf, dir_item, 0); | ||
164 | btrfs_set_dir_name_len(leaf, dir_item, name_len); | ||
165 | btrfs_set_dir_transid(leaf, dir_item, trans->transid); | ||
166 | name_ptr = (unsigned long)(dir_item + 1); | ||
167 | |||
168 | write_extent_buffer(leaf, name, name_ptr, name_len); | ||
169 | btrfs_mark_buffer_dirty(leaf); | ||
170 | |||
171 | second_insert: | ||
172 | /* FIXME, use some real flag for selecting the extra index */ | ||
173 | if (root == root->fs_info->tree_root) { | ||
174 | ret = 0; | ||
175 | goto out; | ||
176 | } | ||
177 | btrfs_release_path(root, path); | ||
178 | |||
179 | btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); | ||
180 | key.offset = index; | ||
181 | dir_item = insert_with_overflow(trans, root, path, &key, data_size, | ||
182 | name, name_len); | ||
183 | if (IS_ERR(dir_item)) { | ||
184 | ret2 = PTR_ERR(dir_item); | ||
185 | goto out; | ||
186 | } | ||
187 | leaf = path->nodes[0]; | ||
188 | btrfs_cpu_key_to_disk(&disk_key, location); | ||
189 | btrfs_set_dir_item_key(leaf, dir_item, &disk_key); | ||
190 | btrfs_set_dir_type(leaf, dir_item, type); | ||
191 | btrfs_set_dir_data_len(leaf, dir_item, 0); | ||
192 | btrfs_set_dir_name_len(leaf, dir_item, name_len); | ||
193 | btrfs_set_dir_transid(leaf, dir_item, trans->transid); | ||
194 | name_ptr = (unsigned long)(dir_item + 1); | ||
195 | write_extent_buffer(leaf, name, name_ptr, name_len); | ||
196 | btrfs_mark_buffer_dirty(leaf); | ||
197 | out: | ||
198 | btrfs_free_path(path); | ||
199 | if (ret) | ||
200 | return ret; | ||
201 | if (ret2) | ||
202 | return ret2; | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * lookup a directory item based on name. 'dir' is the objectid | ||
208 | * we're searching in, and 'mod' tells us whether the caller plans to | ||
209 | * delete the item (use mod < 0) or modify it in place (use mod > 0) | ||
210 | */ | ||
211 | struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, | ||
212 | struct btrfs_root *root, | ||
213 | struct btrfs_path *path, u64 dir, | ||
214 | const char *name, int name_len, | ||
215 | int mod) | ||
216 | { | ||
217 | int ret; | ||
218 | struct btrfs_key key; | ||
219 | int ins_len = mod < 0 ? -1 : 0; | ||
220 | int cow = mod != 0; | ||
221 | struct btrfs_key found_key; | ||
222 | struct extent_buffer *leaf; | ||
223 | |||
224 | key.objectid = dir; | ||
225 | btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); | ||
226 | |||
227 | key.offset = btrfs_name_hash(name, name_len); | ||
228 | |||
229 | ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); | ||
230 | if (ret < 0) | ||
231 | return ERR_PTR(ret); | ||
232 | if (ret > 0) { | ||
233 | if (path->slots[0] == 0) | ||
234 | return NULL; | ||
235 | path->slots[0]--; | ||
236 | } | ||
237 | |||
238 | leaf = path->nodes[0]; | ||
239 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
240 | |||
241 | if (found_key.objectid != dir || | ||
242 | btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || | ||
243 | found_key.offset != key.offset) | ||
244 | return NULL; | ||
245 | |||
246 | return btrfs_match_dir_item_name(root, path, name, name_len); | ||
247 | } | ||
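The 'mod' convention above maps directly onto the ins_len/cow arguments of btrfs_search_slot. A small standalone sketch of that derivation, using the same expressions as the function and just printing the result:

#include <stdio.h>

int main(void)
{
	for (int mod = -1; mod <= 1; mod++) {
		int ins_len = mod < 0 ? -1 : 0;  /* same derivation as above */
		int cow = mod != 0;

		printf("mod %2d -> ins_len %2d, cow %d\n", mod, ins_len, cow);
	}
	return 0;
}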
248 | |||
249 | /* | ||
250 | * lookup a directory item based on index. 'dir' is the objectid | ||
251 | * we're searching in, and 'mod' tells us whether the caller plans to | ||
252 | * delete the item (use mod < 0) or modify it in place (use mod > 0) | ||
253 | * | ||
254 | * The name is used to make sure the index really points to the name you were | ||
255 | * looking for. | ||
256 | */ | ||
257 | struct btrfs_dir_item * | ||
258 | btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, | ||
259 | struct btrfs_root *root, | ||
260 | struct btrfs_path *path, u64 dir, | ||
261 | u64 objectid, const char *name, int name_len, | ||
262 | int mod) | ||
263 | { | ||
264 | int ret; | ||
265 | struct btrfs_key key; | ||
266 | int ins_len = mod < 0 ? -1 : 0; | ||
267 | int cow = mod != 0; | ||
268 | |||
269 | key.objectid = dir; | ||
270 | btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); | ||
271 | key.offset = objectid; | ||
272 | |||
273 | ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); | ||
274 | if (ret < 0) | ||
275 | return ERR_PTR(ret); | ||
276 | if (ret > 0) | ||
277 | return ERR_PTR(-ENOENT); | ||
278 | return btrfs_match_dir_item_name(root, path, name, name_len); | ||
279 | } | ||
280 | |||
281 | struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, | ||
282 | struct btrfs_root *root, | ||
283 | struct btrfs_path *path, u64 dir, | ||
284 | const char *name, u16 name_len, | ||
285 | int mod) | ||
286 | { | ||
287 | int ret; | ||
288 | struct btrfs_key key; | ||
289 | int ins_len = mod < 0 ? -1 : 0; | ||
290 | int cow = mod != 0; | ||
291 | struct btrfs_key found_key; | ||
292 | struct extent_buffer *leaf; | ||
293 | |||
294 | key.objectid = dir; | ||
295 | btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); | ||
296 | key.offset = btrfs_name_hash(name, name_len); | ||
297 | ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); | ||
298 | if (ret < 0) | ||
299 | return ERR_PTR(ret); | ||
300 | if (ret > 0) { | ||
301 | if (path->slots[0] == 0) | ||
302 | return NULL; | ||
303 | path->slots[0]--; | ||
304 | } | ||
305 | |||
306 | leaf = path->nodes[0]; | ||
307 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
308 | |||
309 | if (found_key.objectid != dir || | ||
310 | btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || | ||
311 | found_key.offset != key.offset) | ||
312 | return NULL; | ||
313 | |||
314 | return btrfs_match_dir_item_name(root, path, name, name_len); | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * helper function to look at the directory item pointed to by 'path' | ||
319 | * this walks through all the entries in a dir item and finds one | ||
320 | * for a specific name. | ||
321 | */ | ||
322 | struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, | ||
323 | struct btrfs_path *path, | ||
324 | const char *name, int name_len) | ||
325 | { | ||
326 | struct btrfs_dir_item *dir_item; | ||
327 | unsigned long name_ptr; | ||
328 | u32 total_len; | ||
329 | u32 cur = 0; | ||
330 | u32 this_len; | ||
331 | struct extent_buffer *leaf; | ||
332 | |||
333 | leaf = path->nodes[0]; | ||
334 | dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); | ||
335 | total_len = btrfs_item_size_nr(leaf, path->slots[0]); | ||
336 | while (cur < total_len) { | ||
337 | this_len = sizeof(*dir_item) + | ||
338 | btrfs_dir_name_len(leaf, dir_item) + | ||
339 | btrfs_dir_data_len(leaf, dir_item); | ||
340 | name_ptr = (unsigned long)(dir_item + 1); | ||
341 | |||
342 | if (btrfs_dir_name_len(leaf, dir_item) == name_len && | ||
343 | memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) | ||
344 | return dir_item; | ||
345 | |||
346 | cur += this_len; | ||
347 | dir_item = (struct btrfs_dir_item *)((char *)dir_item + | ||
348 | this_len); | ||
349 | } | ||
350 | return NULL; | ||
351 | } | ||
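A minimal userspace sketch of the packed-entry walk above, assuming a simplified stand-in header instead of the real struct btrfs_dir_item: colliding entries are laid end to end inside one leaf item, and a lookup matches by length plus byte comparison exactly as the function does.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct demo_dir_item {                 /* stand-in for struct btrfs_dir_item */
	uint16_t name_len;             /* name bytes follow the header       */
};

static char item[256];                 /* one leaf item's payload            */
static size_t item_len;

static void pack(const char *name)
{
	struct demo_dir_item hdr = { .name_len = (uint16_t)strlen(name) };

	memcpy(item + item_len, &hdr, sizeof(hdr));
	memcpy(item + item_len + sizeof(hdr), name, hdr.name_len);
	item_len += sizeof(hdr) + hdr.name_len;
}

static int match(const char *name)
{
	size_t cur = 0;

	while (cur < item_len) {       /* same walk as the function above    */
		struct demo_dir_item hdr;

		memcpy(&hdr, item + cur, sizeof(hdr));
		if (hdr.name_len == strlen(name) &&
		    !memcmp(item + cur + sizeof(hdr), name, hdr.name_len))
			return 1;
		cur += sizeof(hdr) + hdr.name_len;
	}
	return 0;
}

int main(void)
{
	pack("foo");                   /* pretend these two names collide    */
	pack("bar");
	printf("bar found: %d\n", match("bar"));   /* prints 1 */
	return 0;
}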
352 | |||
353 | /* | ||
354 | * given a pointer to one entry inside a directory item, delete that | ||
355 | * entry. This handles items that hold more than one entry. | ||
356 | */ | ||
357 | int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, | ||
358 | struct btrfs_root *root, | ||
359 | struct btrfs_path *path, | ||
360 | struct btrfs_dir_item *di) | ||
361 | { | ||
363 | struct extent_buffer *leaf; | ||
364 | u32 sub_item_len; | ||
365 | u32 item_len; | ||
366 | int ret = 0; | ||
367 | |||
368 | leaf = path->nodes[0]; | ||
369 | sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + | ||
370 | btrfs_dir_data_len(leaf, di); | ||
371 | item_len = btrfs_item_size_nr(leaf, path->slots[0]); | ||
372 | if (sub_item_len == item_len) { | ||
373 | ret = btrfs_del_item(trans, root, path); | ||
374 | } else { | ||
375 | /* shift the remaining entries down over the deleted one */ | ||
376 | unsigned long ptr = (unsigned long)di; | ||
377 | unsigned long start; | ||
378 | |||
379 | start = btrfs_item_ptr_offset(leaf, path->slots[0]); | ||
380 | memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, | ||
381 | item_len - (ptr + sub_item_len - start)); | ||
382 | ret = btrfs_truncate_item(trans, root, path, | ||
383 | item_len - sub_item_len, 1); | ||
384 | } | ||
385 | return ret; | ||
386 | } | ||
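A worked example of the deletion path above, with hypothetical entry sizes: removing a 40-byte entry from the front of a 90-byte item slides the trailing 50 bytes down over it, then the item is truncated to the new length.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char item[90];
	unsigned int di_off = 0;       /* victim entry starts the item     */
	unsigned int sub_item_len = 40, item_len = 90;

	memset(item, 'A', 40);         /* the entry being deleted          */
	memset(item + 40, 'B', 50);    /* the entry that survives          */

	/* same move as above: slide the tail down over the victim */
	memmove(item + di_off, item + di_off + sub_item_len,
		item_len - (di_off + sub_item_len));
	item_len -= sub_item_len;      /* what btrfs_truncate_item does    */

	printf("new len %u, first byte %c\n", item_len, item[0]); /* 50 B */
	return 0;
}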
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 000000000000..81a313874ae5 --- /dev/null +++ b/fs/btrfs/disk-io.c | |||
@@ -0,0 +1,2343 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/version.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/scatterlist.h> | ||
23 | #include <linux/swap.h> | ||
24 | #include <linux/radix-tree.h> | ||
25 | #include <linux/writeback.h> | ||
26 | #include <linux/buffer_head.h> | ||
27 | #include <linux/workqueue.h> | ||
28 | #include <linux/kthread.h> | ||
29 | #include <linux/freezer.h> | ||
30 | #include "compat.h" | ||
31 | #include "crc32c.h" | ||
32 | #include "ctree.h" | ||
33 | #include "disk-io.h" | ||
34 | #include "transaction.h" | ||
35 | #include "btrfs_inode.h" | ||
36 | #include "volumes.h" | ||
37 | #include "print-tree.h" | ||
38 | #include "async-thread.h" | ||
39 | #include "locking.h" | ||
40 | #include "ref-cache.h" | ||
41 | #include "tree-log.h" | ||
42 | |||
43 | static struct extent_io_ops btree_extent_io_ops; | ||
44 | static void end_workqueue_fn(struct btrfs_work *work); | ||
45 | |||
46 | /* | ||
47 | * end_io_wq structs are used to do processing in task context when an IO is | ||
48 | * complete. This is used during reads to verify checksums, and it is used | ||
49 | * by writes to insert metadata for new file extents after IO is complete. | ||
50 | */ | ||
51 | struct end_io_wq { | ||
52 | struct bio *bio; | ||
53 | bio_end_io_t *end_io; | ||
54 | void *private; | ||
55 | struct btrfs_fs_info *info; | ||
56 | int error; | ||
57 | int metadata; | ||
58 | struct list_head list; | ||
59 | struct btrfs_work work; | ||
60 | }; | ||
61 | |||
62 | /* | ||
63 | * async submit bios are used to offload expensive checksumming | ||
64 | * onto the worker threads. They checksum file and metadata bios | ||
65 | * just before they are sent down the IO stack. | ||
66 | */ | ||
67 | struct async_submit_bio { | ||
68 | struct inode *inode; | ||
69 | struct bio *bio; | ||
70 | struct list_head list; | ||
71 | extent_submit_bio_hook_t *submit_bio_start; | ||
72 | extent_submit_bio_hook_t *submit_bio_done; | ||
73 | int rw; | ||
74 | int mirror_num; | ||
75 | unsigned long bio_flags; | ||
76 | struct btrfs_work work; | ||
77 | }; | ||
78 | |||
79 | /* | ||
80 | * extents on the btree inode are pretty simple: there's one extent | ||
81 | * that covers the entire device | ||
82 | */ | ||
83 | static struct extent_map *btree_get_extent(struct inode *inode, | ||
84 | struct page *page, size_t page_offset, u64 start, u64 len, | ||
85 | int create) | ||
86 | { | ||
87 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
88 | struct extent_map *em; | ||
89 | int ret; | ||
90 | |||
91 | spin_lock(&em_tree->lock); | ||
92 | em = lookup_extent_mapping(em_tree, start, len); | ||
93 | if (em) { | ||
94 | em->bdev = | ||
95 | BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; | ||
96 | spin_unlock(&em_tree->lock); | ||
97 | goto out; | ||
98 | } | ||
99 | spin_unlock(&em_tree->lock); | ||
100 | |||
101 | em = alloc_extent_map(GFP_NOFS); | ||
102 | if (!em) { | ||
103 | em = ERR_PTR(-ENOMEM); | ||
104 | goto out; | ||
105 | } | ||
106 | em->start = 0; | ||
107 | em->len = (u64)-1; | ||
108 | em->block_len = (u64)-1; | ||
109 | em->block_start = 0; | ||
110 | em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; | ||
111 | |||
112 | spin_lock(&em_tree->lock); | ||
113 | ret = add_extent_mapping(em_tree, em); | ||
114 | if (ret == -EEXIST) { | ||
115 | u64 failed_start = em->start; | ||
116 | u64 failed_len = em->len; | ||
117 | |||
118 | free_extent_map(em); | ||
119 | em = lookup_extent_mapping(em_tree, start, len); | ||
120 | if (em) { | ||
121 | ret = 0; | ||
122 | } else { | ||
123 | em = lookup_extent_mapping(em_tree, failed_start, | ||
124 | failed_len); | ||
125 | ret = -EIO; | ||
126 | } | ||
127 | } else if (ret) { | ||
128 | free_extent_map(em); | ||
129 | em = NULL; | ||
130 | } | ||
131 | spin_unlock(&em_tree->lock); | ||
132 | |||
133 | if (ret) | ||
134 | em = ERR_PTR(ret); | ||
135 | out: | ||
136 | return em; | ||
137 | } | ||
138 | |||
139 | u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) | ||
140 | { | ||
141 | return btrfs_crc32c(seed, data, len); | ||
142 | } | ||
143 | |||
144 | void btrfs_csum_final(u32 crc, char *result) | ||
145 | { | ||
146 | *(__le32 *)result = ~cpu_to_le32(crc); | ||
147 | } | ||
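These two helpers fix the checksum convention: the caller seeds the crc32c with ~0 (as csum_tree_block below does), and btrfs_csum_final complements the result and stores it little-endian. A standalone userspace sketch of the same convention, using a plain bitwise crc32c reference implementation rather than the kernel's accelerated one:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* plain bitwise crc32c (Castagnoli, reflected polynomial 0x82F63B78) */
static uint32_t crc32c(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char block[] = "example tree block payload";
	unsigned char result[4];

	uint32_t crc = crc32c(~(uint32_t)0, block, sizeof(block) - 1);
	uint32_t final = ~crc;                 /* what btrfs_csum_final does */

	memcpy(result, &final, sizeof(final)); /* little-endian on LE hosts  */
	printf("csum: %08x\n", final);
	return 0;
}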
148 | |||
149 | /* | ||
150 | * compute the csum for a btree block, and either verify it or write it | ||
151 | * into the csum field of the block. | ||
152 | */ | ||
153 | static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, | ||
154 | int verify) | ||
155 | { | ||
156 | u16 csum_size = | ||
157 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
158 | char *result = NULL; | ||
159 | unsigned long len; | ||
160 | unsigned long cur_len; | ||
161 | unsigned long offset = BTRFS_CSUM_SIZE; | ||
162 | char *map_token = NULL; | ||
163 | char *kaddr; | ||
164 | unsigned long map_start; | ||
165 | unsigned long map_len; | ||
166 | int err; | ||
167 | u32 crc = ~(u32)0; | ||
168 | unsigned long inline_result; | ||
169 | |||
170 | len = buf->len - offset; | ||
171 | while (len > 0) { | ||
172 | err = map_private_extent_buffer(buf, offset, 32, | ||
173 | &map_token, &kaddr, | ||
174 | &map_start, &map_len, KM_USER0); | ||
175 | if (err) | ||
176 | return 1; | ||
177 | cur_len = min(len, map_len - (offset - map_start)); | ||
178 | crc = btrfs_csum_data(root, kaddr + offset - map_start, | ||
179 | crc, cur_len); | ||
180 | len -= cur_len; | ||
181 | offset += cur_len; | ||
182 | unmap_extent_buffer(buf, map_token, KM_USER0); | ||
183 | } | ||
184 | if (csum_size > sizeof(inline_result)) { | ||
185 | result = kzalloc(csum_size, GFP_NOFS); | ||
186 | if (!result) | ||
187 | return 1; | ||
188 | } else { | ||
189 | result = (char *)&inline_result; | ||
190 | } | ||
191 | |||
192 | btrfs_csum_final(crc, result); | ||
193 | |||
194 | if (verify) { | ||
195 | if (memcmp_extent_buffer(buf, result, 0, csum_size)) { | ||
196 | u32 val; | ||
197 | u32 found = 0; | ||
198 | memcpy(&found, result, csum_size); | ||
199 | |||
200 | read_extent_buffer(buf, &val, 0, csum_size); | ||
201 | printk(KERN_INFO "btrfs: %s checksum verify failed " | ||
202 | "on %llu wanted %X found %X level %d\n", | ||
203 | root->fs_info->sb->s_id, | ||
204 | (unsigned long long)buf->start, val, found, btrfs_header_level(buf)); | ||
205 | if (result != (char *)&inline_result) | ||
206 | kfree(result); | ||
207 | return 1; | ||
208 | } | ||
209 | } else { | ||
210 | write_extent_buffer(buf, result, 0, csum_size); | ||
211 | } | ||
212 | if (result != (char *)&inline_result) | ||
213 | kfree(result); | ||
214 | return 0; | ||
215 | } | ||
216 | |||
217 | /* | ||
218 | * we can't consider a given block up to date unless the transid of the | ||
219 | * block matches the transid in the parent node's pointer. This is how we | ||
220 | * detect blocks that either didn't get written at all or got written | ||
221 | * in the wrong place. | ||
222 | */ | ||
223 | static int verify_parent_transid(struct extent_io_tree *io_tree, | ||
224 | struct extent_buffer *eb, u64 parent_transid) | ||
225 | { | ||
226 | int ret; | ||
227 | |||
228 | if (!parent_transid || btrfs_header_generation(eb) == parent_transid) | ||
229 | return 0; | ||
230 | |||
231 | lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); | ||
232 | if (extent_buffer_uptodate(io_tree, eb) && | ||
233 | btrfs_header_generation(eb) == parent_transid) { | ||
234 | ret = 0; | ||
235 | goto out; | ||
236 | } | ||
237 | printk("parent transid verify failed on %llu wanted %llu found %llu\n", | ||
238 | (unsigned long long)eb->start, | ||
239 | (unsigned long long)parent_transid, | ||
240 | (unsigned long long)btrfs_header_generation(eb)); | ||
241 | ret = 1; | ||
242 | clear_extent_buffer_uptodate(io_tree, eb); | ||
243 | out: | ||
244 | unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, | ||
245 | GFP_NOFS); | ||
246 | return ret; | ||
247 | } | ||
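A tiny sketch of the staleness test above, with made-up generation numbers: a parent transid of 0 disables the check, a matching generation passes, and a mismatch flags the block as stale or misplaced.

#include <stdio.h>

static int stale(unsigned long long parent_transid,
		 unsigned long long header_gen)
{
	/* same test as above: a parent transid of 0 means "don't check" */
	return parent_transid && header_gen != parent_transid;
}

int main(void)
{
	printf("%d %d %d\n",
	       stale(0, 97),     /* 0: nothing to verify against      */
	       stale(100, 100),  /* 0: generations match, up to date  */
	       stale(100, 97));  /* 1: stale or misplaced write       */
	return 0;
}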
248 | |||
249 | /* | ||
250 | * helper to read a given tree block, doing retries as required when | ||
251 | * the checksums don't match and we have alternate mirrors to try. | ||
252 | */ | ||
253 | static int btree_read_extent_buffer_pages(struct btrfs_root *root, | ||
254 | struct extent_buffer *eb, | ||
255 | u64 start, u64 parent_transid) | ||
256 | { | ||
257 | struct extent_io_tree *io_tree; | ||
258 | int ret; | ||
259 | int num_copies = 0; | ||
260 | int mirror_num = 0; | ||
261 | |||
262 | io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; | ||
263 | while (1) { | ||
264 | ret = read_extent_buffer_pages(io_tree, eb, start, 1, | ||
265 | btree_get_extent, mirror_num); | ||
266 | if (!ret && | ||
267 | !verify_parent_transid(io_tree, eb, parent_transid)) | ||
268 | return ret; | ||
269 | |||
270 | num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, | ||
271 | eb->start, eb->len); | ||
272 | if (num_copies == 1) | ||
273 | return ret; | ||
274 | |||
275 | mirror_num++; | ||
276 | if (mirror_num > num_copies) | ||
277 | return ret; | ||
278 | } | ||
279 | return -EIO; | ||
280 | } | ||
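A userspace sketch of the retry policy above, with a stubbed read helper standing in for the checksum and transid verification: mirror_num 0 lets the lower layers pick a copy, and the explicit mirrors 1..num_copies are tried in turn on failure.

#include <stdio.h>

/* stub: pretend the copies read via mirrors 0 and 1 fail verification */
static int read_copy(int mirror)
{
	return mirror < 2 ? -1 : 0;
}

int main(void)
{
	int num_copies = 2;   /* e.g. RAID1 metadata: two copies */
	int mirror = 0;       /* 0 = let the lower layers choose */
	int ret;

	while (1) {
		ret = read_copy(mirror);
		if (ret == 0)
			break;                    /* verified OK          */
		if (num_copies == 1 || ++mirror > num_copies)
			break;                    /* out of copies        */
	}
	printf("ret %d after trying mirror %d\n", ret, mirror);  /* 0, 2 */
	return 0;
}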
281 | |||
282 | /* | ||
283 | * checksum a dirty tree block before IO. This has extra checks to make sure | ||
284 | * we only fill in the checksum field in the first page of a multi-page block | ||
285 | */ | ||
286 | |||
287 | static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) | ||
288 | { | ||
289 | struct extent_io_tree *tree; | ||
290 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
291 | u64 found_start; | ||
292 | int found_level; | ||
293 | unsigned long len; | ||
294 | struct extent_buffer *eb; | ||
295 | int ret; | ||
296 | |||
297 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
298 | |||
299 | if (page->private == EXTENT_PAGE_PRIVATE) | ||
300 | goto out; | ||
301 | if (!page->private) | ||
302 | goto out; | ||
303 | len = page->private >> 2; | ||
304 | WARN_ON(len == 0); | ||
305 | |||
306 | eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); | ||
307 | ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, | ||
308 | btrfs_header_generation(eb)); | ||
309 | BUG_ON(ret); | ||
310 | found_start = btrfs_header_bytenr(eb); | ||
311 | if (found_start != start) { | ||
312 | WARN_ON(1); | ||
313 | goto err; | ||
314 | } | ||
315 | if (eb->first_page != page) { | ||
316 | WARN_ON(1); | ||
317 | goto err; | ||
318 | } | ||
319 | if (!PageUptodate(page)) { | ||
320 | WARN_ON(1); | ||
321 | goto err; | ||
322 | } | ||
323 | found_level = btrfs_header_level(eb); | ||
324 | |||
325 | csum_tree_block(root, eb, 0); | ||
326 | err: | ||
327 | free_extent_buffer(eb); | ||
328 | out: | ||
329 | return 0; | ||
330 | } | ||
331 | |||
332 | static int check_tree_block_fsid(struct btrfs_root *root, | ||
333 | struct extent_buffer *eb) | ||
334 | { | ||
335 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; | ||
336 | u8 fsid[BTRFS_UUID_SIZE]; | ||
337 | int ret = 1; | ||
338 | |||
339 | read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), | ||
340 | BTRFS_FSID_SIZE); | ||
341 | while (fs_devices) { | ||
342 | if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { | ||
343 | ret = 0; | ||
344 | break; | ||
345 | } | ||
346 | fs_devices = fs_devices->seed; | ||
347 | } | ||
348 | return ret; | ||
349 | } | ||
350 | |||
351 | static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, | ||
352 | struct extent_state *state) | ||
353 | { | ||
354 | struct extent_io_tree *tree; | ||
355 | u64 found_start; | ||
356 | int found_level; | ||
357 | unsigned long len; | ||
358 | struct extent_buffer *eb; | ||
359 | struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; | ||
360 | int ret = 0; | ||
361 | |||
362 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
363 | if (page->private == EXTENT_PAGE_PRIVATE) | ||
364 | goto out; | ||
365 | if (!page->private) | ||
366 | goto out; | ||
367 | |||
368 | len = page->private >> 2; | ||
369 | WARN_ON(len == 0); | ||
370 | |||
371 | eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); | ||
372 | |||
373 | found_start = btrfs_header_bytenr(eb); | ||
374 | if (found_start != start) { | ||
375 | printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", | ||
376 | (unsigned long long)found_start, | ||
377 | (unsigned long long)eb->start); | ||
378 | ret = -EIO; | ||
379 | goto err; | ||
380 | } | ||
381 | if (eb->first_page != page) { | ||
382 | printk(KERN_INFO "btrfs bad first page %lu %lu\n", | ||
383 | eb->first_page->index, page->index); | ||
384 | WARN_ON(1); | ||
385 | ret = -EIO; | ||
386 | goto err; | ||
387 | } | ||
388 | if (check_tree_block_fsid(root, eb)) { | ||
389 | printk(KERN_INFO "btrfs bad fsid on block %llu\n", | ||
390 | (unsigned long long)eb->start); | ||
391 | ret = -EIO; | ||
392 | goto err; | ||
393 | } | ||
394 | found_level = btrfs_header_level(eb); | ||
395 | |||
396 | ret = csum_tree_block(root, eb, 1); | ||
397 | if (ret) | ||
398 | ret = -EIO; | ||
399 | |||
400 | end = min_t(u64, eb->len, PAGE_CACHE_SIZE); | ||
401 | end = eb->start + end - 1; | ||
402 | err: | ||
403 | free_extent_buffer(eb); | ||
404 | out: | ||
405 | return ret; | ||
406 | } | ||
407 | |||
408 | static void end_workqueue_bio(struct bio *bio, int err) | ||
409 | { | ||
410 | struct end_io_wq *end_io_wq = bio->bi_private; | ||
411 | struct btrfs_fs_info *fs_info; | ||
412 | |||
413 | fs_info = end_io_wq->info; | ||
414 | end_io_wq->error = err; | ||
415 | end_io_wq->work.func = end_workqueue_fn; | ||
416 | end_io_wq->work.flags = 0; | ||
417 | |||
418 | if (bio->bi_rw & (1 << BIO_RW)) { | ||
419 | if (end_io_wq->metadata) | ||
420 | btrfs_queue_worker(&fs_info->endio_meta_write_workers, | ||
421 | &end_io_wq->work); | ||
422 | else | ||
423 | btrfs_queue_worker(&fs_info->endio_write_workers, | ||
424 | &end_io_wq->work); | ||
425 | } else { | ||
426 | if (end_io_wq->metadata) | ||
427 | btrfs_queue_worker(&fs_info->endio_meta_workers, | ||
428 | &end_io_wq->work); | ||
429 | else | ||
430 | btrfs_queue_worker(&fs_info->endio_workers, | ||
431 | &end_io_wq->work); | ||
432 | } | ||
433 | } | ||
434 | |||
435 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, | ||
436 | int metadata) | ||
437 | { | ||
438 | struct end_io_wq *end_io_wq; | ||
439 | end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); | ||
440 | if (!end_io_wq) | ||
441 | return -ENOMEM; | ||
442 | |||
443 | end_io_wq->private = bio->bi_private; | ||
444 | end_io_wq->end_io = bio->bi_end_io; | ||
445 | end_io_wq->info = info; | ||
446 | end_io_wq->error = 0; | ||
447 | end_io_wq->bio = bio; | ||
448 | end_io_wq->metadata = metadata; | ||
449 | |||
450 | bio->bi_private = end_io_wq; | ||
451 | bio->bi_end_io = end_workqueue_bio; | ||
452 | return 0; | ||
453 | } | ||
454 | |||
455 | unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) | ||
456 | { | ||
457 | unsigned long limit = min_t(unsigned long, | ||
458 | info->workers.max_workers, | ||
459 | info->fs_devices->open_devices); | ||
460 | return 256 * limit; | ||
461 | } | ||
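A worked example of the throttle formula above, with assumed values: 8 worker threads and 4 open devices give a limit of 256 * min(8, 4) = 1024 in-flight async bios.

#include <stdio.h>

int main(void)
{
	unsigned long max_workers = 8, open_devices = 4;  /* assumed values */
	unsigned long limit = 256 * (max_workers < open_devices ?
				     max_workers : open_devices);

	printf("async submit limit: %lu bios\n", limit);  /* prints 1024 */
	return 0;
}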
462 | |||
463 | int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) | ||
464 | { | ||
465 | return atomic_read(&info->nr_async_bios) > | ||
466 | btrfs_async_submit_limit(info); | ||
467 | } | ||
468 | |||
469 | static void run_one_async_start(struct btrfs_work *work) | ||
470 | { | ||
471 | struct btrfs_fs_info *fs_info; | ||
472 | struct async_submit_bio *async; | ||
473 | |||
474 | async = container_of(work, struct async_submit_bio, work); | ||
475 | fs_info = BTRFS_I(async->inode)->root->fs_info; | ||
476 | async->submit_bio_start(async->inode, async->rw, async->bio, | ||
477 | async->mirror_num, async->bio_flags); | ||
478 | } | ||
479 | |||
480 | static void run_one_async_done(struct btrfs_work *work) | ||
481 | { | ||
482 | struct btrfs_fs_info *fs_info; | ||
483 | struct async_submit_bio *async; | ||
484 | int limit; | ||
485 | |||
486 | async = container_of(work, struct async_submit_bio, work); | ||
487 | fs_info = BTRFS_I(async->inode)->root->fs_info; | ||
488 | |||
489 | limit = btrfs_async_submit_limit(fs_info); | ||
490 | limit = limit * 2 / 3; | ||
491 | |||
492 | atomic_dec(&fs_info->nr_async_submits); | ||
493 | |||
494 | if (atomic_read(&fs_info->nr_async_submits) < limit && | ||
495 | waitqueue_active(&fs_info->async_submit_wait)) | ||
496 | wake_up(&fs_info->async_submit_wait); | ||
497 | |||
498 | async->submit_bio_done(async->inode, async->rw, async->bio, | ||
499 | async->mirror_num, async->bio_flags); | ||
500 | } | ||
501 | |||
502 | static void run_one_async_free(struct btrfs_work *work) | ||
503 | { | ||
504 | struct async_submit_bio *async; | ||
505 | |||
506 | async = container_of(work, struct async_submit_bio, work); | ||
507 | kfree(async); | ||
508 | } | ||
509 | |||
510 | int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, | ||
511 | int rw, struct bio *bio, int mirror_num, | ||
512 | unsigned long bio_flags, | ||
513 | extent_submit_bio_hook_t *submit_bio_start, | ||
514 | extent_submit_bio_hook_t *submit_bio_done) | ||
515 | { | ||
516 | struct async_submit_bio *async; | ||
517 | |||
518 | async = kmalloc(sizeof(*async), GFP_NOFS); | ||
519 | if (!async) | ||
520 | return -ENOMEM; | ||
521 | |||
522 | async->inode = inode; | ||
523 | async->rw = rw; | ||
524 | async->bio = bio; | ||
525 | async->mirror_num = mirror_num; | ||
526 | async->submit_bio_start = submit_bio_start; | ||
527 | async->submit_bio_done = submit_bio_done; | ||
528 | |||
529 | async->work.func = run_one_async_start; | ||
530 | async->work.ordered_func = run_one_async_done; | ||
531 | async->work.ordered_free = run_one_async_free; | ||
532 | |||
533 | async->work.flags = 0; | ||
534 | async->bio_flags = bio_flags; | ||
535 | |||
536 | atomic_inc(&fs_info->nr_async_submits); | ||
537 | btrfs_queue_worker(&fs_info->workers, &async->work); | ||
538 | #if 0 | ||
539 | int limit = btrfs_async_submit_limit(fs_info); | ||
540 | if (atomic_read(&fs_info->nr_async_submits) > limit) { | ||
541 | wait_event_timeout(fs_info->async_submit_wait, | ||
542 | (atomic_read(&fs_info->nr_async_submits) < limit), | ||
543 | HZ/10); | ||
544 | |||
545 | wait_event_timeout(fs_info->async_submit_wait, | ||
546 | (atomic_read(&fs_info->nr_async_bios) < limit), | ||
547 | HZ/10); | ||
548 | } | ||
549 | #endif | ||
550 | while (atomic_read(&fs_info->async_submit_draining) && | ||
551 | atomic_read(&fs_info->nr_async_submits)) { | ||
552 | wait_event(fs_info->async_submit_wait, | ||
553 | (atomic_read(&fs_info->nr_async_submits) == 0)); | ||
554 | } | ||
555 | |||
556 | return 0; | ||
557 | } | ||
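The three work callbacks wired up above implement btrfs' ordered-work pattern: .func may run on any worker thread, while the ordered callbacks run strictly in queueing order. A single-threaded userspace sketch of that contract; the struct and helper names are illustrative, not the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct demo_work {
	void (*func)(struct demo_work *);          /* parallel phase      */
	void (*ordered_func)(struct demo_work *);  /* in-order completion */
	void (*ordered_free)(struct demo_work *);  /* teardown            */
	int id;
};

static void start(struct demo_work *w)   { printf("csum bio %d\n", w->id); }
static void done(struct demo_work *w)    { printf("submit bio %d\n", w->id); }
static void release(struct demo_work *w) { free(w); }

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct demo_work *w = malloc(sizeof(*w));

		w->func = start;
		w->ordered_func = done;
		w->ordered_free = release;
		w->id = i;
		/* a real queue runs func on any worker thread, then runs
		 * the ordered callbacks strictly in queueing order */
		w->func(w);
		w->ordered_func(w);
		w->ordered_free(w);
	}
	return 0;
}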
558 | |||
559 | static int btree_csum_one_bio(struct bio *bio) | ||
560 | { | ||
561 | struct bio_vec *bvec = bio->bi_io_vec; | ||
562 | int bio_index = 0; | ||
563 | struct btrfs_root *root; | ||
564 | |||
565 | WARN_ON(bio->bi_vcnt <= 0); | ||
566 | while (bio_index < bio->bi_vcnt) { | ||
567 | root = BTRFS_I(bvec->bv_page->mapping->host)->root; | ||
568 | csum_dirty_buffer(root, bvec->bv_page); | ||
569 | bio_index++; | ||
570 | bvec++; | ||
571 | } | ||
572 | return 0; | ||
573 | } | ||
574 | |||
575 | static int __btree_submit_bio_start(struct inode *inode, int rw, | ||
576 | struct bio *bio, int mirror_num, | ||
577 | unsigned long bio_flags) | ||
578 | { | ||
579 | /* | ||
580 | * when we're called for a write, we're already in the async | ||
581 | * submission context. Just checksum here; the done hook submits the bio | ||
582 | */ | ||
583 | btree_csum_one_bio(bio); | ||
584 | return 0; | ||
585 | } | ||
586 | |||
587 | static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, | ||
588 | int mirror_num, unsigned long bio_flags) | ||
589 | { | ||
590 | /* | ||
591 | * when we're called for a write, we're already in the async | ||
592 | * submission context. Just jump into btrfs_map_bio | ||
593 | */ | ||
594 | return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); | ||
595 | } | ||
596 | |||
597 | static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | ||
598 | int mirror_num, unsigned long bio_flags) | ||
599 | { | ||
600 | int ret; | ||
601 | |||
602 | ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, | ||
603 | bio, 1); | ||
604 | BUG_ON(ret); | ||
605 | |||
606 | if (!(rw & (1 << BIO_RW))) { | ||
607 | /* | ||
608 | * called for a read, do the setup so that checksum validation | ||
609 | * can happen in the async kernel threads | ||
610 | */ | ||
611 | return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, | ||
612 | mirror_num, 0); | ||
613 | } | ||
614 | /* | ||
615 | * kthread helpers are used to submit writes so that checksumming | ||
616 | * can happen in parallel across all CPUs | ||
617 | */ | ||
618 | return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, | ||
619 | inode, rw, bio, mirror_num, 0, | ||
620 | __btree_submit_bio_start, | ||
621 | __btree_submit_bio_done); | ||
622 | } | ||
623 | |||
624 | static int btree_writepage(struct page *page, struct writeback_control *wbc) | ||
625 | { | ||
626 | struct extent_io_tree *tree; | ||
627 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
628 | |||
629 | if (current->flags & PF_MEMALLOC) { | ||
630 | redirty_page_for_writepage(wbc, page); | ||
631 | unlock_page(page); | ||
632 | return 0; | ||
633 | } | ||
634 | return extent_write_full_page(tree, page, btree_get_extent, wbc); | ||
635 | } | ||
636 | |||
637 | static int btree_writepages(struct address_space *mapping, | ||
638 | struct writeback_control *wbc) | ||
639 | { | ||
640 | struct extent_io_tree *tree; | ||
641 | tree = &BTRFS_I(mapping->host)->io_tree; | ||
642 | if (wbc->sync_mode == WB_SYNC_NONE) { | ||
643 | u64 num_dirty; | ||
644 | u64 start = 0; | ||
645 | unsigned long thresh = 32 * 1024 * 1024; | ||
646 | |||
647 | if (wbc->for_kupdate) | ||
648 | return 0; | ||
649 | |||
650 | num_dirty = count_range_bits(tree, &start, (u64)-1, | ||
651 | thresh, EXTENT_DIRTY); | ||
652 | if (num_dirty < thresh) | ||
653 | return 0; | ||
654 | } | ||
655 | return extent_writepages(tree, mapping, btree_get_extent, wbc); | ||
656 | } | ||
657 | |||
658 | static int btree_readpage(struct file *file, struct page *page) | ||
659 | { | ||
660 | struct extent_io_tree *tree; | ||
661 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
662 | return extent_read_full_page(tree, page, btree_get_extent); | ||
663 | } | ||
664 | |||
665 | static int btree_releasepage(struct page *page, gfp_t gfp_flags) | ||
666 | { | ||
667 | struct extent_io_tree *tree; | ||
668 | struct extent_map_tree *map; | ||
669 | int ret; | ||
670 | |||
671 | if (PageWriteback(page) || PageDirty(page)) | ||
672 | return 0; | ||
673 | |||
674 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
675 | map = &BTRFS_I(page->mapping->host)->extent_tree; | ||
676 | |||
677 | ret = try_release_extent_state(map, tree, page, gfp_flags); | ||
678 | if (!ret) | ||
679 | return 0; | ||
680 | |||
681 | ret = try_release_extent_buffer(tree, page); | ||
682 | if (ret == 1) { | ||
683 | ClearPagePrivate(page); | ||
684 | set_page_private(page, 0); | ||
685 | page_cache_release(page); | ||
686 | } | ||
687 | |||
688 | return ret; | ||
689 | } | ||
690 | |||
691 | static void btree_invalidatepage(struct page *page, unsigned long offset) | ||
692 | { | ||
693 | struct extent_io_tree *tree; | ||
694 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
695 | extent_invalidatepage(tree, page, offset); | ||
696 | btree_releasepage(page, GFP_NOFS); | ||
697 | if (PagePrivate(page)) { | ||
698 | printk(KERN_WARNING "btrfs warning page private not zero " | ||
699 | "on page %llu\n", (unsigned long long)page_offset(page)); | ||
700 | ClearPagePrivate(page); | ||
701 | set_page_private(page, 0); | ||
702 | page_cache_release(page); | ||
703 | } | ||
704 | } | ||
705 | |||
706 | #if 0 | ||
707 | static int btree_writepage(struct page *page, struct writeback_control *wbc) | ||
708 | { | ||
709 | struct buffer_head *bh; | ||
710 | struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; | ||
711 | struct buffer_head *head; | ||
712 | if (!page_has_buffers(page)) { | ||
713 | create_empty_buffers(page, root->fs_info->sb->s_blocksize, | ||
714 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | ||
715 | } | ||
716 | head = page_buffers(page); | ||
717 | bh = head; | ||
718 | do { | ||
719 | if (buffer_dirty(bh)) | ||
720 | csum_tree_block(root, bh, 0); | ||
721 | bh = bh->b_this_page; | ||
722 | } while (bh != head); | ||
723 | return block_write_full_page(page, btree_get_block, wbc); | ||
724 | } | ||
725 | #endif | ||
726 | |||
727 | static struct address_space_operations btree_aops = { | ||
728 | .readpage = btree_readpage, | ||
729 | .writepage = btree_writepage, | ||
730 | .writepages = btree_writepages, | ||
731 | .releasepage = btree_releasepage, | ||
732 | .invalidatepage = btree_invalidatepage, | ||
733 | .sync_page = block_sync_page, | ||
734 | }; | ||
735 | |||
736 | int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, | ||
737 | u64 parent_transid) | ||
738 | { | ||
739 | struct extent_buffer *buf = NULL; | ||
740 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
741 | int ret = 0; | ||
742 | |||
743 | buf = btrfs_find_create_tree_block(root, bytenr, blocksize); | ||
744 | if (!buf) | ||
745 | return 0; | ||
746 | read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, | ||
747 | buf, 0, 0, btree_get_extent, 0); | ||
748 | free_extent_buffer(buf); | ||
749 | return ret; | ||
750 | } | ||
751 | |||
752 | struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, | ||
753 | u64 bytenr, u32 blocksize) | ||
754 | { | ||
755 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
756 | struct extent_buffer *eb; | ||
757 | eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, | ||
758 | bytenr, blocksize, GFP_NOFS); | ||
759 | return eb; | ||
760 | } | ||
761 | |||
762 | struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, | ||
763 | u64 bytenr, u32 blocksize) | ||
764 | { | ||
765 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
766 | struct extent_buffer *eb; | ||
767 | |||
768 | eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, | ||
769 | bytenr, blocksize, NULL, GFP_NOFS); | ||
770 | return eb; | ||
771 | } | ||
772 | |||
773 | |||
774 | int btrfs_write_tree_block(struct extent_buffer *buf) | ||
775 | { | ||
776 | return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, | ||
777 | buf->start + buf->len - 1, WB_SYNC_ALL); | ||
778 | } | ||
779 | |||
780 | int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) | ||
781 | { | ||
782 | return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, | ||
783 | buf->start, buf->start + buf->len - 1); | ||
784 | } | ||
785 | |||
786 | struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, | ||
787 | u32 blocksize, u64 parent_transid) | ||
788 | { | ||
789 | struct extent_buffer *buf = NULL; | ||
790 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
791 | struct extent_io_tree *io_tree; | ||
792 | int ret; | ||
793 | |||
794 | io_tree = &BTRFS_I(btree_inode)->io_tree; | ||
795 | |||
796 | buf = btrfs_find_create_tree_block(root, bytenr, blocksize); | ||
797 | if (!buf) | ||
798 | return NULL; | ||
799 | |||
800 | ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); | ||
801 | |||
802 | if (ret == 0) | ||
803 | buf->flags |= EXTENT_UPTODATE; | ||
804 | else | ||
805 | WARN_ON(1); | ||
806 | return buf; | ||
807 | |||
808 | } | ||
809 | |||
810 | int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
811 | struct extent_buffer *buf) | ||
812 | { | ||
813 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
814 | if (btrfs_header_generation(buf) == | ||
815 | root->fs_info->running_transaction->transid) { | ||
816 | WARN_ON(!btrfs_tree_locked(buf)); | ||
817 | clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, | ||
818 | buf); | ||
819 | } | ||
820 | return 0; | ||
821 | } | ||
822 | |||
823 | static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | ||
824 | u32 stripesize, struct btrfs_root *root, | ||
825 | struct btrfs_fs_info *fs_info, | ||
826 | u64 objectid) | ||
827 | { | ||
828 | root->node = NULL; | ||
829 | root->commit_root = NULL; | ||
830 | root->ref_tree = NULL; | ||
831 | root->sectorsize = sectorsize; | ||
832 | root->nodesize = nodesize; | ||
833 | root->leafsize = leafsize; | ||
834 | root->stripesize = stripesize; | ||
835 | root->ref_cows = 0; | ||
836 | root->track_dirty = 0; | ||
837 | |||
838 | root->fs_info = fs_info; | ||
839 | root->objectid = objectid; | ||
840 | root->last_trans = 0; | ||
841 | root->highest_inode = 0; | ||
842 | root->last_inode_alloc = 0; | ||
843 | root->name = NULL; | ||
844 | root->in_sysfs = 0; | ||
845 | |||
846 | INIT_LIST_HEAD(&root->dirty_list); | ||
847 | INIT_LIST_HEAD(&root->orphan_list); | ||
848 | INIT_LIST_HEAD(&root->dead_list); | ||
849 | spin_lock_init(&root->node_lock); | ||
850 | spin_lock_init(&root->list_lock); | ||
851 | mutex_init(&root->objectid_mutex); | ||
852 | mutex_init(&root->log_mutex); | ||
853 | extent_io_tree_init(&root->dirty_log_pages, | ||
854 | fs_info->btree_inode->i_mapping, GFP_NOFS); | ||
855 | |||
856 | btrfs_leaf_ref_tree_init(&root->ref_tree_struct); | ||
857 | root->ref_tree = &root->ref_tree_struct; | ||
858 | |||
859 | memset(&root->root_key, 0, sizeof(root->root_key)); | ||
860 | memset(&root->root_item, 0, sizeof(root->root_item)); | ||
861 | memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); | ||
862 | memset(&root->root_kobj, 0, sizeof(root->root_kobj)); | ||
863 | root->defrag_trans_start = fs_info->generation; | ||
864 | init_completion(&root->kobj_unregister); | ||
865 | root->defrag_running = 0; | ||
866 | root->defrag_level = 0; | ||
867 | root->root_key.objectid = objectid; | ||
868 | root->anon_super.s_root = NULL; | ||
869 | root->anon_super.s_dev = 0; | ||
870 | INIT_LIST_HEAD(&root->anon_super.s_list); | ||
871 | INIT_LIST_HEAD(&root->anon_super.s_instances); | ||
872 | init_rwsem(&root->anon_super.s_umount); | ||
873 | |||
874 | return 0; | ||
875 | } | ||
876 | |||
877 | static int find_and_setup_root(struct btrfs_root *tree_root, | ||
878 | struct btrfs_fs_info *fs_info, | ||
879 | u64 objectid, | ||
880 | struct btrfs_root *root) | ||
881 | { | ||
882 | int ret; | ||
883 | u32 blocksize; | ||
884 | u64 generation; | ||
885 | |||
886 | __setup_root(tree_root->nodesize, tree_root->leafsize, | ||
887 | tree_root->sectorsize, tree_root->stripesize, | ||
888 | root, fs_info, objectid); | ||
889 | ret = btrfs_find_last_root(tree_root, objectid, | ||
890 | &root->root_item, &root->root_key); | ||
891 | BUG_ON(ret); | ||
892 | |||
893 | generation = btrfs_root_generation(&root->root_item); | ||
894 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); | ||
895 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), | ||
896 | blocksize, generation); | ||
897 | BUG_ON(!root->node); | ||
898 | return 0; | ||
899 | } | ||
900 | |||
901 | int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, | ||
902 | struct btrfs_fs_info *fs_info) | ||
903 | { | ||
904 | struct extent_buffer *eb; | ||
905 | struct btrfs_root *log_root_tree = fs_info->log_root_tree; | ||
906 | u64 start = 0; | ||
907 | u64 end = 0; | ||
908 | int ret; | ||
909 | |||
910 | if (!log_root_tree) | ||
911 | return 0; | ||
912 | |||
913 | while (1) { | ||
914 | ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, | ||
915 | 0, &start, &end, EXTENT_DIRTY); | ||
916 | if (ret) | ||
917 | break; | ||
918 | |||
919 | clear_extent_dirty(&log_root_tree->dirty_log_pages, | ||
920 | start, end, GFP_NOFS); | ||
921 | } | ||
922 | eb = fs_info->log_root_tree->node; | ||
923 | |||
924 | WARN_ON(btrfs_header_level(eb) != 0); | ||
925 | WARN_ON(btrfs_header_nritems(eb) != 0); | ||
926 | |||
927 | ret = btrfs_free_reserved_extent(fs_info->tree_root, | ||
928 | eb->start, eb->len); | ||
929 | BUG_ON(ret); | ||
930 | |||
931 | free_extent_buffer(eb); | ||
932 | kfree(fs_info->log_root_tree); | ||
933 | fs_info->log_root_tree = NULL; | ||
934 | return 0; | ||
935 | } | ||
936 | |||
937 | int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, | ||
938 | struct btrfs_fs_info *fs_info) | ||
939 | { | ||
940 | struct btrfs_root *root; | ||
941 | struct btrfs_root *tree_root = fs_info->tree_root; | ||
942 | |||
943 | root = kzalloc(sizeof(*root), GFP_NOFS); | ||
944 | if (!root) | ||
945 | return -ENOMEM; | ||
946 | |||
947 | __setup_root(tree_root->nodesize, tree_root->leafsize, | ||
948 | tree_root->sectorsize, tree_root->stripesize, | ||
949 | root, fs_info, BTRFS_TREE_LOG_OBJECTID); | ||
950 | |||
951 | root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; | ||
952 | root->root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
953 | root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; | ||
954 | root->ref_cows = 0; | ||
955 | |||
956 | root->node = btrfs_alloc_free_block(trans, root, root->leafsize, | ||
957 | 0, BTRFS_TREE_LOG_OBJECTID, | ||
958 | trans->transid, 0, 0, 0); | ||
959 | |||
960 | btrfs_set_header_nritems(root->node, 0); | ||
961 | btrfs_set_header_level(root->node, 0); | ||
962 | btrfs_set_header_bytenr(root->node, root->node->start); | ||
963 | btrfs_set_header_generation(root->node, trans->transid); | ||
964 | btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); | ||
965 | |||
966 | write_extent_buffer(root->node, root->fs_info->fsid, | ||
967 | (unsigned long)btrfs_header_fsid(root->node), | ||
968 | BTRFS_FSID_SIZE); | ||
969 | btrfs_mark_buffer_dirty(root->node); | ||
970 | btrfs_tree_unlock(root->node); | ||
971 | fs_info->log_root_tree = root; | ||
972 | return 0; | ||
973 | } | ||
974 | |||
975 | struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, | ||
976 | struct btrfs_key *location) | ||
977 | { | ||
978 | struct btrfs_root *root; | ||
979 | struct btrfs_fs_info *fs_info = tree_root->fs_info; | ||
980 | struct btrfs_path *path; | ||
981 | struct extent_buffer *l; | ||
982 | u64 highest_inode; | ||
983 | u64 generation; | ||
984 | u32 blocksize; | ||
985 | int ret = 0; | ||
986 | |||
987 | root = kzalloc(sizeof(*root), GFP_NOFS); | ||
988 | if (!root) | ||
989 | return ERR_PTR(-ENOMEM); | ||
990 | if (location->offset == (u64)-1) { | ||
991 | ret = find_and_setup_root(tree_root, fs_info, | ||
992 | location->objectid, root); | ||
993 | if (ret) { | ||
994 | kfree(root); | ||
995 | return ERR_PTR(ret); | ||
996 | } | ||
997 | goto insert; | ||
998 | } | ||
999 | |||
1000 | __setup_root(tree_root->nodesize, tree_root->leafsize, | ||
1001 | tree_root->sectorsize, tree_root->stripesize, | ||
1002 | root, fs_info, location->objectid); | ||
1003 | |||
1004 | path = btrfs_alloc_path(); | ||
1005 | BUG_ON(!path); | ||
1006 | ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); | ||
1007 | if (ret != 0) { | ||
1008 | if (ret > 0) | ||
1009 | ret = -ENOENT; | ||
1010 | goto out; | ||
1011 | } | ||
1012 | l = path->nodes[0]; | ||
1013 | read_extent_buffer(l, &root->root_item, | ||
1014 | btrfs_item_ptr_offset(l, path->slots[0]), | ||
1015 | sizeof(root->root_item)); | ||
1016 | memcpy(&root->root_key, location, sizeof(*location)); | ||
1017 | ret = 0; | ||
1018 | out: | ||
1019 | btrfs_release_path(root, path); | ||
1020 | btrfs_free_path(path); | ||
1021 | if (ret) { | ||
1022 | kfree(root); | ||
1023 | return ERR_PTR(ret); | ||
1024 | } | ||
1025 | generation = btrfs_root_generation(&root->root_item); | ||
1026 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); | ||
1027 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), | ||
1028 | blocksize, generation); | ||
1029 | BUG_ON(!root->node); | ||
1030 | insert: | ||
1031 | if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { | ||
1032 | root->ref_cows = 1; | ||
1033 | ret = btrfs_find_highest_inode(root, &highest_inode); | ||
1034 | if (ret == 0) { | ||
1035 | root->highest_inode = highest_inode; | ||
1036 | root->last_inode_alloc = highest_inode; | ||
1037 | } | ||
1038 | } | ||
1039 | return root; | ||
1040 | } | ||
1041 | |||
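/*
 * find a cached root by objectid.  The tree and extent roots are
 * special cased; everything else comes out of the radix tree.
 */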
1042 | struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, | ||
1043 | u64 root_objectid) | ||
1044 | { | ||
1045 | struct btrfs_root *root; | ||
1046 | |||
1047 | if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) | ||
1048 | return fs_info->tree_root; | ||
1049 | if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) | ||
1050 | return fs_info->extent_root; | ||
1051 | |||
1052 | root = radix_tree_lookup(&fs_info->fs_roots_radix, | ||
1053 | (unsigned long)root_objectid); | ||
1054 | return root; | ||
1055 | } | ||
1056 | |||
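/*
 * return the root for a given key, reading it from disk and caching it
 * in the radix tree if it isn't already there.  The well known
 * internal trees come straight from fs_info, and on read-write mounts
 * dead roots and orphans belonging to the root are cleaned up.
 */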
1057 | struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, | ||
1058 | struct btrfs_key *location) | ||
1059 | { | ||
1060 | struct btrfs_root *root; | ||
1061 | int ret; | ||
1062 | |||
1063 | if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) | ||
1064 | return fs_info->tree_root; | ||
1065 | if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) | ||
1066 | return fs_info->extent_root; | ||
1067 | if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) | ||
1068 | return fs_info->chunk_root; | ||
1069 | if (location->objectid == BTRFS_DEV_TREE_OBJECTID) | ||
1070 | return fs_info->dev_root; | ||
1071 | if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) | ||
1072 | return fs_info->csum_root; | ||
1073 | |||
1074 | root = radix_tree_lookup(&fs_info->fs_roots_radix, | ||
1075 | (unsigned long)location->objectid); | ||
1076 | if (root) | ||
1077 | return root; | ||
1078 | |||
1079 | root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); | ||
1080 | if (IS_ERR(root)) | ||
1081 | return root; | ||
1082 | |||
1083 | set_anon_super(&root->anon_super, NULL); | ||
1084 | |||
1085 | ret = radix_tree_insert(&fs_info->fs_roots_radix, | ||
1086 | (unsigned long)root->root_key.objectid, | ||
1087 | root); | ||
1088 | if (ret) { | ||
1089 | free_extent_buffer(root->node); | ||
1090 | kfree(root); | ||
1091 | return ERR_PTR(ret); | ||
1092 | } | ||
1093 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { | ||
1094 | ret = btrfs_find_dead_roots(fs_info->tree_root, | ||
1095 | root->root_key.objectid, root); | ||
1096 | BUG_ON(ret); | ||
1097 | btrfs_orphan_cleanup(root); | ||
1098 | } | ||
1099 | return root; | ||
1100 | } | ||
1101 | |||
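/*
 * like btrfs_read_fs_root_no_name, but also records a name on the
 * root for sysfs registration (the registration itself is currently
 * compiled out below).
 */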
1102 | struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, | ||
1103 | struct btrfs_key *location, | ||
1104 | const char *name, int namelen) | ||
1105 | { | ||
1106 | struct btrfs_root *root; | ||
1107 | int ret; | ||
1108 | |||
1109 | root = btrfs_read_fs_root_no_name(fs_info, location); | ||
1110 | if (!root) | ||
1111 | return NULL; | ||
1112 | |||
1113 | if (root->in_sysfs) | ||
1114 | return root; | ||
1115 | |||
1116 | ret = btrfs_set_root_name(root, name, namelen); | ||
1117 | if (ret) { | ||
1118 | free_extent_buffer(root->node); | ||
1119 | kfree(root); | ||
1120 | return ERR_PTR(ret); | ||
1121 | } | ||
1122 | #if 0 | ||
1123 | ret = btrfs_sysfs_add_root(root); | ||
1124 | if (ret) { | ||
1125 | free_extent_buffer(root->node); | ||
1126 | kfree(root->name); | ||
1127 | kfree(root); | ||
1128 | return ERR_PTR(ret); | ||
1129 | } | ||
1130 | #endif | ||
1131 | root->in_sysfs = 1; | ||
1132 | return root; | ||
1133 | } | ||
1134 | |||
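/*
 * backing device congestion callback: report congestion when any
 * device backing this filesystem is congested.
 */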
1135 | static int btrfs_congested_fn(void *congested_data, int bdi_bits) | ||
1136 | { | ||
1137 | struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; | ||
1138 | int ret = 0; | ||
1139 | struct list_head *cur; | ||
1140 | struct btrfs_device *device; | ||
1141 | struct backing_dev_info *bdi; | ||
1142 | #if 0 | ||
1143 | if ((bdi_bits & (1 << BDI_write_congested)) && | ||
1144 | btrfs_congested_async(info, 0)) | ||
1145 | return 1; | ||
1146 | #endif | ||
1147 | list_for_each(cur, &info->fs_devices->devices) { | ||
1148 | device = list_entry(cur, struct btrfs_device, dev_list); | ||
1149 | if (!device->bdev) | ||
1150 | continue; | ||
1151 | bdi = blk_get_backing_dev_info(device->bdev); | ||
1152 | if (bdi && bdi_congested(bdi, bdi_bits)) { | ||
1153 | ret = 1; | ||
1154 | break; | ||
1155 | } | ||
1156 | } | ||
1157 | return ret; | ||
1158 | } | ||
1159 | |||
1160 | /* | ||
1161 | * this unplugs every device on the box, and it is only used when the | ||
1162 | * page argument is null | ||
1163 | */ | ||
1164 | static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | ||
1165 | { | ||
1166 | struct list_head *cur; | ||
1167 | struct btrfs_device *device; | ||
1168 | struct btrfs_fs_info *info; | ||
1169 | |||
1170 | info = (struct btrfs_fs_info *)bdi->unplug_io_data; | ||
1171 | list_for_each(cur, &info->fs_devices->devices) { | ||
1172 | device = list_entry(cur, struct btrfs_device, dev_list); | ||
1173 | if (!device->bdev) | ||
1174 | continue; | ||
1175 | |||
1176 | bdi = blk_get_backing_dev_info(device->bdev); | ||
1177 | if (bdi->unplug_io_fn) | ||
1178 | bdi->unplug_io_fn(bdi, page); | ||
1179 | } | ||
1180 | } | ||
1181 | |||
1182 | static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | ||
1183 | { | ||
1184 | struct inode *inode; | ||
1185 | struct extent_map_tree *em_tree; | ||
1186 | struct extent_map *em; | ||
1187 | struct address_space *mapping; | ||
1188 | u64 offset; | ||
1189 | |||
1190 | /* the generic O_DIRECT read code calls in with a null page */ | ||
1191 | if (1 || !page) { | ||
1192 | __unplug_io_fn(bdi, page); | ||
1193 | return; | ||
1194 | } | ||
1195 | |||
1196 | /* | ||
1197 | * page->mapping may change at any time. Get a consistent copy | ||
1198 | * and use that for everything below | ||
1199 | */ | ||
1200 | smp_mb(); | ||
1201 | mapping = page->mapping; | ||
1202 | if (!mapping) | ||
1203 | return; | ||
1204 | |||
1205 | inode = mapping->host; | ||
1206 | |||
1207 | /* | ||
1208 | * don't do the expensive searching for a small number of | ||
1209 | * devices | ||
1210 | */ | ||
1211 | if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { | ||
1212 | __unplug_io_fn(bdi, page); | ||
1213 | return; | ||
1214 | } | ||
1215 | |||
1216 | offset = page_offset(page); | ||
1217 | |||
1218 | em_tree = &BTRFS_I(inode)->extent_tree; | ||
1219 | spin_lock(&em_tree->lock); | ||
1220 | em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); | ||
1221 | spin_unlock(&em_tree->lock); | ||
1222 | if (!em) { | ||
1223 | __unplug_io_fn(bdi, page); | ||
1224 | return; | ||
1225 | } | ||
1226 | |||
1227 | if (em->block_start >= EXTENT_MAP_LAST_BYTE) { | ||
1228 | free_extent_map(em); | ||
1229 | __unplug_io_fn(bdi, page); | ||
1230 | return; | ||
1231 | } | ||
1232 | offset = offset - em->start; | ||
1233 | btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, | ||
1234 | em->block_start + offset, page); | ||
1235 | free_extent_map(em); | ||
1236 | } | ||
1237 | |||
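/*
 * set up the per-filesystem backing_dev_info, wiring in the btrfs
 * congestion and unplug callbacks defined above.
 */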
1238 | static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) | ||
1239 | { | ||
1240 | bdi_init(bdi); | ||
1241 | bdi->ra_pages = default_backing_dev_info.ra_pages; | ||
1242 | bdi->state = 0; | ||
1243 | bdi->capabilities = default_backing_dev_info.capabilities; | ||
1244 | bdi->unplug_io_fn = btrfs_unplug_io_fn; | ||
1245 | bdi->unplug_io_data = info; | ||
1246 | bdi->congested_fn = btrfs_congested_fn; | ||
1247 | bdi->congested_data = info; | ||
1248 | return 0; | ||
1249 | } | ||
1250 | |||
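/*
 * decide whether a completed bio covers the whole tree block it
 * belongs to.  Returns 1 when the block can be checksummed now,
 * either because the bio contains it entirely or because the rest of
 * the range is already uptodate.
 */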
1251 | static int bio_ready_for_csum(struct bio *bio) | ||
1252 | { | ||
1253 | u64 length = 0; | ||
1254 | u64 buf_len = 0; | ||
1255 | u64 start = 0; | ||
1256 | struct page *page; | ||
1257 | struct extent_io_tree *io_tree = NULL; | ||
1258 | struct btrfs_fs_info *info = NULL; | ||
1259 | struct bio_vec *bvec; | ||
1260 | int i; | ||
1261 | int ret; | ||
1262 | |||
1263 | bio_for_each_segment(bvec, bio, i) { | ||
1264 | page = bvec->bv_page; | ||
1265 | if (page->private == EXTENT_PAGE_PRIVATE) { | ||
1266 | length += bvec->bv_len; | ||
1267 | continue; | ||
1268 | } | ||
1269 | if (!page->private) { | ||
1270 | length += bvec->bv_len; | ||
1271 | continue; | ||
1272 | } | ||
1273 | length = bvec->bv_len; | ||
1274 | buf_len = page->private >> 2; | ||
1275 | start = page_offset(page) + bvec->bv_offset; | ||
1276 | io_tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
1277 | info = BTRFS_I(page->mapping->host)->root->fs_info; | ||
1278 | } | ||
1279 | /* are we fully contained in this bio? */ | ||
1280 | if (buf_len <= length) | ||
1281 | return 1; | ||
1282 | |||
1283 | ret = extent_range_uptodate(io_tree, start + length, | ||
1284 | start + buf_len - 1); | ||
1285 | return ret; | ||
1288 | } | ||
1289 | |||
1290 | /* | ||
1291 | * called by the kthread helper functions to finally call the bio end_io | ||
1292 | * functions. This is where read checksum verification actually happens | ||
1293 | */ | ||
1294 | static void end_workqueue_fn(struct btrfs_work *work) | ||
1295 | { | ||
1296 | struct bio *bio; | ||
1297 | struct end_io_wq *end_io_wq; | ||
1298 | struct btrfs_fs_info *fs_info; | ||
1299 | int error; | ||
1300 | |||
1301 | end_io_wq = container_of(work, struct end_io_wq, work); | ||
1302 | bio = end_io_wq->bio; | ||
1303 | fs_info = end_io_wq->info; | ||
1304 | |||
1305 | /* metadata bio reads are special because the whole tree block must | ||
1306 | * be checksummed at once. This makes sure the entire block is in | ||
1307 | * ram and up to date before trying to verify things. For | ||
1308 | * blocksize <= pagesize, it is basically a noop | ||
1309 | */ | ||
1310 | if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && | ||
1311 | !bio_ready_for_csum(bio)) { | ||
1312 | btrfs_queue_worker(&fs_info->endio_meta_workers, | ||
1313 | &end_io_wq->work); | ||
1314 | return; | ||
1315 | } | ||
1316 | error = end_io_wq->error; | ||
1317 | bio->bi_private = end_io_wq->private; | ||
1318 | bio->bi_end_io = end_io_wq->end_io; | ||
1319 | kfree(end_io_wq); | ||
1320 | bio_endio(bio, error); | ||
1321 | } | ||
1322 | |||
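/*
 * background thread that finishes removing deleted snapshots.  It runs
 * under cleaner_mutex and sleeps until woken by the transaction
 * kthread or by unmount.
 */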
1323 | static int cleaner_kthread(void *arg) | ||
1324 | { | ||
1325 | struct btrfs_root *root = arg; | ||
1326 | |||
1327 | do { | ||
1328 | smp_mb(); | ||
1329 | if (root->fs_info->closing) | ||
1330 | break; | ||
1331 | |||
1332 | vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); | ||
1333 | mutex_lock(&root->fs_info->cleaner_mutex); | ||
1334 | btrfs_clean_old_snapshots(root); | ||
1335 | mutex_unlock(&root->fs_info->cleaner_mutex); | ||
1336 | |||
1337 | if (freezing(current)) { | ||
1338 | refrigerator(); | ||
1339 | } else { | ||
1340 | smp_mb(); | ||
1341 | if (root->fs_info->closing) | ||
1342 | break; | ||
1343 | set_current_state(TASK_INTERRUPTIBLE); | ||
1344 | schedule(); | ||
1345 | __set_current_state(TASK_RUNNING); | ||
1346 | } | ||
1347 | } while (!kthread_should_stop()); | ||
1348 | return 0; | ||
1349 | } | ||
1350 | |||
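/*
 * background thread that commits the running transaction once it is
 * at least 30 seconds old and then kicks the cleaner.  While a younger
 * transaction is open it rechecks every five seconds.
 */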
1351 | static int transaction_kthread(void *arg) | ||
1352 | { | ||
1353 | struct btrfs_root *root = arg; | ||
1354 | struct btrfs_trans_handle *trans; | ||
1355 | struct btrfs_transaction *cur; | ||
1356 | unsigned long now; | ||
1357 | unsigned long delay; | ||
1358 | int ret; | ||
1359 | |||
1360 | do { | ||
1361 | smp_mb(); | ||
1362 | if (root->fs_info->closing) | ||
1363 | break; | ||
1364 | |||
1365 | delay = HZ * 30; | ||
1366 | vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); | ||
1367 | mutex_lock(&root->fs_info->transaction_kthread_mutex); | ||
1368 | |||
1369 | if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { | ||
1370 | printk(KERN_INFO "btrfs: total reference cache " | ||
1371 | "size %llu\n", | ||
1372 | root->fs_info->total_ref_cache_size); | ||
1373 | } | ||
1374 | |||
1375 | mutex_lock(&root->fs_info->trans_mutex); | ||
1376 | cur = root->fs_info->running_transaction; | ||
1377 | if (!cur) { | ||
1378 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1379 | goto sleep; | ||
1380 | } | ||
1381 | |||
1382 | now = get_seconds(); | ||
1383 | if (now < cur->start_time || now - cur->start_time < 30) { | ||
1384 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1385 | delay = HZ * 5; | ||
1386 | goto sleep; | ||
1387 | } | ||
1388 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1389 | trans = btrfs_start_transaction(root, 1); | ||
1390 | ret = btrfs_commit_transaction(trans, root); | ||
1391 | sleep: | ||
1392 | wake_up_process(root->fs_info->cleaner_kthread); | ||
1393 | mutex_unlock(&root->fs_info->transaction_kthread_mutex); | ||
1394 | |||
1395 | if (freezing(current)) { | ||
1396 | refrigerator(); | ||
1397 | } else { | ||
1398 | if (root->fs_info->closing) | ||
1399 | break; | ||
1400 | set_current_state(TASK_INTERRUPTIBLE); | ||
1401 | schedule_timeout(delay); | ||
1402 | __set_current_state(TASK_RUNNING); | ||
1403 | } | ||
1404 | } while (!kthread_should_stop()); | ||
1405 | return 0; | ||
1406 | } | ||
1407 | |||
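/*
 * read the super block and bring up the whole tree of trees: chunk,
 * root, extent, dev and csum trees, the worker pools, the background
 * kthreads and, when a log root is present, log replay.  Returns the
 * tree root or an ERR_PTR on failure.
 */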
1408 | struct btrfs_root *open_ctree(struct super_block *sb, | ||
1409 | struct btrfs_fs_devices *fs_devices, | ||
1410 | char *options) | ||
1411 | { | ||
1412 | u32 sectorsize; | ||
1413 | u32 nodesize; | ||
1414 | u32 leafsize; | ||
1415 | u32 blocksize; | ||
1416 | u32 stripesize; | ||
1417 | u64 generation; | ||
1418 | u64 features; | ||
1419 | struct btrfs_key location; | ||
1420 | struct buffer_head *bh; | ||
1421 | struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), | ||
1422 | GFP_NOFS); | ||
1423 | struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), | ||
1424 | GFP_NOFS); | ||
1425 | struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), | ||
1426 | GFP_NOFS); | ||
1427 | struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), | ||
1428 | GFP_NOFS); | ||
1429 | struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), | ||
1430 | GFP_NOFS); | ||
1431 | struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), | ||
1432 | GFP_NOFS); | ||
1433 | struct btrfs_root *log_tree_root; | ||
1434 | |||
1435 | int ret; | ||
1436 | int err = -EINVAL; | ||
1437 | |||
1438 | struct btrfs_super_block *disk_super; | ||
1439 | |||
1440 | if (!extent_root || !tree_root || !fs_info || | ||
1441 | !chunk_root || !dev_root || !csum_root) { | ||
1442 | err = -ENOMEM; | ||
1443 | goto fail; | ||
1444 | } | ||
1445 | INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); | ||
1446 | INIT_LIST_HEAD(&fs_info->trans_list); | ||
1447 | INIT_LIST_HEAD(&fs_info->dead_roots); | ||
1448 | INIT_LIST_HEAD(&fs_info->hashers); | ||
1449 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); | ||
1450 | spin_lock_init(&fs_info->hash_lock); | ||
1451 | spin_lock_init(&fs_info->delalloc_lock); | ||
1452 | spin_lock_init(&fs_info->new_trans_lock); | ||
1453 | spin_lock_init(&fs_info->ref_cache_lock); | ||
1454 | |||
1455 | init_completion(&fs_info->kobj_unregister); | ||
1456 | fs_info->tree_root = tree_root; | ||
1457 | fs_info->extent_root = extent_root; | ||
1458 | fs_info->csum_root = csum_root; | ||
1459 | fs_info->chunk_root = chunk_root; | ||
1460 | fs_info->dev_root = dev_root; | ||
1461 | fs_info->fs_devices = fs_devices; | ||
1462 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); | ||
1463 | INIT_LIST_HEAD(&fs_info->space_info); | ||
1464 | btrfs_mapping_init(&fs_info->mapping_tree); | ||
1465 | atomic_set(&fs_info->nr_async_submits, 0); | ||
1466 | atomic_set(&fs_info->async_delalloc_pages, 0); | ||
1467 | atomic_set(&fs_info->async_submit_draining, 0); | ||
1468 | atomic_set(&fs_info->nr_async_bios, 0); | ||
1469 | atomic_set(&fs_info->throttles, 0); | ||
1470 | atomic_set(&fs_info->throttle_gen, 0); | ||
1471 | fs_info->sb = sb; | ||
1472 | fs_info->max_extent = (u64)-1; | ||
1473 | fs_info->max_inline = 8192 * 1024; | ||
1474 | setup_bdi(fs_info, &fs_info->bdi); | ||
1475 | fs_info->btree_inode = new_inode(sb); | ||
1476 | fs_info->btree_inode->i_ino = 1; | ||
1477 | fs_info->btree_inode->i_nlink = 1; | ||
1478 | |||
1479 | fs_info->thread_pool_size = min_t(unsigned long, | ||
1480 | num_online_cpus() + 2, 8); | ||
1481 | |||
1482 | INIT_LIST_HEAD(&fs_info->ordered_extents); | ||
1483 | spin_lock_init(&fs_info->ordered_extent_lock); | ||
1484 | |||
1485 | sb->s_blocksize = 4096; | ||
1486 | sb->s_blocksize_bits = blksize_bits(4096); | ||
1487 | |||
1488 | /* | ||
1489 | * we set the i_size on the btree inode to the max possible int. | ||
1490 | * the real end of the address space is determined by all of | ||
1491 | * the devices in the system | ||
1492 | */ | ||
1493 | fs_info->btree_inode->i_size = OFFSET_MAX; | ||
1494 | fs_info->btree_inode->i_mapping->a_ops = &btree_aops; | ||
1495 | fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; | ||
1496 | |||
1497 | extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, | ||
1498 | fs_info->btree_inode->i_mapping, | ||
1499 | GFP_NOFS); | ||
1500 | extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, | ||
1501 | GFP_NOFS); | ||
1502 | |||
1503 | BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; | ||
1504 | |||
1505 | spin_lock_init(&fs_info->block_group_cache_lock); | ||
1506 | fs_info->block_group_cache_tree.rb_node = NULL; | ||
1507 | |||
1508 | extent_io_tree_init(&fs_info->pinned_extents, | ||
1509 | fs_info->btree_inode->i_mapping, GFP_NOFS); | ||
1510 | extent_io_tree_init(&fs_info->pending_del, | ||
1511 | fs_info->btree_inode->i_mapping, GFP_NOFS); | ||
1512 | extent_io_tree_init(&fs_info->extent_ins, | ||
1513 | fs_info->btree_inode->i_mapping, GFP_NOFS); | ||
1514 | fs_info->do_barriers = 1; | ||
1515 | |||
1516 | INIT_LIST_HEAD(&fs_info->dead_reloc_roots); | ||
1517 | btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); | ||
1518 | btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); | ||
1519 | |||
1520 | BTRFS_I(fs_info->btree_inode)->root = tree_root; | ||
1521 | memset(&BTRFS_I(fs_info->btree_inode)->location, 0, | ||
1522 | sizeof(struct btrfs_key)); | ||
1523 | insert_inode_hash(fs_info->btree_inode); | ||
1524 | |||
1525 | mutex_init(&fs_info->trans_mutex); | ||
1526 | mutex_init(&fs_info->tree_log_mutex); | ||
1527 | mutex_init(&fs_info->drop_mutex); | ||
1528 | mutex_init(&fs_info->extent_ins_mutex); | ||
1529 | mutex_init(&fs_info->pinned_mutex); | ||
1530 | mutex_init(&fs_info->chunk_mutex); | ||
1531 | mutex_init(&fs_info->transaction_kthread_mutex); | ||
1532 | mutex_init(&fs_info->cleaner_mutex); | ||
1533 | mutex_init(&fs_info->volume_mutex); | ||
1534 | mutex_init(&fs_info->tree_reloc_mutex); | ||
1535 | init_waitqueue_head(&fs_info->transaction_throttle); | ||
1536 | init_waitqueue_head(&fs_info->transaction_wait); | ||
1537 | init_waitqueue_head(&fs_info->async_submit_wait); | ||
1538 | init_waitqueue_head(&fs_info->tree_log_wait); | ||
1539 | atomic_set(&fs_info->tree_log_commit, 0); | ||
1540 | atomic_set(&fs_info->tree_log_writers, 0); | ||
1541 | fs_info->tree_log_transid = 0; | ||
1542 | |||
1543 | __setup_root(4096, 4096, 4096, 4096, tree_root, | ||
1544 | fs_info, BTRFS_ROOT_TREE_OBJECTID); | ||
1545 | |||
1546 | |||
1547 | bh = btrfs_read_dev_super(fs_devices->latest_bdev); | ||
1548 | if (!bh) | ||
1549 | goto fail_iput; | ||
1550 | |||
1551 | memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); | ||
1552 | memcpy(&fs_info->super_for_commit, &fs_info->super_copy, | ||
1553 | sizeof(fs_info->super_for_commit)); | ||
1554 | brelse(bh); | ||
1555 | |||
1556 | memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); | ||
1557 | |||
1558 | disk_super = &fs_info->super_copy; | ||
1559 | if (!btrfs_super_root(disk_super)) | ||
1560 | goto fail_iput; | ||
1561 | |||
1562 | ret = btrfs_parse_options(tree_root, options); | ||
1563 | if (ret) { | ||
1564 | err = ret; | ||
1565 | goto fail_iput; | ||
1566 | } | ||
1567 | |||
1568 | features = btrfs_super_incompat_flags(disk_super) & | ||
1569 | ~BTRFS_FEATURE_INCOMPAT_SUPP; | ||
1570 | if (features) { | ||
1571 | printk(KERN_ERR "BTRFS: couldn't mount because of " | ||
1572 | "unsupported optional features (%Lx).\n", | ||
1573 | features); | ||
1574 | err = -EINVAL; | ||
1575 | goto fail_iput; | ||
1576 | } | ||
1577 | |||
1578 | features = btrfs_super_compat_ro_flags(disk_super) & | ||
1579 | ~BTRFS_FEATURE_COMPAT_RO_SUPP; | ||
1580 | if (!(sb->s_flags & MS_RDONLY) && features) { | ||
1581 | printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " | ||
1582 | "unsupported optional features (%Lx).\n", | ||
1583 | features); | ||
1584 | err = -EINVAL; | ||
1585 | goto fail_iput; | ||
1586 | } | ||
1587 | |||
1588 | /* | ||
1589 | * we need to start all the end_io workers up front because the | ||
1590 | * queue work function gets called at interrupt time, and so it | ||
1591 | * cannot dynamically grow. | ||
1592 | */ | ||
1593 | btrfs_init_workers(&fs_info->workers, "worker", | ||
1594 | fs_info->thread_pool_size); | ||
1595 | |||
1596 | btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", | ||
1597 | fs_info->thread_pool_size); | ||
1598 | |||
1599 | btrfs_init_workers(&fs_info->submit_workers, "submit", | ||
1600 | min_t(u64, fs_devices->num_devices, | ||
1601 | fs_info->thread_pool_size)); | ||
1602 | |||
1603 | /* a higher idle thresh on the submit workers makes it much more | ||
1604 | * likely that bios will be sent down in a sane order to the | ||
1605 | * devices | ||
1606 | */ | ||
1607 | fs_info->submit_workers.idle_thresh = 64; | ||
1608 | |||
1609 | fs_info->workers.idle_thresh = 16; | ||
1610 | fs_info->workers.ordered = 1; | ||
1611 | |||
1612 | fs_info->delalloc_workers.idle_thresh = 2; | ||
1613 | fs_info->delalloc_workers.ordered = 1; | ||
1614 | |||
1615 | btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); | ||
1616 | btrfs_init_workers(&fs_info->endio_workers, "endio", | ||
1617 | fs_info->thread_pool_size); | ||
1618 | btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", | ||
1619 | fs_info->thread_pool_size); | ||
1620 | btrfs_init_workers(&fs_info->endio_meta_write_workers, | ||
1621 | "endio-meta-write", fs_info->thread_pool_size); | ||
1622 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", | ||
1623 | fs_info->thread_pool_size); | ||
1624 | |||
1625 | /* | ||
1626 | * endios are largely parallel and should have a very | ||
1627 | * low idle thresh | ||
1628 | */ | ||
1629 | fs_info->endio_workers.idle_thresh = 4; | ||
1630 | fs_info->endio_write_workers.idle_thresh = 64; | ||
1631 | fs_info->endio_meta_write_workers.idle_thresh = 64; | ||
1632 | |||
1633 | btrfs_start_workers(&fs_info->workers, 1); | ||
1634 | btrfs_start_workers(&fs_info->submit_workers, 1); | ||
1635 | btrfs_start_workers(&fs_info->delalloc_workers, 1); | ||
1636 | btrfs_start_workers(&fs_info->fixup_workers, 1); | ||
1637 | btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); | ||
1638 | btrfs_start_workers(&fs_info->endio_meta_workers, | ||
1639 | fs_info->thread_pool_size); | ||
1640 | btrfs_start_workers(&fs_info->endio_meta_write_workers, | ||
1641 | fs_info->thread_pool_size); | ||
1642 | btrfs_start_workers(&fs_info->endio_write_workers, | ||
1643 | fs_info->thread_pool_size); | ||
1644 | |||
1645 | fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); | ||
1646 | fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, | ||
1647 | 4 * 1024 * 1024 / PAGE_CACHE_SIZE); | ||
1648 | |||
1649 | nodesize = btrfs_super_nodesize(disk_super); | ||
1650 | leafsize = btrfs_super_leafsize(disk_super); | ||
1651 | sectorsize = btrfs_super_sectorsize(disk_super); | ||
1652 | stripesize = btrfs_super_stripesize(disk_super); | ||
1653 | tree_root->nodesize = nodesize; | ||
1654 | tree_root->leafsize = leafsize; | ||
1655 | tree_root->sectorsize = sectorsize; | ||
1656 | tree_root->stripesize = stripesize; | ||
1657 | |||
1658 | sb->s_blocksize = sectorsize; | ||
1659 | sb->s_blocksize_bits = blksize_bits(sectorsize); | ||
1660 | |||
1661 | if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, | ||
1662 | sizeof(disk_super->magic))) { | ||
1663 | printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); | ||
1664 | goto fail_sb_buffer; | ||
1665 | } | ||
1666 | |||
1667 | mutex_lock(&fs_info->chunk_mutex); | ||
1668 | ret = btrfs_read_sys_array(tree_root); | ||
1669 | mutex_unlock(&fs_info->chunk_mutex); | ||
1670 | if (ret) { | ||
1671 | printk(KERN_WARNING "btrfs: failed to read the system " | ||
1672 | "array on %s\n", sb->s_id); | ||
1673 | goto fail_sys_array; | ||
1674 | } | ||
1675 | |||
1676 | blocksize = btrfs_level_size(tree_root, | ||
1677 | btrfs_super_chunk_root_level(disk_super)); | ||
1678 | generation = btrfs_super_chunk_root_generation(disk_super); | ||
1679 | |||
1680 | __setup_root(nodesize, leafsize, sectorsize, stripesize, | ||
1681 | chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); | ||
1682 | |||
1683 | chunk_root->node = read_tree_block(chunk_root, | ||
1684 | btrfs_super_chunk_root(disk_super), | ||
1685 | blocksize, generation); | ||
1686 | BUG_ON(!chunk_root->node); | ||
1687 | |||
1688 | read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, | ||
1689 | (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), | ||
1690 | BTRFS_UUID_SIZE); | ||
1691 | |||
1692 | mutex_lock(&fs_info->chunk_mutex); | ||
1693 | ret = btrfs_read_chunk_tree(chunk_root); | ||
1694 | mutex_unlock(&fs_info->chunk_mutex); | ||
1695 | if (ret) { | ||
1696 | printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", | ||
1697 | sb->s_id); | ||
1698 | goto fail_chunk_root; | ||
1699 | } | ||
1700 | |||
1701 | btrfs_close_extra_devices(fs_devices); | ||
1702 | |||
1703 | blocksize = btrfs_level_size(tree_root, | ||
1704 | btrfs_super_root_level(disk_super)); | ||
1705 | generation = btrfs_super_generation(disk_super); | ||
1706 | |||
1707 | tree_root->node = read_tree_block(tree_root, | ||
1708 | btrfs_super_root(disk_super), | ||
1709 | blocksize, generation); | ||
1710 | if (!tree_root->node) | ||
1711 | goto fail_chunk_root; | ||
1712 | |||
1713 | |||
1714 | ret = find_and_setup_root(tree_root, fs_info, | ||
1715 | BTRFS_EXTENT_TREE_OBJECTID, extent_root); | ||
1716 | if (ret) | ||
1717 | goto fail_tree_root; | ||
1718 | extent_root->track_dirty = 1; | ||
1719 | |||
1720 | ret = find_and_setup_root(tree_root, fs_info, | ||
1721 | BTRFS_DEV_TREE_OBJECTID, dev_root); | ||
1722 | if (ret) | ||
1723 | goto fail_extent_root; | ||
1724 | |||
1725 | dev_root->track_dirty = 1; | ||
1726 | |||
1727 | ret = find_and_setup_root(tree_root, fs_info, | ||
1728 | BTRFS_CSUM_TREE_OBJECTID, csum_root); | ||
1729 | if (ret) | ||
1730 | goto fail_extent_root; | ||
1731 | |||
1732 | csum_root->track_dirty = 1; | ||
1733 | |||
1734 | btrfs_read_block_groups(extent_root); | ||
1735 | |||
1736 | fs_info->generation = generation; | ||
1737 | fs_info->last_trans_committed = generation; | ||
1738 | fs_info->data_alloc_profile = (u64)-1; | ||
1739 | fs_info->metadata_alloc_profile = (u64)-1; | ||
1740 | fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; | ||
1741 | fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, | ||
1742 | "btrfs-cleaner"); | ||
1743 | if (!fs_info->cleaner_kthread) | ||
1744 | goto fail_csum_root; | ||
1745 | |||
1746 | fs_info->transaction_kthread = kthread_run(transaction_kthread, | ||
1747 | tree_root, | ||
1748 | "btrfs-transaction"); | ||
1749 | if (!fs_info->transaction_kthread) | ||
1750 | goto fail_cleaner; | ||
1751 | |||
1752 | if (btrfs_super_log_root(disk_super) != 0) { | ||
1753 | u64 bytenr = btrfs_super_log_root(disk_super); | ||
1754 | |||
1755 | if (fs_devices->rw_devices == 0) { | ||
1756 | printk(KERN_WARNING "Btrfs log replay required " | ||
1757 | "on RO media\n"); | ||
1758 | err = -EIO; | ||
1759 | goto fail_trans_kthread; | ||
1760 | } | ||
1761 | blocksize = | ||
1762 | btrfs_level_size(tree_root, | ||
1763 | btrfs_super_log_root_level(disk_super)); | ||
1764 | |||
1765 | log_tree_root = kzalloc(sizeof(struct btrfs_root), | ||
1766 | GFP_NOFS); | ||
1767 | |||
1768 | __setup_root(nodesize, leafsize, sectorsize, stripesize, | ||
1769 | log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); | ||
1770 | |||
1771 | log_tree_root->node = read_tree_block(tree_root, bytenr, | ||
1772 | blocksize, | ||
1773 | generation + 1); | ||
1774 | ret = btrfs_recover_log_trees(log_tree_root); | ||
1775 | BUG_ON(ret); | ||
1776 | |||
1777 | if (sb->s_flags & MS_RDONLY) { | ||
1778 | ret = btrfs_commit_super(tree_root); | ||
1779 | BUG_ON(ret); | ||
1780 | } | ||
1781 | } | ||
1782 | |||
1783 | if (!(sb->s_flags & MS_RDONLY)) { | ||
1784 | ret = btrfs_cleanup_reloc_trees(tree_root); | ||
1785 | BUG_ON(ret); | ||
1786 | } | ||
1787 | |||
1788 | location.objectid = BTRFS_FS_TREE_OBJECTID; | ||
1789 | location.type = BTRFS_ROOT_ITEM_KEY; | ||
1790 | location.offset = (u64)-1; | ||
1791 | |||
1792 | fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); | ||
1793 | if (!fs_info->fs_root) | ||
1794 | goto fail_trans_kthread; | ||
1795 | return tree_root; | ||
1796 | |||
1797 | fail_trans_kthread: | ||
1798 | kthread_stop(fs_info->transaction_kthread); | ||
1799 | fail_cleaner: | ||
1800 | kthread_stop(fs_info->cleaner_kthread); | ||
1801 | |||
1802 | /* | ||
1803 | * make sure we're done with the btree inode before we stop our | ||
1804 | * kthreads | ||
1805 | */ | ||
1806 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); | ||
1807 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
1808 | |||
1809 | fail_csum_root: | ||
1810 | free_extent_buffer(csum_root->node); | ||
1811 | fail_extent_root: | ||
1812 | free_extent_buffer(extent_root->node); | ||
1813 | fail_tree_root: | ||
1814 | free_extent_buffer(tree_root->node); | ||
1815 | fail_chunk_root: | ||
1816 | free_extent_buffer(chunk_root->node); | ||
1817 | fail_sys_array: | ||
1818 | free_extent_buffer(dev_root->node); | ||
1819 | fail_sb_buffer: | ||
1820 | btrfs_stop_workers(&fs_info->fixup_workers); | ||
1821 | btrfs_stop_workers(&fs_info->delalloc_workers); | ||
1822 | btrfs_stop_workers(&fs_info->workers); | ||
1823 | btrfs_stop_workers(&fs_info->endio_workers); | ||
1824 | btrfs_stop_workers(&fs_info->endio_meta_workers); | ||
1825 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | ||
1826 | btrfs_stop_workers(&fs_info->endio_write_workers); | ||
1827 | btrfs_stop_workers(&fs_info->submit_workers); | ||
1828 | fail_iput: | ||
1829 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
1830 | iput(fs_info->btree_inode); | ||
1831 | fail: | ||
1832 | btrfs_close_devices(fs_info->fs_devices); | ||
1833 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | ||
1834 | |||
1835 | kfree(extent_root); | ||
1836 | kfree(tree_root); | ||
1837 | bdi_destroy(&fs_info->bdi); | ||
1838 | kfree(fs_info); | ||
1839 | kfree(chunk_root); | ||
1840 | kfree(dev_root); | ||
1841 | kfree(csum_root); | ||
1842 | return ERR_PTR(err); | ||
1843 | } | ||
1844 | |||
1845 | static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) | ||
1846 | { | ||
1847 | char b[BDEVNAME_SIZE]; | ||
1848 | |||
1849 | if (uptodate) { | ||
1850 | set_buffer_uptodate(bh); | ||
1851 | } else { | ||
1852 | if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { | ||
1853 | printk(KERN_WARNING "lost page write due to " | ||
1854 | "I/O error on %s\n", | ||
1855 | bdevname(bh->b_bdev, b)); | ||
1856 | } | ||
1857 | /* note, we don't set_buffer_write_io_error because we have | ||
1858 | * our own ways of dealing with the IO errors | ||
1859 | */ | ||
1860 | clear_buffer_uptodate(bh); | ||
1861 | } | ||
1862 | unlock_buffer(bh); | ||
1863 | put_bh(bh); | ||
1864 | } | ||
1865 | |||
1866 | struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) | ||
1867 | { | ||
1868 | struct buffer_head *bh; | ||
1869 | struct buffer_head *latest = NULL; | ||
1870 | struct btrfs_super_block *super; | ||
1871 | int i; | ||
1872 | u64 transid = 0; | ||
1873 | u64 bytenr; | ||
1874 | |||
1875 | /* we would like to check all the supers, but that would make | ||
1876 | * a btrfs mount succeed after a mkfs from a different FS. | ||
1877 | * So, we need to add a special mount option to scan for | ||
1878 | * later supers, using BTRFS_SUPER_MIRROR_MAX instead | ||
1879 | */ | ||
1880 | for (i = 0; i < 1; i++) { | ||
1881 | bytenr = btrfs_sb_offset(i); | ||
1882 | if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) | ||
1883 | break; | ||
1884 | bh = __bread(bdev, bytenr / 4096, 4096); | ||
1885 | if (!bh) | ||
1886 | continue; | ||
1887 | |||
1888 | super = (struct btrfs_super_block *)bh->b_data; | ||
1889 | if (btrfs_super_bytenr(super) != bytenr || | ||
1890 | strncmp((char *)(&super->magic), BTRFS_MAGIC, | ||
1891 | sizeof(super->magic))) { | ||
1892 | brelse(bh); | ||
1893 | continue; | ||
1894 | } | ||
1895 | |||
1896 | if (!latest || btrfs_super_generation(super) > transid) { | ||
1897 | brelse(latest); | ||
1898 | latest = bh; | ||
1899 | transid = btrfs_super_generation(super); | ||
1900 | } else { | ||
1901 | brelse(bh); | ||
1902 | } | ||
1903 | } | ||
1904 | return latest; | ||
1905 | } | ||
1906 | |||
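/*
 * write out all copies of the super block on one device, or with wait
 * set, find the buffers from a previous write and wait for them,
 * resubmitting any that didn't make it.  Only the last copy goes down
 * as a barrier when barriers are enabled.  Returns 0 as long as at
 * least one copy succeeded.
 */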
1907 | static int write_dev_supers(struct btrfs_device *device, | ||
1908 | struct btrfs_super_block *sb, | ||
1909 | int do_barriers, int wait, int max_mirrors) | ||
1910 | { | ||
1911 | struct buffer_head *bh; | ||
1912 | int i; | ||
1913 | int ret; | ||
1914 | int errors = 0; | ||
1915 | u32 crc; | ||
1916 | u64 bytenr; | ||
1917 | int last_barrier = 0; | ||
1918 | |||
1919 | if (max_mirrors == 0) | ||
1920 | max_mirrors = BTRFS_SUPER_MIRROR_MAX; | ||
1921 | |||
1922 | /* make sure only the last submit_bh does a barrier */ | ||
1923 | if (do_barriers) { | ||
1924 | for (i = 0; i < max_mirrors; i++) { | ||
1925 | bytenr = btrfs_sb_offset(i); | ||
1926 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= | ||
1927 | device->total_bytes) | ||
1928 | break; | ||
1929 | last_barrier = i; | ||
1930 | } | ||
1931 | } | ||
1932 | |||
1933 | for (i = 0; i < max_mirrors; i++) { | ||
1934 | bytenr = btrfs_sb_offset(i); | ||
1935 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) | ||
1936 | break; | ||
1937 | |||
1938 | if (wait) { | ||
1939 | bh = __find_get_block(device->bdev, bytenr / 4096, | ||
1940 | BTRFS_SUPER_INFO_SIZE); | ||
1941 | BUG_ON(!bh); | ||
1942 | brelse(bh); | ||
1943 | wait_on_buffer(bh); | ||
1944 | if (buffer_uptodate(bh)) { | ||
1945 | brelse(bh); | ||
1946 | continue; | ||
1947 | } | ||
1948 | } else { | ||
1949 | btrfs_set_super_bytenr(sb, bytenr); | ||
1950 | |||
1951 | crc = ~(u32)0; | ||
1952 | crc = btrfs_csum_data(NULL, (char *)sb + | ||
1953 | BTRFS_CSUM_SIZE, crc, | ||
1954 | BTRFS_SUPER_INFO_SIZE - | ||
1955 | BTRFS_CSUM_SIZE); | ||
1956 | btrfs_csum_final(crc, sb->csum); | ||
1957 | |||
1958 | bh = __getblk(device->bdev, bytenr / 4096, | ||
1959 | BTRFS_SUPER_INFO_SIZE); | ||
1960 | memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); | ||
1961 | |||
1962 | set_buffer_uptodate(bh); | ||
1963 | get_bh(bh); | ||
1964 | lock_buffer(bh); | ||
1965 | bh->b_end_io = btrfs_end_buffer_write_sync; | ||
1966 | } | ||
1967 | |||
1968 | if (i == last_barrier && do_barriers && device->barriers) { | ||
1969 | ret = submit_bh(WRITE_BARRIER, bh); | ||
1970 | if (ret == -EOPNOTSUPP) { | ||
1971 | printk(KERN_INFO "btrfs: disabling barriers on dev %s\n", | ||
1972 | device->name); | ||
1973 | set_buffer_uptodate(bh); | ||
1974 | device->barriers = 0; | ||
1975 | get_bh(bh); | ||
1976 | lock_buffer(bh); | ||
1977 | ret = submit_bh(WRITE, bh); | ||
1978 | } | ||
1979 | } else { | ||
1980 | ret = submit_bh(WRITE, bh); | ||
1981 | } | ||
1982 | |||
1983 | if (!ret && wait) { | ||
1984 | wait_on_buffer(bh); | ||
1985 | if (!buffer_uptodate(bh)) | ||
1986 | errors++; | ||
1987 | } else if (ret) { | ||
1988 | errors++; | ||
1989 | } | ||
1990 | if (wait) | ||
1991 | brelse(bh); | ||
1992 | } | ||
1993 | return errors < i ? 0 : -1; | ||
1994 | } | ||
1995 | |||
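/*
 * refresh the device item in the super copy and write it to every
 * writeable device, in two passes: one to submit and one to wait.
 * More than max_errors (num_devices - 1) failures is fatal.
 */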
1996 | int write_all_supers(struct btrfs_root *root, int max_mirrors) | ||
1997 | { | ||
1998 | struct list_head *cur; | ||
1999 | struct list_head *head = &root->fs_info->fs_devices->devices; | ||
2000 | struct btrfs_device *dev; | ||
2001 | struct btrfs_super_block *sb; | ||
2002 | struct btrfs_dev_item *dev_item; | ||
2003 | int ret; | ||
2004 | int do_barriers; | ||
2005 | int max_errors; | ||
2006 | int total_errors = 0; | ||
2007 | u64 flags; | ||
2008 | |||
2009 | max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; | ||
2010 | do_barriers = !btrfs_test_opt(root, NOBARRIER); | ||
2011 | |||
2012 | sb = &root->fs_info->super_for_commit; | ||
2013 | dev_item = &sb->dev_item; | ||
2014 | list_for_each(cur, head) { | ||
2015 | dev = list_entry(cur, struct btrfs_device, dev_list); | ||
2016 | if (!dev->bdev) { | ||
2017 | total_errors++; | ||
2018 | continue; | ||
2019 | } | ||
2020 | if (!dev->in_fs_metadata || !dev->writeable) | ||
2021 | continue; | ||
2022 | |||
2023 | btrfs_set_stack_device_generation(dev_item, 0); | ||
2024 | btrfs_set_stack_device_type(dev_item, dev->type); | ||
2025 | btrfs_set_stack_device_id(dev_item, dev->devid); | ||
2026 | btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); | ||
2027 | btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); | ||
2028 | btrfs_set_stack_device_io_align(dev_item, dev->io_align); | ||
2029 | btrfs_set_stack_device_io_width(dev_item, dev->io_width); | ||
2030 | btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); | ||
2031 | memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); | ||
2032 | memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); | ||
2033 | |||
2034 | flags = btrfs_super_flags(sb); | ||
2035 | btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); | ||
2036 | |||
2037 | ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); | ||
2038 | if (ret) | ||
2039 | total_errors++; | ||
2040 | } | ||
2041 | if (total_errors > max_errors) { | ||
2042 | printk(KERN_ERR "btrfs: %d errors while writing supers\n", | ||
2043 | total_errors); | ||
2044 | BUG(); | ||
2045 | } | ||
2046 | |||
2047 | total_errors = 0; | ||
2048 | list_for_each(cur, head) { | ||
2049 | dev = list_entry(cur, struct btrfs_device, dev_list); | ||
2050 | if (!dev->bdev) | ||
2051 | continue; | ||
2052 | if (!dev->in_fs_metadata || !dev->writeable) | ||
2053 | continue; | ||
2054 | |||
2055 | ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); | ||
2056 | if (ret) | ||
2057 | total_errors++; | ||
2058 | } | ||
2059 | if (total_errors > max_errors) { | ||
2060 | printk(KERN_ERR "btrfs: %d errors while writing supers\n", | ||
2061 | total_errors); | ||
2062 | BUG(); | ||
2063 | } | ||
2064 | return 0; | ||
2065 | } | ||
2066 | |||
2067 | int write_ctree_super(struct btrfs_trans_handle *trans, | ||
2068 | struct btrfs_root *root, int max_mirrors) | ||
2069 | { | ||
2070 | int ret; | ||
2071 | |||
2072 | ret = write_all_supers(root, max_mirrors); | ||
2073 | return ret; | ||
2074 | } | ||
2075 | |||
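/*
 * drop a subvolume root from the radix tree cache and free all of its
 * in-memory state.
 */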
2076 | int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) | ||
2077 | { | ||
2078 | radix_tree_delete(&fs_info->fs_roots_radix, | ||
2079 | (unsigned long)root->root_key.objectid); | ||
2080 | if (root->anon_super.s_dev) { | ||
2081 | down_write(&root->anon_super.s_umount); | ||
2082 | kill_anon_super(&root->anon_super); | ||
2083 | } | ||
2084 | if (root->node) | ||
2085 | free_extent_buffer(root->node); | ||
2086 | if (root->commit_root) | ||
2087 | free_extent_buffer(root->commit_root); | ||
2088 | kfree(root->name); | ||
2089 | kfree(root); | ||
2090 | return 0; | ||
2091 | } | ||
2092 | |||
2093 | static int del_fs_roots(struct btrfs_fs_info *fs_info) | ||
2094 | { | ||
2095 | int ret; | ||
2096 | struct btrfs_root *gang[8]; | ||
2097 | int i; | ||
2098 | |||
2099 | while (1) { | ||
2100 | ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, | ||
2101 | (void **)gang, 0, | ||
2102 | ARRAY_SIZE(gang)); | ||
2103 | if (!ret) | ||
2104 | break; | ||
2105 | for (i = 0; i < ret; i++) | ||
2106 | btrfs_free_fs_root(fs_info, gang[i]); | ||
2107 | } | ||
2108 | return 0; | ||
2109 | } | ||
2110 | |||
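/*
 * walk every cached root and finish off dead roots and orphan items
 * left over from before this mount.
 */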
2111 | int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) | ||
2112 | { | ||
2113 | u64 root_objectid = 0; | ||
2114 | struct btrfs_root *gang[8]; | ||
2115 | int i; | ||
2116 | int ret; | ||
2117 | |||
2118 | while (1) { | ||
2119 | ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, | ||
2120 | (void **)gang, root_objectid, | ||
2121 | ARRAY_SIZE(gang)); | ||
2122 | if (!ret) | ||
2123 | break; | ||
2124 | for (i = 0; i < ret; i++) { | ||
2125 | root_objectid = gang[i]->root_key.objectid; | ||
2126 | ret = btrfs_find_dead_roots(fs_info->tree_root, | ||
2127 | root_objectid, gang[i]); | ||
2128 | BUG_ON(ret); | ||
2129 | btrfs_orphan_cleanup(gang[i]); | ||
2130 | } | ||
2131 | root_objectid++; | ||
2132 | } | ||
2133 | return 0; | ||
2134 | } | ||
2135 | |||
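/*
 * commit the running transaction twice (the second commit drops the
 * old snapshot roots) and write out a clean super block.
 */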
2136 | int btrfs_commit_super(struct btrfs_root *root) | ||
2137 | { | ||
2138 | struct btrfs_trans_handle *trans; | ||
2139 | int ret; | ||
2140 | |||
2141 | mutex_lock(&root->fs_info->cleaner_mutex); | ||
2142 | btrfs_clean_old_snapshots(root); | ||
2143 | mutex_unlock(&root->fs_info->cleaner_mutex); | ||
2144 | trans = btrfs_start_transaction(root, 1); | ||
2145 | ret = btrfs_commit_transaction(trans, root); | ||
2146 | BUG_ON(ret); | ||
2147 | /* run commit again to drop the original snapshot */ | ||
2148 | trans = btrfs_start_transaction(root, 1); | ||
2149 | btrfs_commit_transaction(trans, root); | ||
2150 | ret = btrfs_write_and_wait_transaction(NULL, root); | ||
2151 | BUG_ON(ret); | ||
2152 | |||
2153 | ret = write_ctree_super(NULL, root, 0); | ||
2154 | return ret; | ||
2155 | } | ||
2156 | |||
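/*
 * tear everything down at unmount: stop the background kthreads,
 * commit the final transaction on read-write mounts, then free every
 * cached tree and stop the worker pools.
 */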
2157 | int close_ctree(struct btrfs_root *root) | ||
2158 | { | ||
2159 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2160 | int ret; | ||
2161 | |||
2162 | fs_info->closing = 1; | ||
2163 | smp_mb(); | ||
2164 | |||
2165 | kthread_stop(root->fs_info->transaction_kthread); | ||
2166 | kthread_stop(root->fs_info->cleaner_kthread); | ||
2167 | |||
2168 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { | ||
2169 | ret = btrfs_commit_super(root); | ||
2170 | if (ret) | ||
2171 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); | ||
2172 | } | ||
2173 | |||
2174 | if (fs_info->delalloc_bytes) { | ||
2175 | printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", | ||
2176 | fs_info->delalloc_bytes); | ||
2177 | } | ||
2178 | if (fs_info->total_ref_cache_size) { | ||
2179 | printk(KERN_INFO "btrfs: at unmount reference cache size %llu\n", | ||
2180 | (unsigned long long)fs_info->total_ref_cache_size); | ||
2181 | } | ||
2182 | |||
2183 | if (fs_info->extent_root->node) | ||
2184 | free_extent_buffer(fs_info->extent_root->node); | ||
2185 | |||
2186 | if (fs_info->tree_root->node) | ||
2187 | free_extent_buffer(fs_info->tree_root->node); | ||
2188 | |||
2189 | if (root->fs_info->chunk_root->node) | ||
2190 | free_extent_buffer(root->fs_info->chunk_root->node); | ||
2191 | |||
2192 | if (root->fs_info->dev_root->node) | ||
2193 | free_extent_buffer(root->fs_info->dev_root->node); | ||
2194 | |||
2195 | if (root->fs_info->csum_root->node) | ||
2196 | free_extent_buffer(root->fs_info->csum_root->node); | ||
2197 | |||
2198 | btrfs_free_block_groups(root->fs_info); | ||
2199 | |||
2200 | del_fs_roots(fs_info); | ||
2201 | |||
2202 | iput(fs_info->btree_inode); | ||
2203 | |||
2204 | btrfs_stop_workers(&fs_info->fixup_workers); | ||
2205 | btrfs_stop_workers(&fs_info->delalloc_workers); | ||
2206 | btrfs_stop_workers(&fs_info->workers); | ||
2207 | btrfs_stop_workers(&fs_info->endio_workers); | ||
2208 | btrfs_stop_workers(&fs_info->endio_meta_workers); | ||
2209 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | ||
2210 | btrfs_stop_workers(&fs_info->endio_write_workers); | ||
2211 | btrfs_stop_workers(&fs_info->submit_workers); | ||
2212 | |||
2213 | #if 0 | ||
2214 | while (!list_empty(&fs_info->hashers)) { | ||
2215 | struct btrfs_hasher *hasher; | ||
2216 | hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, | ||
2217 | hashers); | ||
2218 | list_del(&hasher->hashers); | ||
2219 | crypto_free_hash(&fs_info->hash_tfm); | ||
2220 | kfree(hasher); | ||
2221 | } | ||
2222 | #endif | ||
2223 | btrfs_close_devices(fs_info->fs_devices); | ||
2224 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | ||
2225 | |||
2226 | bdi_destroy(&fs_info->bdi); | ||
2227 | |||
2228 | kfree(fs_info->extent_root); | ||
2229 | kfree(fs_info->tree_root); | ||
2230 | kfree(fs_info->chunk_root); | ||
2231 | kfree(fs_info->dev_root); | ||
2232 | kfree(fs_info->csum_root); | ||
2233 | return 0; | ||
2234 | } | ||
2235 | |||
2236 | int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) | ||
2237 | { | ||
2238 | int ret; | ||
2239 | struct inode *btree_inode = buf->first_page->mapping->host; | ||
2240 | |||
2241 | ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); | ||
2242 | if (!ret) | ||
2243 | return ret; | ||
2244 | |||
2245 | ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, | ||
2246 | parent_transid); | ||
2247 | return !ret; | ||
2248 | } | ||
2249 | |||
2250 | int btrfs_set_buffer_uptodate(struct extent_buffer *buf) | ||
2251 | { | ||
2252 | struct inode *btree_inode = buf->first_page->mapping->host; | ||
2253 | return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, | ||
2254 | buf); | ||
2255 | } | ||
2256 | |||
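/*
 * mark a tree block dirty in the btree io tree, complaining loudly if
 * its generation doesn't match the running transaction.
 */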
2257 | void btrfs_mark_buffer_dirty(struct extent_buffer *buf) | ||
2258 | { | ||
2259 | struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; | ||
2260 | u64 transid = btrfs_header_generation(buf); | ||
2261 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
2262 | |||
2263 | WARN_ON(!btrfs_tree_locked(buf)); | ||
2264 | if (transid != root->fs_info->generation) { | ||
2265 | printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " | ||
2266 | "found %llu running %llu\n", | ||
2267 | (unsigned long long)buf->start, | ||
2268 | (unsigned long long)transid, | ||
2269 | (unsigned long long)root->fs_info->generation); | ||
2270 | WARN_ON(1); | ||
2271 | } | ||
2272 | set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); | ||
2273 | } | ||
2274 | |||
2275 | void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | ||
2276 | { | ||
2277 | /* | ||
2278 | * looks as though older kernels can get into trouble with | ||
2279 | * this code; they end up stuck in balance_dirty_pages forever | ||
2280 | */ | ||
2281 | struct extent_io_tree *tree; | ||
2282 | u64 num_dirty; | ||
2283 | u64 start = 0; | ||
2284 | unsigned long thresh = 32 * 1024 * 1024; | ||
2285 | tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; | ||
2286 | |||
2287 | if (current_is_pdflush() || current->flags & PF_MEMALLOC) | ||
2288 | return; | ||
2289 | |||
2290 | num_dirty = count_range_bits(tree, &start, (u64)-1, | ||
2291 | thresh, EXTENT_DIRTY); | ||
2292 | if (num_dirty > thresh) { | ||
2293 | balance_dirty_pages_ratelimited_nr( | ||
2294 | root->fs_info->btree_inode->i_mapping, 1); | ||
2295 | } | ||
2296 | return; | ||
2297 | } | ||
2298 | |||
2299 | int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) | ||
2300 | { | ||
2301 | struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; | ||
2302 | int ret; | ||
2303 | ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); | ||
2304 | if (ret == 0) | ||
2305 | buf->flags |= EXTENT_UPTODATE; | ||
2306 | return ret; | ||
2307 | } | ||
2308 | |||
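/*
 * called before writeback locks a tree page: set the WRITTEN flag in
 * the block header so that later modifications force a COW instead of
 * changing the block in place.
 */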
2309 | int btree_lock_page_hook(struct page *page) | ||
2310 | { | ||
2311 | struct inode *inode = page->mapping->host; | ||
2312 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
2313 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
2314 | struct extent_buffer *eb; | ||
2315 | unsigned long len; | ||
2316 | u64 bytenr = page_offset(page); | ||
2317 | |||
2318 | if (page->private == EXTENT_PAGE_PRIVATE) | ||
2319 | goto out; | ||
2320 | |||
2321 | len = page->private >> 2; | ||
2322 | eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); | ||
2323 | if (!eb) | ||
2324 | goto out; | ||
2325 | |||
2326 | btrfs_tree_lock(eb); | ||
2327 | spin_lock(&root->fs_info->hash_lock); | ||
2328 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); | ||
2329 | spin_unlock(&root->fs_info->hash_lock); | ||
2330 | btrfs_tree_unlock(eb); | ||
2331 | free_extent_buffer(eb); | ||
2332 | out: | ||
2333 | lock_page(page); | ||
2334 | return 0; | ||
2335 | } | ||
2336 | |||
2337 | static struct extent_io_ops btree_extent_io_ops = { | ||
2338 | .write_cache_pages_lock_hook = btree_lock_page_hook, | ||
2339 | .readpage_end_io_hook = btree_readpage_end_io_hook, | ||
2340 | .submit_bio_hook = btree_submit_bio_hook, | ||
2341 | /* note we're sharing with inode.c for the merge bio hook */ | ||
2342 | .merge_bio_hook = btrfs_merge_bio_hook, | ||
2343 | }; | ||
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 000000000000..c0ff404c31b7 --- /dev/null +++ b/fs/btrfs/disk-io.h | |||
@@ -0,0 +1,102 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __DISKIO__ | ||
20 | #define __DISKIO__ | ||
21 | |||
22 | #define BTRFS_SUPER_INFO_OFFSET (64 * 1024) | ||
23 | #define BTRFS_SUPER_INFO_SIZE 4096 | ||
24 | |||
25 | #define BTRFS_SUPER_MIRROR_MAX 3 | ||
26 | #define BTRFS_SUPER_MIRROR_SHIFT 12 | ||
27 | |||
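/*
 * super block mirrors live at fixed offsets: the primary copy at 64K,
 * then 16K shifted left by 12 bits per mirror number, giving 64M for
 * mirror 1 and 256G for mirror 2.
 */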
28 | static inline u64 btrfs_sb_offset(int mirror) | ||
29 | { | ||
30 | u64 start = 16 * 1024; | ||
31 | if (mirror) | ||
32 | return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); | ||
33 | return BTRFS_SUPER_INFO_OFFSET; | ||
34 | } | ||
35 | |||
36 | struct btrfs_device; | ||
37 | struct btrfs_fs_devices; | ||
38 | |||
39 | struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, | ||
40 | u32 blocksize, u64 parent_transid); | ||
41 | int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, | ||
42 | u64 parent_transid); | ||
43 | struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, | ||
44 | u64 bytenr, u32 blocksize); | ||
45 | int clean_tree_block(struct btrfs_trans_handle *trans, | ||
46 | struct btrfs_root *root, struct extent_buffer *buf); | ||
47 | struct btrfs_root *open_ctree(struct super_block *sb, | ||
48 | struct btrfs_fs_devices *fs_devices, | ||
49 | char *options); | ||
50 | int close_ctree(struct btrfs_root *root); | ||
51 | int write_ctree_super(struct btrfs_trans_handle *trans, | ||
52 | struct btrfs_root *root, int max_mirrors); | ||
53 | struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); | ||
54 | int btrfs_commit_super(struct btrfs_root *root); | ||
55 | struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, | ||
56 | u64 bytenr, u32 blocksize); | ||
57 | struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, | ||
58 | u64 root_objectid); | ||
59 | struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, | ||
60 | struct btrfs_key *location, | ||
61 | const char *name, int namelen); | ||
62 | struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, | ||
63 | struct btrfs_key *location); | ||
64 | struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, | ||
65 | struct btrfs_key *location); | ||
66 | int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); | ||
67 | int btrfs_insert_dev_radix(struct btrfs_root *root, | ||
68 | struct block_device *bdev, | ||
69 | u64 device_id, | ||
70 | u64 block_start, | ||
71 | u64 num_blocks); | ||
72 | void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); | ||
73 | int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); | ||
74 | void btrfs_mark_buffer_dirty(struct extent_buffer *buf); | ||
75 | int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); | ||
76 | int btrfs_set_buffer_uptodate(struct extent_buffer *buf); | ||
77 | int wait_on_tree_block_writeback(struct btrfs_root *root, | ||
78 | struct extent_buffer *buf); | ||
79 | int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); | ||
80 | u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); | ||
81 | void btrfs_csum_final(u32 crc, char *result); | ||
82 | int btrfs_open_device(struct btrfs_device *dev); | ||
83 | int btrfs_verify_block_csum(struct btrfs_root *root, | ||
84 | struct extent_buffer *buf); | ||
85 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, | ||
86 | int metadata); | ||
87 | int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, | ||
88 | int rw, struct bio *bio, int mirror_num, | ||
89 | unsigned long bio_flags, | ||
90 | extent_submit_bio_hook_t *submit_bio_start, | ||
91 | extent_submit_bio_hook_t *submit_bio_done); | ||
92 | |||
93 | int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); | ||
94 | unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); | ||
95 | int btrfs_write_tree_block(struct extent_buffer *buf); | ||
96 | int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); | ||
97 | int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, | ||
98 | struct btrfs_fs_info *fs_info); | ||
99 | int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, | ||
100 | struct btrfs_fs_info *fs_info); | ||
101 | int btree_lock_page_hook(struct page *page); | ||
102 | #endif | ||
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 000000000000..85315d2c90de --- /dev/null +++ b/fs/btrfs/export.c | |||
@@ -0,0 +1,203 @@ | |||
1 | #include <linux/fs.h> | ||
2 | #include <linux/types.h> | ||
3 | #include "ctree.h" | ||
4 | #include "disk-io.h" | ||
5 | #include "btrfs_inode.h" | ||
6 | #include "print-tree.h" | ||
7 | #include "export.h" | ||
8 | #include "compat.h" | ||
9 | |||
10 | #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ | ||
11 | parent_objectid) / 4) | ||
12 | #define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ | ||
13 | parent_root_objectid) / 4) | ||
14 | #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) | ||
15 | |||
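/*
 * encode an NFS file handle for a btrfs inode.  Non-connectable
 * handles carry the objectid, root and generation; connectable ones
 * add the parent, plus the parent's root when it differs.
 */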
16 | static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, | ||
17 | int connectable) | ||
18 | { | ||
19 | struct btrfs_fid *fid = (struct btrfs_fid *)fh; | ||
20 | struct inode *inode = dentry->d_inode; | ||
21 | int len = *max_len; | ||
22 | int type; | ||
23 | |||
24 | if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || | ||
25 | (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) | ||
26 | return 255; | ||
27 | |||
28 | len = BTRFS_FID_SIZE_NON_CONNECTABLE; | ||
29 | type = FILEID_BTRFS_WITHOUT_PARENT; | ||
30 | |||
31 | fid->objectid = BTRFS_I(inode)->location.objectid; | ||
32 | fid->root_objectid = BTRFS_I(inode)->root->objectid; | ||
33 | fid->gen = inode->i_generation; | ||
34 | |||
35 | if (connectable && !S_ISDIR(inode->i_mode)) { | ||
36 | struct inode *parent; | ||
37 | u64 parent_root_id; | ||
38 | |||
39 | spin_lock(&dentry->d_lock); | ||
40 | |||
41 | parent = dentry->d_parent->d_inode; | ||
42 | fid->parent_objectid = BTRFS_I(parent)->location.objectid; | ||
43 | fid->parent_gen = parent->i_generation; | ||
44 | parent_root_id = BTRFS_I(parent)->root->objectid; | ||
45 | |||
46 | spin_unlock(&dentry->d_lock); | ||
47 | |||
48 | if (parent_root_id != fid->root_objectid) { | ||
49 | fid->parent_root_objectid = parent_root_id; | ||
50 | len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; | ||
51 | type = FILEID_BTRFS_WITH_PARENT_ROOT; | ||
52 | } else { | ||
53 | len = BTRFS_FID_SIZE_CONNECTABLE; | ||
54 | type = FILEID_BTRFS_WITH_PARENT; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | *max_len = len; | ||
59 | return type; | ||
60 | } | ||
61 | |||
62 | static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, | ||
63 | u64 root_objectid, u32 generation) | ||
64 | { | ||
65 | struct btrfs_root *root; | ||
66 | struct inode *inode; | ||
67 | struct btrfs_key key; | ||
68 | |||
69 | key.objectid = root_objectid; | ||
70 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
71 | key.offset = (u64)-1; | ||
72 | |||
73 | root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); | ||
74 | if (IS_ERR(root)) | ||
75 | return ERR_CAST(root); | ||
76 | |||
77 | key.objectid = objectid; | ||
78 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | ||
79 | key.offset = 0; | ||
80 | |||
81 | inode = btrfs_iget(sb, &key, root, NULL); | ||
82 | if (IS_ERR(inode)) | ||
83 | return ERR_CAST(inode); | ||
84 | |||
85 | if (generation != inode->i_generation) { | ||
86 | iput(inode); | ||
87 | return ERR_PTR(-ESTALE); | ||
88 | } | ||
89 | |||
90 | return d_obtain_alias(inode); | ||
91 | } | ||
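The generation check above is what turns inode-number reuse into a clean error. With invented numbers: if an NFS client still holds a handle minted when objectid 300 had i_generation 2, and that inode number has since been recycled with i_generation 3, btrfs_iget() succeeds but the stale handle is refused with -ESTALE rather than silently resolving to the wrong file.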
92 | |||
93 | static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, | ||
94 | int fh_len, int fh_type) | ||
95 | { | ||
96 | struct btrfs_fid *fid = (struct btrfs_fid *) fh; | ||
97 | u64 objectid, root_objectid; | ||
98 | u32 generation; | ||
99 | |||
100 | if (fh_type == FILEID_BTRFS_WITH_PARENT) { | ||
101 | if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) | ||
102 | return NULL; | ||
103 | root_objectid = fid->root_objectid; | ||
104 | } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { | ||
105 | if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) | ||
106 | return NULL; | ||
107 | root_objectid = fid->parent_root_objectid; | ||
108 | } else | ||
109 | return NULL; | ||
110 | |||
111 | objectid = fid->parent_objectid; | ||
112 | generation = fid->parent_gen; | ||
113 | |||
114 | return btrfs_get_dentry(sb, objectid, root_objectid, generation); | ||
115 | } | ||
116 | |||
117 | static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, | ||
118 | int fh_len, int fh_type) | ||
119 | { | ||
120 | struct btrfs_fid *fid = (struct btrfs_fid *) fh; | ||
121 | u64 objectid, root_objectid; | ||
122 | u32 generation; | ||
123 | |||
124 | if ((fh_type != FILEID_BTRFS_WITH_PARENT || | ||
125 | fh_len != BTRFS_FID_SIZE_CONNECTABLE) && | ||
126 | (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || | ||
127 | fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && | ||
128 | (fh_type != FILEID_BTRFS_WITHOUT_PARENT || | ||
129 | fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) | ||
130 | return NULL; | ||
131 | |||
132 | objectid = fid->objectid; | ||
133 | root_objectid = fid->root_objectid; | ||
134 | generation = fid->gen; | ||
135 | |||
136 | return btrfs_get_dentry(sb, objectid, root_objectid, generation); | ||
137 | } | ||
138 | |||
139 | static struct dentry *btrfs_get_parent(struct dentry *child) | ||
140 | { | ||
141 | struct inode *dir = child->d_inode; | ||
142 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
143 | struct btrfs_key key; | ||
144 | struct btrfs_path *path; | ||
145 | struct extent_buffer *leaf; | ||
146 | int slot; | ||
147 | u64 objectid; | ||
148 | int ret; | ||
149 | |||
150 | path = btrfs_alloc_path(); | ||
151 | |||
152 | key.objectid = dir->i_ino; | ||
153 | btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); | ||
154 | key.offset = (u64)-1; | ||
155 | |||
156 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
157 | if (ret < 0) { | ||
158 | /* Error */ | ||
159 | btrfs_free_path(path); | ||
160 | return ERR_PTR(ret); | ||
161 | } | ||
162 | leaf = path->nodes[0]; | ||
163 | slot = path->slots[0]; | ||
164 | if (ret) { | ||
165 | /* btrfs_search_slot() returns the slot where we'd want to | ||
166 | * insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. | ||
167 | * The _real_ backref, telling us what the parent inode | ||
168 | * _actually_ is, will be in the slot _before_ the one | ||
169 | * that btrfs_search_slot() returns. */ | ||
170 | if (!slot) { | ||
171 | /* Unless there is _no_ key in the tree before... */ | ||
172 | btrfs_free_path(path); | ||
173 | return ERR_PTR(-EIO); | ||
174 | } | ||
175 | slot--; | ||
176 | } | ||
177 | |||
178 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
179 | btrfs_free_path(path); | ||
180 | |||
181 | if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) | ||
182 | return ERR_PTR(-EINVAL); | ||
183 | |||
184 | objectid = key.offset; | ||
185 | |||
186 | /* If we are already at the root of a subvol, return the real root */ | ||
187 | if (objectid == dir->i_ino) | ||
188 | return dget(dir->i_sb->s_root); | ||
189 | |||
190 | /* Build a new key for the inode item */ | ||
191 | key.objectid = objectid; | ||
192 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | ||
193 | key.offset = 0; | ||
194 | |||
195 | return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); | ||
196 | } | ||
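The (u64)-1 search above is a common btrfs idiom: ask for the largest possible key and step back one slot. A worked example with invented numbers: for inode 257 whose backref item has key (257, INODE_REF, 256), the search key (257, INODE_REF, -1) does not exist, so btrfs_search_slot() returns ret > 0 with the path pointing one slot past the backref; decrementing the slot lands on (257, INODE_REF, 256), and key.offset (256) is exactly the parent objectid the function resolves to a dentry.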
197 | |||
198 | const struct export_operations btrfs_export_ops = { | ||
199 | .encode_fh = btrfs_encode_fh, | ||
200 | .fh_to_dentry = btrfs_fh_to_dentry, | ||
201 | .fh_to_parent = btrfs_fh_to_parent, | ||
202 | .get_parent = btrfs_get_parent, | ||
203 | }; | ||
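For context, this table only takes effect once it is attached to a superblock, which btrfs does at mount time in super.c. The hookup is a single assignment in the fill-super path (sketch; surrounding code elided):

    sb->s_export_op = &btrfs_export_ops;    /* enable NFS export of this sb */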
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 000000000000..074348a95841 --- /dev/null +++ b/fs/btrfs/export.h | |||
@@ -0,0 +1,19 @@ | |||
1 | #ifndef BTRFS_EXPORT_H | ||
2 | #define BTRFS_EXPORT_H | ||
3 | |||
4 | #include <linux/exportfs.h> | ||
5 | |||
6 | extern const struct export_operations btrfs_export_ops; | ||
7 | |||
8 | struct btrfs_fid { | ||
9 | u64 objectid; | ||
10 | u64 root_objectid; | ||
11 | u32 gen; | ||
12 | |||
13 | u64 parent_objectid; | ||
14 | u32 parent_gen; | ||
15 | |||
16 | u64 parent_root_objectid; | ||
17 | } __attribute__ ((packed)); | ||
18 | |||
19 | #endif | ||
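The BTRFS_FID_SIZE_* constants in export.c measure this packed struct in 32-bit words, the unit the VFS uses for fh_len. Working the sizes out from the layout above:

    objectid(8) + root_objectid(8) + gen(4)    = 20 bytes ->  5 words (non-connectable)
      + parent_objectid(8) + parent_gen(4)     = 32 bytes ->  8 words (connectable)
      + parent_root_objectid(8)                = 40 bytes -> 10 words (connectable root)

The __attribute__((packed)) is load-bearing here: without it the compiler would pad the u32 fields out to 8-byte alignment and the offsetof() arithmetic behind those constants would change.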
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 000000000000..293da650873f --- /dev/null +++ b/fs/btrfs/extent-tree.c | |||
@@ -0,0 +1,5986 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/pagemap.h> | ||
20 | #include <linux/writeback.h> | ||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/version.h> | ||
23 | #include "compat.h" | ||
24 | #include "hash.h" | ||
25 | #include "crc32c.h" | ||
26 | #include "ctree.h" | ||
27 | #include "disk-io.h" | ||
28 | #include "print-tree.h" | ||
29 | #include "transaction.h" | ||
30 | #include "volumes.h" | ||
31 | #include "locking.h" | ||
32 | #include "ref-cache.h" | ||
33 | #include "compat.h" | ||
34 | |||
35 | #define PENDING_EXTENT_INSERT 0 | ||
36 | #define PENDING_EXTENT_DELETE 1 | ||
37 | #define PENDING_BACKREF_UPDATE 2 | ||
38 | |||
39 | struct pending_extent_op { | ||
40 | int type; | ||
41 | u64 bytenr; | ||
42 | u64 num_bytes; | ||
43 | u64 parent; | ||
44 | u64 orig_parent; | ||
45 | u64 generation; | ||
46 | u64 orig_generation; | ||
47 | int level; | ||
48 | struct list_head list; | ||
49 | int del; | ||
50 | }; | ||
51 | |||
52 | static int finish_current_insert(struct btrfs_trans_handle *trans, | ||
53 | struct btrfs_root *extent_root, int all); | ||
54 | static int del_pending_extents(struct btrfs_trans_handle *trans, | ||
55 | struct btrfs_root *extent_root, int all); | ||
56 | static int pin_down_bytes(struct btrfs_trans_handle *trans, | ||
57 | struct btrfs_root *root, | ||
58 | u64 bytenr, u64 num_bytes, int is_data); | ||
59 | static int update_block_group(struct btrfs_trans_handle *trans, | ||
60 | struct btrfs_root *root, | ||
61 | u64 bytenr, u64 num_bytes, int alloc, | ||
62 | int mark_free); | ||
63 | |||
64 | static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) | ||
65 | { | ||
66 | return (cache->flags & bits) == bits; | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * this adds the block group to the fs_info rb tree for the block group | ||
71 | * cache | ||
72 | */ | ||
73 | static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, | ||
74 | struct btrfs_block_group_cache *block_group) | ||
75 | { | ||
76 | struct rb_node **p; | ||
77 | struct rb_node *parent = NULL; | ||
78 | struct btrfs_block_group_cache *cache; | ||
79 | |||
80 | spin_lock(&info->block_group_cache_lock); | ||
81 | p = &info->block_group_cache_tree.rb_node; | ||
82 | |||
83 | while (*p) { | ||
84 | parent = *p; | ||
85 | cache = rb_entry(parent, struct btrfs_block_group_cache, | ||
86 | cache_node); | ||
87 | if (block_group->key.objectid < cache->key.objectid) { | ||
88 | p = &(*p)->rb_left; | ||
89 | } else if (block_group->key.objectid > cache->key.objectid) { | ||
90 | p = &(*p)->rb_right; | ||
91 | } else { | ||
92 | spin_unlock(&info->block_group_cache_lock); | ||
93 | return -EEXIST; | ||
94 | } | ||
95 | } | ||
96 | |||
97 | rb_link_node(&block_group->cache_node, parent, p); | ||
98 | rb_insert_color(&block_group->cache_node, | ||
99 | &info->block_group_cache_tree); | ||
100 | spin_unlock(&info->block_group_cache_lock); | ||
101 | |||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * This will return the block group at or after bytenr if contains is 0, else | ||
107 | * it will return the block group that contains the bytenr | ||
108 | */ | ||
109 | static struct btrfs_block_group_cache * | ||
110 | block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, | ||
111 | int contains) | ||
112 | { | ||
113 | struct btrfs_block_group_cache *cache, *ret = NULL; | ||
114 | struct rb_node *n; | ||
115 | u64 end, start; | ||
116 | |||
117 | spin_lock(&info->block_group_cache_lock); | ||
118 | n = info->block_group_cache_tree.rb_node; | ||
119 | |||
120 | while (n) { | ||
121 | cache = rb_entry(n, struct btrfs_block_group_cache, | ||
122 | cache_node); | ||
123 | end = cache->key.objectid + cache->key.offset - 1; | ||
124 | start = cache->key.objectid; | ||
125 | |||
126 | if (bytenr < start) { | ||
127 | if (!contains && (!ret || start < ret->key.objectid)) | ||
128 | ret = cache; | ||
129 | n = n->rb_left; | ||
130 | } else if (bytenr > start) { | ||
131 | if (contains && bytenr <= end) { | ||
132 | ret = cache; | ||
133 | break; | ||
134 | } | ||
135 | n = n->rb_right; | ||
136 | } else { | ||
137 | ret = cache; | ||
138 | break; | ||
139 | } | ||
140 | } | ||
141 | if (ret) | ||
142 | atomic_inc(&ret->count); | ||
143 | spin_unlock(&info->block_group_cache_lock); | ||
144 | |||
145 | return ret; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * this is only called by cache_block_group. Since we could have freed extents, | ||
150 | * we need to check the pinned_extents for any extents that can't be used yet, | ||
151 | * since their free space won't be released until the transaction commits. | ||
152 | */ | ||
153 | static int add_new_free_space(struct btrfs_block_group_cache *block_group, | ||
154 | struct btrfs_fs_info *info, u64 start, u64 end) | ||
155 | { | ||
156 | u64 extent_start, extent_end, size; | ||
157 | int ret; | ||
158 | |||
159 | mutex_lock(&info->pinned_mutex); | ||
160 | while (start < end) { | ||
161 | ret = find_first_extent_bit(&info->pinned_extents, start, | ||
162 | &extent_start, &extent_end, | ||
163 | EXTENT_DIRTY); | ||
164 | if (ret) | ||
165 | break; | ||
166 | |||
167 | if (extent_start == start) { | ||
168 | start = extent_end + 1; | ||
169 | } else if (extent_start > start && extent_start < end) { | ||
170 | size = extent_start - start; | ||
171 | ret = btrfs_add_free_space(block_group, start, | ||
172 | size); | ||
173 | BUG_ON(ret); | ||
174 | start = extent_end + 1; | ||
175 | } else { | ||
176 | break; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | if (start < end) { | ||
181 | size = end - start; | ||
182 | ret = btrfs_add_free_space(block_group, start, size); | ||
183 | BUG_ON(ret); | ||
184 | } | ||
185 | mutex_unlock(&info->pinned_mutex); | ||
186 | |||
187 | return 0; | ||
188 | } | ||
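A worked example of the carving loop above, with invented byte numbers: suppose the uncached range is [0, 100) and one pinned extent covers bytes 30 through 40. find_first_extent_bit() returns that extent; since extent_start (30) is greater than start (0), the loop records [0, 30) as free space and advances start to 41. No further pinned extents turn up, so the trailing check adds [41, 100), and the pinned bytes stay out of the free-space cache until the transaction commit releases them.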
189 | |||
190 | static int remove_sb_from_cache(struct btrfs_root *root, | ||
191 | struct btrfs_block_group_cache *cache) | ||
192 | { | ||
193 | u64 bytenr; | ||
194 | u64 *logical; | ||
195 | int stripe_len; | ||
196 | int i, nr, ret; | ||
197 | |||
198 | for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { | ||
199 | bytenr = btrfs_sb_offset(i); | ||
200 | ret = btrfs_rmap_block(&root->fs_info->mapping_tree, | ||
201 | cache->key.objectid, bytenr, 0, | ||
202 | &logical, &nr, &stripe_len); | ||
203 | BUG_ON(ret); | ||
204 | while (nr--) { | ||
205 | btrfs_remove_free_space(cache, logical[nr], | ||
206 | stripe_len); | ||
207 | } | ||
208 | kfree(logical); | ||
209 | } | ||
210 | return 0; | ||
211 | } | ||
212 | |||
213 | static int cache_block_group(struct btrfs_root *root, | ||
214 | struct btrfs_block_group_cache *block_group) | ||
215 | { | ||
216 | struct btrfs_path *path; | ||
217 | int ret = 0; | ||
218 | struct btrfs_key key; | ||
219 | struct extent_buffer *leaf; | ||
220 | int slot; | ||
221 | u64 last; | ||
222 | |||
223 | if (!block_group) | ||
224 | return 0; | ||
225 | |||
226 | root = root->fs_info->extent_root; | ||
227 | |||
228 | if (block_group->cached) | ||
229 | return 0; | ||
230 | |||
231 | path = btrfs_alloc_path(); | ||
232 | if (!path) | ||
233 | return -ENOMEM; | ||
234 | |||
235 | path->reada = 2; | ||
236 | /* | ||
237 | * we get into deadlocks with paths held by callers of this function. | ||
238 | * since the alloc_mutex is protecting things right now, just | ||
239 | * skip the locking here | ||
240 | */ | ||
241 | path->skip_locking = 1; | ||
242 | last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); | ||
243 | key.objectid = last; | ||
244 | key.offset = 0; | ||
245 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); | ||
246 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
247 | if (ret < 0) | ||
248 | goto err; | ||
249 | |||
250 | while (1) { | ||
251 | leaf = path->nodes[0]; | ||
252 | slot = path->slots[0]; | ||
253 | if (slot >= btrfs_header_nritems(leaf)) { | ||
254 | ret = btrfs_next_leaf(root, path); | ||
255 | if (ret < 0) | ||
256 | goto err; | ||
257 | if (ret == 0) | ||
258 | continue; | ||
259 | else | ||
260 | break; | ||
261 | } | ||
262 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
263 | if (key.objectid < block_group->key.objectid) | ||
264 | goto next; | ||
265 | |||
266 | if (key.objectid >= block_group->key.objectid + | ||
267 | block_group->key.offset) | ||
268 | break; | ||
269 | |||
270 | if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { | ||
271 | add_new_free_space(block_group, root->fs_info, last, | ||
272 | key.objectid); | ||
273 | |||
274 | last = key.objectid + key.offset; | ||
275 | } | ||
276 | next: | ||
277 | path->slots[0]++; | ||
278 | } | ||
279 | |||
280 | add_new_free_space(block_group, root->fs_info, last, | ||
281 | block_group->key.objectid + | ||
282 | block_group->key.offset); | ||
283 | |||
284 | remove_sb_from_cache(root, block_group); | ||
285 | block_group->cached = 1; | ||
286 | ret = 0; | ||
287 | err: | ||
288 | btrfs_free_path(path); | ||
289 | return ret; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * return the block group that starts at or after bytenr | ||
294 | */ | ||
295 | static struct btrfs_block_group_cache * | ||
296 | btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) | ||
297 | { | ||
298 | struct btrfs_block_group_cache *cache; | ||
299 | |||
300 | cache = block_group_cache_tree_search(info, bytenr, 0); | ||
301 | |||
302 | return cache; | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * return the block group that contains the given bytenr | ||
307 | */ | ||
308 | struct btrfs_block_group_cache *btrfs_lookup_block_group( | ||
309 | struct btrfs_fs_info *info, | ||
310 | u64 bytenr) | ||
311 | { | ||
312 | struct btrfs_block_group_cache *cache; | ||
313 | |||
314 | cache = block_group_cache_tree_search(info, bytenr, 1); | ||
315 | |||
316 | return cache; | ||
317 | } | ||
318 | |||
319 | static inline void put_block_group(struct btrfs_block_group_cache *cache) | ||
320 | { | ||
321 | if (atomic_dec_and_test(&cache->count)) | ||
322 | kfree(cache); | ||
323 | } | ||
324 | |||
325 | static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, | ||
326 | u64 flags) | ||
327 | { | ||
328 | struct list_head *head = &info->space_info; | ||
329 | struct list_head *cur; | ||
330 | struct btrfs_space_info *found; | ||
331 | list_for_each(cur, head) { | ||
332 | found = list_entry(cur, struct btrfs_space_info, list); | ||
333 | if (found->flags == flags) | ||
334 | return found; | ||
335 | } | ||
336 | return NULL; | ||
337 | } | ||
338 | |||
339 | static u64 div_factor(u64 num, int factor) | ||
340 | { | ||
341 | if (factor == 10) | ||
342 | return num; | ||
343 | num *= factor; | ||
344 | do_div(num, 10); | ||
345 | return num; | ||
346 | } | ||
347 | |||
348 | u64 btrfs_find_block_group(struct btrfs_root *root, | ||
349 | u64 search_start, u64 search_hint, int owner) | ||
350 | { | ||
351 | struct btrfs_block_group_cache *cache; | ||
352 | u64 used; | ||
353 | u64 last = max(search_hint, search_start); | ||
354 | u64 group_start = 0; | ||
355 | int full_search = 0; | ||
356 | int factor = 9; | ||
357 | int wrapped = 0; | ||
358 | again: | ||
359 | while (1) { | ||
360 | cache = btrfs_lookup_first_block_group(root->fs_info, last); | ||
361 | if (!cache) | ||
362 | break; | ||
363 | |||
364 | spin_lock(&cache->lock); | ||
365 | last = cache->key.objectid + cache->key.offset; | ||
366 | used = btrfs_block_group_used(&cache->item); | ||
367 | |||
368 | if ((full_search || !cache->ro) && | ||
369 | block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { | ||
370 | if (used + cache->pinned + cache->reserved < | ||
371 | div_factor(cache->key.offset, factor)) { | ||
372 | group_start = cache->key.objectid; | ||
373 | spin_unlock(&cache->lock); | ||
374 | put_block_group(cache); | ||
375 | goto found; | ||
376 | } | ||
377 | } | ||
378 | spin_unlock(&cache->lock); | ||
379 | put_block_group(cache); | ||
380 | cond_resched(); | ||
381 | } | ||
382 | if (!wrapped) { | ||
383 | last = search_start; | ||
384 | wrapped = 1; | ||
385 | goto again; | ||
386 | } | ||
387 | if (!full_search && factor < 10) { | ||
388 | last = search_start; | ||
389 | full_search = 1; | ||
390 | factor = 10; | ||
391 | goto again; | ||
392 | } | ||
393 | found: | ||
394 | return group_start; | ||
395 | } | ||
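To make the factor arithmetic concrete: div_factor(num, 9) computes num * 9 / 10, so on the first pass a metadata block group is only accepted when used + pinned + reserved is below 90% of its size; the full_search retry bumps the factor to 10, where div_factor() returns num unchanged and any group that is not completely full can match.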
396 | |||
397 | /* simple helper to search for an existing extent at a given offset */ | ||
398 | int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) | ||
399 | { | ||
400 | int ret; | ||
401 | struct btrfs_key key; | ||
402 | struct btrfs_path *path; | ||
403 | |||
404 | path = btrfs_alloc_path(); | ||
405 | BUG_ON(!path); | ||
406 | key.objectid = start; | ||
407 | key.offset = len; | ||
408 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); | ||
409 | ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, | ||
410 | 0, 0); | ||
411 | btrfs_free_path(path); | ||
412 | return ret; | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * Back reference rules. Back refs have three main goals: | ||
417 | * | ||
418 | * 1) differentiate between all holders of references to an extent so that | ||
419 | * when a reference is dropped we can make sure it was a valid reference | ||
420 | * before freeing the extent. | ||
421 | * | ||
422 | * 2) Provide enough information to quickly find the holders of an extent | ||
423 | * if we notice a given block is corrupted or bad. | ||
424 | * | ||
425 | * 3) Make it easy to migrate blocks for FS shrinking or storage pool | ||
426 | * maintenance. This is actually the same as #2, but with a slightly | ||
427 | * different use case. | ||
428 | * | ||
429 | * File extents can be referenced by: | ||
430 | * | ||
431 | * - multiple snapshots, subvolumes, or different generations in one subvol | ||
432 | * - different files inside a single subvolume | ||
433 | * - different offsets inside a file (bookend extents in file.c) | ||
434 | * | ||
435 | * The extent ref structure has fields for: | ||
436 | * | ||
437 | * - Objectid of the subvolume root | ||
438 | * - Generation number of the tree holding the reference | ||
439 | * - objectid of the file holding the reference | ||
440 | * - number of references held by the parent node (always 1 for tree blocks) | ||
441 | * | ||
442 | * A btree leaf may hold multiple references to a file extent. In most cases, | ||
443 | * these references are from the same file and the corresponding offsets inside | ||
444 | * the file are close together. | ||
445 | * | ||
446 | * When a file extent is allocated, the fields are filled in: | ||
447 | * (root_key.objectid, trans->transid, inode objectid, 1) | ||
448 | * | ||
449 | * When a leaf is cow'd, new references are added for every file extent found | ||
450 | * in the leaf. It looks similar to the create case, but trans->transid will | ||
451 | * be different when the block is cow'd. | ||
452 | * | ||
453 | * (root_key.objectid, trans->transid, inode objectid, | ||
454 | * number of references in the leaf) | ||
455 | * | ||
456 | * When a file extent is removed either during snapshot deletion or | ||
457 | * file truncation, we find the corresponding back reference and check | ||
458 | * the following fields: | ||
459 | * | ||
460 | * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), | ||
461 | * inode objectid) | ||
462 | * | ||
463 | * Btree extents can be referenced by: | ||
464 | * | ||
465 | * - Different subvolumes | ||
466 | * - Different generations of the same subvolume | ||
467 | * | ||
468 | * When a tree block is created, back references are inserted: | ||
469 | * | ||
470 | * (root->root_key.objectid, trans->transid, level, 1) | ||
471 | * | ||
472 | * When a tree block is cow'd, new back references are added for all the | ||
473 | * blocks it points to. If the tree block isn't in reference counted root, | ||
474 | * the old back references are removed. These new back references are of | ||
475 | * the form (trans->transid will have increased since creation): | ||
476 | * | ||
477 | * (root->root_key.objectid, trans->transid, level, 1) | ||
478 | * | ||
479 | * When a backref is being deleted, the following fields are checked: | ||
480 | * | ||
481 | * if backref was for a tree root: | ||
482 | * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) | ||
483 | * else | ||
484 | * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) | ||
485 | * | ||
486 | * Back Reference Key composing: | ||
487 | * | ||
488 | * The key objectid corresponds to the first byte in the extent, the key | ||
489 | * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first | ||
490 | * byte of the parent extent. If an extent is a tree root, the key offset is set | ||
491 | * to the key objectid. | ||
492 | */ | ||
493 | |||
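A short sketch of the key composition the comment above describes, matching how lookup_extent_backref() below builds its search key (bytenr and parent are the same quantities used throughout this file):

    struct btrfs_key key;

    key.objectid = bytenr;               /* first byte of the extent itself  */
    key.type     = BTRFS_EXTENT_REF_KEY;
    key.offset   = parent;               /* first byte of the parent extent; */
                                         /* equals bytenr for a tree root    */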
494 | static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, | ||
495 | struct btrfs_root *root, | ||
496 | struct btrfs_path *path, | ||
497 | u64 bytenr, u64 parent, | ||
498 | u64 ref_root, u64 ref_generation, | ||
499 | u64 owner_objectid, int del) | ||
500 | { | ||
501 | struct btrfs_key key; | ||
502 | struct btrfs_extent_ref *ref; | ||
503 | struct extent_buffer *leaf; | ||
504 | u64 ref_objectid; | ||
505 | int ret; | ||
506 | |||
507 | key.objectid = bytenr; | ||
508 | key.type = BTRFS_EXTENT_REF_KEY; | ||
509 | key.offset = parent; | ||
510 | |||
511 | ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); | ||
512 | if (ret < 0) | ||
513 | goto out; | ||
514 | if (ret > 0) { | ||
515 | ret = -ENOENT; | ||
516 | goto out; | ||
517 | } | ||
518 | |||
519 | leaf = path->nodes[0]; | ||
520 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | ||
521 | ref_objectid = btrfs_ref_objectid(leaf, ref); | ||
522 | if (btrfs_ref_root(leaf, ref) != ref_root || | ||
523 | btrfs_ref_generation(leaf, ref) != ref_generation || | ||
524 | (ref_objectid != owner_objectid && | ||
525 | ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { | ||
526 | ret = -EIO; | ||
527 | WARN_ON(1); | ||
528 | goto out; | ||
529 | } | ||
530 | ret = 0; | ||
531 | out: | ||
532 | return ret; | ||
533 | } | ||
534 | |||
535 | /* | ||
536 | * updates all the backrefs that are pending on update_list for the | ||
537 | * extent_root | ||
538 | */ | ||
539 | static noinline int update_backrefs(struct btrfs_trans_handle *trans, | ||
540 | struct btrfs_root *extent_root, | ||
541 | struct btrfs_path *path, | ||
542 | struct list_head *update_list) | ||
543 | { | ||
544 | struct btrfs_key key; | ||
545 | struct btrfs_extent_ref *ref; | ||
546 | struct btrfs_fs_info *info = extent_root->fs_info; | ||
547 | struct pending_extent_op *op; | ||
548 | struct extent_buffer *leaf; | ||
549 | int ret = 0; | ||
550 | struct list_head *cur = update_list->next; | ||
551 | u64 ref_objectid; | ||
552 | u64 ref_root = extent_root->root_key.objectid; | ||
553 | |||
554 | op = list_entry(cur, struct pending_extent_op, list); | ||
555 | |||
556 | search: | ||
557 | key.objectid = op->bytenr; | ||
558 | key.type = BTRFS_EXTENT_REF_KEY; | ||
559 | key.offset = op->orig_parent; | ||
560 | |||
561 | ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); | ||
562 | BUG_ON(ret); | ||
563 | |||
564 | leaf = path->nodes[0]; | ||
565 | |||
566 | loop: | ||
567 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | ||
568 | |||
569 | ref_objectid = btrfs_ref_objectid(leaf, ref); | ||
570 | |||
571 | if (btrfs_ref_root(leaf, ref) != ref_root || | ||
572 | btrfs_ref_generation(leaf, ref) != op->orig_generation || | ||
573 | (ref_objectid != op->level && | ||
574 | ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { | ||
575 | printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " | ||
576 | "root %llu, owner %u\n", | ||
577 | (unsigned long long)op->bytenr, | ||
578 | (unsigned long long)op->orig_parent, | ||
579 | (unsigned long long)ref_root, op->level); | ||
580 | btrfs_print_leaf(extent_root, leaf); | ||
581 | BUG(); | ||
582 | } | ||
583 | |||
584 | key.objectid = op->bytenr; | ||
585 | key.offset = op->parent; | ||
586 | key.type = BTRFS_EXTENT_REF_KEY; | ||
587 | ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); | ||
588 | BUG_ON(ret); | ||
589 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | ||
590 | btrfs_set_ref_generation(leaf, ref, op->generation); | ||
591 | |||
592 | cur = cur->next; | ||
593 | |||
594 | list_del_init(&op->list); | ||
595 | unlock_extent(&info->extent_ins, op->bytenr, | ||
596 | op->bytenr + op->num_bytes - 1, GFP_NOFS); | ||
597 | kfree(op); | ||
598 | |||
599 | if (cur == update_list) { | ||
600 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
601 | btrfs_release_path(extent_root, path); | ||
602 | goto out; | ||
603 | } | ||
604 | |||
605 | op = list_entry(cur, struct pending_extent_op, list); | ||
606 | |||
607 | path->slots[0]++; | ||
608 | while (path->slots[0] < btrfs_header_nritems(leaf)) { | ||
609 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
610 | if (key.objectid == op->bytenr && | ||
611 | key.type == BTRFS_EXTENT_REF_KEY) | ||
612 | goto loop; | ||
613 | path->slots[0]++; | ||
614 | } | ||
615 | |||
616 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
617 | btrfs_release_path(extent_root, path); | ||
618 | goto search; | ||
619 | |||
620 | out: | ||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | static noinline int insert_extents(struct btrfs_trans_handle *trans, | ||
625 | struct btrfs_root *extent_root, | ||
626 | struct btrfs_path *path, | ||
627 | struct list_head *insert_list, int nr) | ||
628 | { | ||
629 | struct btrfs_key *keys; | ||
630 | u32 *data_size; | ||
631 | struct pending_extent_op *op; | ||
632 | struct extent_buffer *leaf; | ||
633 | struct list_head *cur = insert_list->next; | ||
634 | struct btrfs_fs_info *info = extent_root->fs_info; | ||
635 | u64 ref_root = extent_root->root_key.objectid; | ||
636 | int i = 0, last = 0, ret; | ||
637 | int total = nr * 2; | ||
638 | |||
639 | if (!nr) | ||
640 | return 0; | ||
641 | |||
642 | keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); | ||
643 | if (!keys) | ||
644 | return -ENOMEM; | ||
645 | |||
646 | data_size = kzalloc(total * sizeof(u32), GFP_NOFS); | ||
647 | if (!data_size) { | ||
648 | kfree(keys); | ||
649 | return -ENOMEM; | ||
650 | } | ||
651 | |||
652 | list_for_each_entry(op, insert_list, list) { | ||
653 | keys[i].objectid = op->bytenr; | ||
654 | keys[i].offset = op->num_bytes; | ||
655 | keys[i].type = BTRFS_EXTENT_ITEM_KEY; | ||
656 | data_size[i] = sizeof(struct btrfs_extent_item); | ||
657 | i++; | ||
658 | |||
659 | keys[i].objectid = op->bytenr; | ||
660 | keys[i].offset = op->parent; | ||
661 | keys[i].type = BTRFS_EXTENT_REF_KEY; | ||
662 | data_size[i] = sizeof(struct btrfs_extent_ref); | ||
663 | i++; | ||
664 | } | ||
665 | |||
666 | op = list_entry(cur, struct pending_extent_op, list); | ||
667 | i = 0; | ||
668 | while (i < total) { | ||
669 | int c; | ||
670 | ret = btrfs_insert_some_items(trans, extent_root, path, | ||
671 | keys+i, data_size+i, total-i); | ||
672 | BUG_ON(ret < 0); | ||
673 | |||
674 | if (last && ret > 1) | ||
675 | BUG(); | ||
676 | |||
677 | leaf = path->nodes[0]; | ||
678 | for (c = 0; c < ret; c++) { | ||
679 | int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; | ||
680 | |||
681 | /* | ||
682 | * if the first item we inserted was a backref, then | ||
683 | * the EXTENT_ITEM will be the odd c's, else it will | ||
684 | * be the even c's | ||
685 | */ | ||
686 | if ((ref_first && (c % 2)) || | ||
687 | (!ref_first && !(c % 2))) { | ||
688 | struct btrfs_extent_item *itm; | ||
689 | |||
690 | itm = btrfs_item_ptr(leaf, path->slots[0] + c, | ||
691 | struct btrfs_extent_item); | ||
692 | btrfs_set_extent_refs(path->nodes[0], itm, 1); | ||
693 | op->del++; | ||
694 | } else { | ||
695 | struct btrfs_extent_ref *ref; | ||
696 | |||
697 | ref = btrfs_item_ptr(leaf, path->slots[0] + c, | ||
698 | struct btrfs_extent_ref); | ||
699 | btrfs_set_ref_root(leaf, ref, ref_root); | ||
700 | btrfs_set_ref_generation(leaf, ref, | ||
701 | op->generation); | ||
702 | btrfs_set_ref_objectid(leaf, ref, op->level); | ||
703 | btrfs_set_ref_num_refs(leaf, ref, 1); | ||
704 | op->del++; | ||
705 | } | ||
706 | |||
707 | /* | ||
708 | * using del to see when it's ok to free up the | ||
709 | * pending_extent_op. In the case where we insert the | ||
710 | * last item on the list in order to help do batching | ||
711 | * we need to not free the extent op until we actually | ||
712 | * insert the extent_item | ||
713 | */ | ||
714 | if (op->del == 2) { | ||
715 | unlock_extent(&info->extent_ins, op->bytenr, | ||
716 | op->bytenr + op->num_bytes - 1, | ||
717 | GFP_NOFS); | ||
718 | cur = cur->next; | ||
719 | list_del_init(&op->list); | ||
720 | kfree(op); | ||
721 | if (cur != insert_list) | ||
722 | op = list_entry(cur, | ||
723 | struct pending_extent_op, | ||
724 | list); | ||
725 | } | ||
726 | } | ||
727 | btrfs_mark_buffer_dirty(leaf); | ||
728 | btrfs_release_path(extent_root, path); | ||
729 | |||
730 | /* | ||
731 | * Ok, backrefs and items usually go right next to each other, | ||
732 | * but if we could only insert 1 item that means that we | ||
733 | * inserted on the end of a leaf, and we have no idea what may | ||
734 | * be on the next leaf so we just play it safe. In order to | ||
735 | * try and help this case we insert the last thing on our | ||
736 | * insert list so hopefully it will end up being the last | ||
737 | * thing on the leaf and everything else will be before it, | ||
738 | * which will let us insert a whole bunch of items at the same | ||
739 | * time. | ||
740 | */ | ||
741 | if (ret == 1 && !last && (i + ret < total)) { | ||
742 | /* | ||
743 | * last: where we will pick up the next time around | ||
744 | * i: our current key to insert, will be total - 1 | ||
745 | * cur: the current op we are screwing with | ||
746 | * op: duh | ||
747 | */ | ||
748 | last = i + ret; | ||
749 | i = total - 1; | ||
750 | cur = insert_list->prev; | ||
751 | op = list_entry(cur, struct pending_extent_op, list); | ||
752 | } else if (last) { | ||
753 | /* | ||
754 | * ok we successfully inserted the last item on the | ||
755 | * list, lets reset everything | ||
756 | * | ||
757 | * i: our current key to insert, so where we left off | ||
758 | * last time | ||
759 | * last: done with this | ||
760 | * cur: the op we are messing with | ||
761 | * op: duh | ||
762 | * total: since we inserted the last key, we need to | ||
763 | * decrement total so we don't overflow | ||
764 | */ | ||
765 | i = last; | ||
766 | last = 0; | ||
767 | total--; | ||
768 | if (i < total) { | ||
769 | cur = insert_list->next; | ||
770 | op = list_entry(cur, struct pending_extent_op, | ||
771 | list); | ||
772 | } | ||
773 | } else { | ||
774 | i += ret; | ||
775 | } | ||
776 | |||
777 | cond_resched(); | ||
778 | } | ||
779 | ret = 0; | ||
780 | kfree(keys); | ||
781 | kfree(data_size); | ||
782 | return ret; | ||
783 | } | ||
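The keys[]/data_size[] arrays built at the top of insert_extents() always interleave two items per pending op, so for a single op the batch handed to btrfs_insert_some_items() looks like this (a restatement of the setup loop, not new behavior):

    keys[0].objectid = op->bytenr;             data_size[0] = sizeof(struct btrfs_extent_item);
    keys[0].type     = BTRFS_EXTENT_ITEM_KEY;
    keys[0].offset   = op->num_bytes;

    keys[1].objectid = op->bytenr;             data_size[1] = sizeof(struct btrfs_extent_ref);
    keys[1].type     = BTRFS_EXTENT_REF_KEY;
    keys[1].offset   = op->parent;

The even/odd test in the insertion loop exists because a batch may resume partway through this pattern when an earlier call filled a leaf and only managed to insert an odd number of items.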
784 | |||
785 | static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, | ||
786 | struct btrfs_root *root, | ||
787 | struct btrfs_path *path, | ||
788 | u64 bytenr, u64 parent, | ||
789 | u64 ref_root, u64 ref_generation, | ||
790 | u64 owner_objectid) | ||
791 | { | ||
792 | struct btrfs_key key; | ||
793 | struct extent_buffer *leaf; | ||
794 | struct btrfs_extent_ref *ref; | ||
795 | u32 num_refs; | ||
796 | int ret; | ||
797 | |||
798 | key.objectid = bytenr; | ||
799 | key.type = BTRFS_EXTENT_REF_KEY; | ||
800 | key.offset = parent; | ||
801 | |||
802 | ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); | ||
803 | if (ret == 0) { | ||
804 | leaf = path->nodes[0]; | ||
805 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
806 | struct btrfs_extent_ref); | ||
807 | btrfs_set_ref_root(leaf, ref, ref_root); | ||
808 | btrfs_set_ref_generation(leaf, ref, ref_generation); | ||
809 | btrfs_set_ref_objectid(leaf, ref, owner_objectid); | ||
810 | btrfs_set_ref_num_refs(leaf, ref, 1); | ||
811 | } else if (ret == -EEXIST) { | ||
812 | u64 existing_owner; | ||
813 | BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); | ||
814 | leaf = path->nodes[0]; | ||
815 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
816 | struct btrfs_extent_ref); | ||
817 | if (btrfs_ref_root(leaf, ref) != ref_root || | ||
818 | btrfs_ref_generation(leaf, ref) != ref_generation) { | ||
819 | ret = -EIO; | ||
820 | WARN_ON(1); | ||
821 | goto out; | ||
822 | } | ||
823 | |||
824 | num_refs = btrfs_ref_num_refs(leaf, ref); | ||
825 | BUG_ON(num_refs == 0); | ||
826 | btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); | ||
827 | |||
828 | existing_owner = btrfs_ref_objectid(leaf, ref); | ||
829 | if (existing_owner != owner_objectid && | ||
830 | existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { | ||
831 | btrfs_set_ref_objectid(leaf, ref, | ||
832 | BTRFS_MULTIPLE_OBJECTIDS); | ||
833 | } | ||
834 | ret = 0; | ||
835 | } else { | ||
836 | goto out; | ||
837 | } | ||
838 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
839 | out: | ||
840 | btrfs_release_path(root, path); | ||
841 | return ret; | ||
842 | } | ||
843 | |||
844 | static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, | ||
845 | struct btrfs_root *root, | ||
846 | struct btrfs_path *path) | ||
847 | { | ||
848 | struct extent_buffer *leaf; | ||
849 | struct btrfs_extent_ref *ref; | ||
850 | u32 num_refs; | ||
851 | int ret = 0; | ||
852 | |||
853 | leaf = path->nodes[0]; | ||
854 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | ||
855 | num_refs = btrfs_ref_num_refs(leaf, ref); | ||
856 | BUG_ON(num_refs == 0); | ||
857 | num_refs -= 1; | ||
858 | if (num_refs == 0) { | ||
859 | ret = btrfs_del_item(trans, root, path); | ||
860 | } else { | ||
861 | btrfs_set_ref_num_refs(leaf, ref, num_refs); | ||
862 | btrfs_mark_buffer_dirty(leaf); | ||
863 | } | ||
864 | btrfs_release_path(root, path); | ||
865 | return ret; | ||
866 | } | ||
867 | |||
868 | #ifdef BIO_RW_DISCARD | ||
869 | static void btrfs_issue_discard(struct block_device *bdev, | ||
870 | u64 start, u64 len) | ||
871 | { | ||
872 | blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); | ||
873 | } | ||
874 | #endif | ||
875 | |||
876 | static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | ||
877 | u64 num_bytes) | ||
878 | { | ||
879 | #ifdef BIO_RW_DISCARD | ||
880 | int ret; | ||
881 | u64 map_length = num_bytes; | ||
882 | struct btrfs_multi_bio *multi = NULL; | ||
883 | |||
884 | /* Tell the block device(s) that the sectors can be discarded */ | ||
885 | ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, | ||
886 | bytenr, &map_length, &multi, 0); | ||
887 | if (!ret) { | ||
888 | struct btrfs_bio_stripe *stripe = multi->stripes; | ||
889 | int i; | ||
890 | |||
891 | if (map_length > num_bytes) | ||
892 | map_length = num_bytes; | ||
893 | |||
894 | for (i = 0; i < multi->num_stripes; i++, stripe++) { | ||
895 | btrfs_issue_discard(stripe->dev->bdev, | ||
896 | stripe->physical, | ||
897 | map_length); | ||
898 | } | ||
899 | kfree(multi); | ||
900 | } | ||
901 | |||
902 | return ret; | ||
903 | #else | ||
904 | return 0; | ||
905 | #endif | ||
906 | } | ||
907 | |||
908 | static noinline int free_extents(struct btrfs_trans_handle *trans, | ||
909 | struct btrfs_root *extent_root, | ||
910 | struct list_head *del_list) | ||
911 | { | ||
912 | struct btrfs_fs_info *info = extent_root->fs_info; | ||
913 | struct btrfs_path *path; | ||
914 | struct btrfs_key key, found_key; | ||
915 | struct extent_buffer *leaf; | ||
916 | struct list_head *cur; | ||
917 | struct pending_extent_op *op; | ||
918 | struct btrfs_extent_item *ei; | ||
919 | int ret, num_to_del, extent_slot = 0, found_extent = 0; | ||
920 | u32 refs; | ||
921 | u64 bytes_freed = 0; | ||
922 | |||
923 | path = btrfs_alloc_path(); | ||
924 | if (!path) | ||
925 | return -ENOMEM; | ||
926 | path->reada = 1; | ||
927 | |||
928 | search: | ||
929 | /* search for the backref for the current ref we want to delete */ | ||
930 | cur = del_list->next; | ||
931 | op = list_entry(cur, struct pending_extent_op, list); | ||
932 | ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, | ||
933 | op->orig_parent, | ||
934 | extent_root->root_key.objectid, | ||
935 | op->orig_generation, op->level, 1); | ||
936 | if (ret) { | ||
937 | printk(KERN_ERR "btrfs unable to find backref byte nr %llu " | ||
938 | "root %llu gen %llu owner %u\n", | ||
939 | (unsigned long long)op->bytenr, | ||
940 | (unsigned long long)extent_root->root_key.objectid, | ||
941 | (unsigned long long)op->orig_generation, op->level); | ||
942 | btrfs_print_leaf(extent_root, path->nodes[0]); | ||
943 | WARN_ON(1); | ||
944 | goto out; | ||
945 | } | ||
946 | |||
947 | extent_slot = path->slots[0]; | ||
948 | num_to_del = 1; | ||
949 | found_extent = 0; | ||
950 | |||
951 | /* | ||
952 | * if we aren't at the first item on the leaf, we can move back one and see | ||
953 | * if our ref is right next to our extent item | ||
954 | */ | ||
955 | if (likely(extent_slot)) { | ||
956 | extent_slot--; | ||
957 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
958 | extent_slot); | ||
959 | if (found_key.objectid == op->bytenr && | ||
960 | found_key.type == BTRFS_EXTENT_ITEM_KEY && | ||
961 | found_key.offset == op->num_bytes) { | ||
962 | num_to_del++; | ||
963 | found_extent = 1; | ||
964 | } | ||
965 | } | ||
966 | |||
967 | /* | ||
968 | * if we didn't find the extent we need to delete the backref and then | ||
969 | * search for the extent item key so we can update its ref count | ||
970 | */ | ||
971 | if (!found_extent) { | ||
972 | key.objectid = op->bytenr; | ||
973 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
974 | key.offset = op->num_bytes; | ||
975 | |||
976 | ret = remove_extent_backref(trans, extent_root, path); | ||
977 | BUG_ON(ret); | ||
978 | btrfs_release_path(extent_root, path); | ||
979 | ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); | ||
980 | BUG_ON(ret); | ||
981 | extent_slot = path->slots[0]; | ||
982 | } | ||
983 | |||
984 | /* this is where we update the ref count for the extent */ | ||
985 | leaf = path->nodes[0]; | ||
986 | ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); | ||
987 | refs = btrfs_extent_refs(leaf, ei); | ||
988 | BUG_ON(refs == 0); | ||
989 | refs--; | ||
990 | btrfs_set_extent_refs(leaf, ei, refs); | ||
991 | |||
992 | btrfs_mark_buffer_dirty(leaf); | ||
993 | |||
994 | /* | ||
995 | * This extent needs deleting. The reason cur_slot is extent_slot + | ||
996 | * num_to_del is that extent_slot points to the slot where the extent | ||
997 | * is, and if the backref was not right next to the extent we will be | ||
998 | * deleting at least 1 item, and will want to start searching at the | ||
999 | * slot directly next to extent_slot. However, if we did find the | ||
1000 | * backref next to the extent item then we will be deleting at least 2 | ||
1001 | * items and will want to start searching directly after the ref slot. | ||
1002 | */ | ||
1003 | if (!refs) { | ||
1004 | struct list_head *pos, *n, *end; | ||
1005 | int cur_slot = extent_slot+num_to_del; | ||
1006 | u64 super_used; | ||
1007 | u64 root_used; | ||
1008 | |||
1009 | path->slots[0] = extent_slot; | ||
1010 | bytes_freed = op->num_bytes; | ||
1011 | |||
1012 | mutex_lock(&info->pinned_mutex); | ||
1013 | ret = pin_down_bytes(trans, extent_root, op->bytenr, | ||
1014 | op->num_bytes, op->level >= | ||
1015 | BTRFS_FIRST_FREE_OBJECTID); | ||
1016 | mutex_unlock(&info->pinned_mutex); | ||
1017 | BUG_ON(ret < 0); | ||
1018 | op->del = ret; | ||
1019 | |||
1020 | /* | ||
1021 | * we need to see if we can delete multiple things at once, so | ||
1022 | * start looping through the list of extents we are wanting to | ||
1023 | * delete and see if their extent/backref's are right next to | ||
1024 | * eachother and the extents only have 1 ref | ||
1025 | */ | ||
1026 | for (pos = cur->next; pos != del_list; pos = pos->next) { | ||
1027 | struct pending_extent_op *tmp; | ||
1028 | |||
1029 | tmp = list_entry(pos, struct pending_extent_op, list); | ||
1030 | |||
1031 | /* we only want to delete extent+ref at this stage */ | ||
1032 | if (cur_slot >= btrfs_header_nritems(leaf) - 1) | ||
1033 | break; | ||
1034 | |||
1035 | btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); | ||
1036 | if (found_key.objectid != tmp->bytenr || | ||
1037 | found_key.type != BTRFS_EXTENT_ITEM_KEY || | ||
1038 | found_key.offset != tmp->num_bytes) | ||
1039 | break; | ||
1040 | |||
1041 | /* check to make sure this extent only has one ref */ | ||
1042 | ei = btrfs_item_ptr(leaf, cur_slot, | ||
1043 | struct btrfs_extent_item); | ||
1044 | if (btrfs_extent_refs(leaf, ei) != 1) | ||
1045 | break; | ||
1046 | |||
1047 | btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); | ||
1048 | if (found_key.objectid != tmp->bytenr || | ||
1049 | found_key.type != BTRFS_EXTENT_REF_KEY || | ||
1050 | found_key.offset != tmp->orig_parent) | ||
1051 | break; | ||
1052 | |||
1053 | /* | ||
1054 | * the ref is right next to the extent, so we can set the | ||
1055 | * ref count to 0 since we will delete them both now | ||
1056 | */ | ||
1057 | btrfs_set_extent_refs(leaf, ei, 0); | ||
1058 | |||
1059 | /* pin down the bytes for this extent */ | ||
1060 | mutex_lock(&info->pinned_mutex); | ||
1061 | ret = pin_down_bytes(trans, extent_root, tmp->bytenr, | ||
1062 | tmp->num_bytes, tmp->level >= | ||
1063 | BTRFS_FIRST_FREE_OBJECTID); | ||
1064 | mutex_unlock(&info->pinned_mutex); | ||
1065 | BUG_ON(ret < 0); | ||
1066 | |||
1067 | /* | ||
1068 | * use the del field to tell if we need to go ahead and | ||
1069 | * free up the extent when we delete the item or not. | ||
1070 | */ | ||
1071 | tmp->del = ret; | ||
1072 | bytes_freed += tmp->num_bytes; | ||
1073 | |||
1074 | num_to_del += 2; | ||
1075 | cur_slot += 2; | ||
1076 | } | ||
1077 | end = pos; | ||
1078 | |||
1079 | /* update the free space counters */ | ||
1080 | spin_lock(&info->delalloc_lock); | ||
1081 | super_used = btrfs_super_bytes_used(&info->super_copy); | ||
1082 | btrfs_set_super_bytes_used(&info->super_copy, | ||
1083 | super_used - bytes_freed); | ||
1084 | |||
1085 | root_used = btrfs_root_used(&extent_root->root_item); | ||
1086 | btrfs_set_root_used(&extent_root->root_item, | ||
1087 | root_used - bytes_freed); | ||
1088 | spin_unlock(&info->delalloc_lock); | ||
1089 | |||
1090 | /* delete the items */ | ||
1091 | ret = btrfs_del_items(trans, extent_root, path, | ||
1092 | path->slots[0], num_to_del); | ||
1093 | BUG_ON(ret); | ||
1094 | |||
1095 | /* | ||
1096 | * loop through the extents we deleted and do the cleanup work | ||
1097 | * on them | ||
1098 | */ | ||
1099 | for (pos = cur, n = pos->next; pos != end; | ||
1100 | pos = n, n = pos->next) { | ||
1101 | struct pending_extent_op *tmp; | ||
1102 | tmp = list_entry(pos, struct pending_extent_op, list); | ||
1103 | |||
1104 | /* | ||
1105 | * remember tmp->del tells us whether or not we pinned | ||
1106 | * down the extent | ||
1107 | */ | ||
1108 | ret = update_block_group(trans, extent_root, | ||
1109 | tmp->bytenr, tmp->num_bytes, 0, | ||
1110 | tmp->del); | ||
1111 | BUG_ON(ret); | ||
1112 | |||
1113 | list_del_init(&tmp->list); | ||
1114 | unlock_extent(&info->extent_ins, tmp->bytenr, | ||
1115 | tmp->bytenr + tmp->num_bytes - 1, | ||
1116 | GFP_NOFS); | ||
1117 | kfree(tmp); | ||
1118 | } | ||
1119 | } else if (refs && found_extent) { | ||
1120 | /* | ||
1121 | * the ref and extent were right next to each other, but the | ||
1122 | * extent still has a ref, so just free the backref and keep | ||
1123 | * going | ||
1124 | */ | ||
1125 | ret = remove_extent_backref(trans, extent_root, path); | ||
1126 | BUG_ON(ret); | ||
1127 | |||
1128 | list_del_init(&op->list); | ||
1129 | unlock_extent(&info->extent_ins, op->bytenr, | ||
1130 | op->bytenr + op->num_bytes - 1, GFP_NOFS); | ||
1131 | kfree(op); | ||
1132 | } else { | ||
1133 | /* | ||
1134 | * the extent has multiple refs and the backref we were looking | ||
1135 | * for was not right next to it, so just unlock and go next, | ||
1136 | * we're good to go | ||
1137 | */ | ||
1138 | list_del_init(&op->list); | ||
1139 | unlock_extent(&info->extent_ins, op->bytenr, | ||
1140 | op->bytenr + op->num_bytes - 1, GFP_NOFS); | ||
1141 | kfree(op); | ||
1142 | } | ||
1143 | |||
1144 | btrfs_release_path(extent_root, path); | ||
1145 | if (!list_empty(del_list)) | ||
1146 | goto search; | ||
1147 | |||
1148 | out: | ||
1149 | btrfs_free_path(path); | ||
1150 | return ret; | ||
1151 | } | ||
1152 | |||
1153 | static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | ||
1154 | struct btrfs_root *root, u64 bytenr, | ||
1155 | u64 orig_parent, u64 parent, | ||
1156 | u64 orig_root, u64 ref_root, | ||
1157 | u64 orig_generation, u64 ref_generation, | ||
1158 | u64 owner_objectid) | ||
1159 | { | ||
1160 | int ret; | ||
1161 | struct btrfs_root *extent_root = root->fs_info->extent_root; | ||
1162 | struct btrfs_path *path; | ||
1163 | |||
1164 | if (root == root->fs_info->extent_root) { | ||
1165 | struct pending_extent_op *extent_op; | ||
1166 | u64 num_bytes; | ||
1167 | |||
1168 | BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); | ||
1169 | num_bytes = btrfs_level_size(root, (int)owner_objectid); | ||
1170 | mutex_lock(&root->fs_info->extent_ins_mutex); | ||
1171 | if (test_range_bit(&root->fs_info->extent_ins, bytenr, | ||
1172 | bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { | ||
1173 | u64 priv; | ||
1174 | ret = get_state_private(&root->fs_info->extent_ins, | ||
1175 | bytenr, &priv); | ||
1176 | BUG_ON(ret); | ||
1177 | extent_op = (struct pending_extent_op *) | ||
1178 | (unsigned long)priv; | ||
1179 | BUG_ON(extent_op->parent != orig_parent); | ||
1180 | BUG_ON(extent_op->generation != orig_generation); | ||
1181 | |||
1182 | extent_op->parent = parent; | ||
1183 | extent_op->generation = ref_generation; | ||
1184 | } else { | ||
1185 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | ||
1186 | BUG_ON(!extent_op); | ||
1187 | |||
1188 | extent_op->type = PENDING_BACKREF_UPDATE; | ||
1189 | extent_op->bytenr = bytenr; | ||
1190 | extent_op->num_bytes = num_bytes; | ||
1191 | extent_op->parent = parent; | ||
1192 | extent_op->orig_parent = orig_parent; | ||
1193 | extent_op->generation = ref_generation; | ||
1194 | extent_op->orig_generation = orig_generation; | ||
1195 | extent_op->level = (int)owner_objectid; | ||
1196 | INIT_LIST_HEAD(&extent_op->list); | ||
1197 | extent_op->del = 0; | ||
1198 | |||
1199 | set_extent_bits(&root->fs_info->extent_ins, | ||
1200 | bytenr, bytenr + num_bytes - 1, | ||
1201 | EXTENT_WRITEBACK, GFP_NOFS); | ||
1202 | set_state_private(&root->fs_info->extent_ins, | ||
1203 | bytenr, (unsigned long)extent_op); | ||
1204 | } | ||
1205 | mutex_unlock(&root->fs_info->extent_ins_mutex); | ||
1206 | return 0; | ||
1207 | } | ||
1208 | |||
1209 | path = btrfs_alloc_path(); | ||
1210 | if (!path) | ||
1211 | return -ENOMEM; | ||
1212 | ret = lookup_extent_backref(trans, extent_root, path, | ||
1213 | bytenr, orig_parent, orig_root, | ||
1214 | orig_generation, owner_objectid, 1); | ||
1215 | if (ret) | ||
1216 | goto out; | ||
1217 | ret = remove_extent_backref(trans, extent_root, path); | ||
1218 | if (ret) | ||
1219 | goto out; | ||
1220 | ret = insert_extent_backref(trans, extent_root, path, bytenr, | ||
1221 | parent, ref_root, ref_generation, | ||
1222 | owner_objectid); | ||
1223 | BUG_ON(ret); | ||
1224 | finish_current_insert(trans, extent_root, 0); | ||
1225 | del_pending_extents(trans, extent_root, 0); | ||
1226 | out: | ||
1227 | btrfs_free_path(path); | ||
1228 | return ret; | ||
1229 | } | ||
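When root is the extent root, the function above never touches the backref item directly; it parks a pending_extent_op by tagging the byte range EXTENT_WRITEBACK in the extent_ins tree and stashing the pointer in the extent state's private field. The retrieval side, used by finish_current_insert() and del_pending_extents(), is the mirror image; a sketch under the same conventions (get_state_private() returning 0 on a hit, as in the code above):

    u64 priv;

    if (get_state_private(&info->extent_ins, bytenr, &priv) == 0) {
            struct pending_extent_op *op;

            op = (struct pending_extent_op *)(unsigned long)priv;
            /* ... apply op->parent / op->generation ... */
    }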
1230 | |||
1231 | int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | ||
1232 | struct btrfs_root *root, u64 bytenr, | ||
1233 | u64 orig_parent, u64 parent, | ||
1234 | u64 ref_root, u64 ref_generation, | ||
1235 | u64 owner_objectid) | ||
1236 | { | ||
1237 | int ret; | ||
1238 | if (ref_root == BTRFS_TREE_LOG_OBJECTID && | ||
1239 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) | ||
1240 | return 0; | ||
1241 | ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, | ||
1242 | parent, ref_root, ref_root, | ||
1243 | ref_generation, ref_generation, | ||
1244 | owner_objectid); | ||
1245 | return ret; | ||
1246 | } | ||
1247 | |||
1248 | static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | ||
1249 | struct btrfs_root *root, u64 bytenr, | ||
1250 | u64 orig_parent, u64 parent, | ||
1251 | u64 orig_root, u64 ref_root, | ||
1252 | u64 orig_generation, u64 ref_generation, | ||
1253 | u64 owner_objectid) | ||
1254 | { | ||
1255 | struct btrfs_path *path; | ||
1256 | int ret; | ||
1257 | struct btrfs_key key; | ||
1258 | struct extent_buffer *l; | ||
1259 | struct btrfs_extent_item *item; | ||
1260 | u32 refs; | ||
1261 | |||
1262 | path = btrfs_alloc_path(); | ||
1263 | if (!path) | ||
1264 | return -ENOMEM; | ||
1265 | |||
1266 | path->reada = 1; | ||
1267 | key.objectid = bytenr; | ||
1268 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
1269 | key.offset = (u64)-1; | ||
1270 | |||
1271 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, | ||
1272 | 0, 1); | ||
1273 | if (ret < 0) | ||
1274 | return ret; | ||
1275 | BUG_ON(ret == 0 || path->slots[0] == 0); | ||
1276 | |||
1277 | path->slots[0]--; | ||
1278 | l = path->nodes[0]; | ||
1279 | |||
1280 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | ||
1281 | if (key.objectid != bytenr) { | ||
1282 | btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); | ||
1283 | printk(KERN_ERR "btrfs wanted %llu found %llu\n", | ||
1284 | (unsigned long long)bytenr, | ||
1285 | (unsigned long long)key.objectid); | ||
1286 | BUG(); | ||
1287 | } | ||
1288 | BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); | ||
1289 | |||
1290 | item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); | ||
1291 | refs = btrfs_extent_refs(l, item); | ||
1292 | btrfs_set_extent_refs(l, item, refs + 1); | ||
1293 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
1294 | |||
1295 | btrfs_release_path(root->fs_info->extent_root, path); | ||
1296 | |||
1297 | path->reada = 1; | ||
1298 | ret = insert_extent_backref(trans, root->fs_info->extent_root, | ||
1299 | path, bytenr, parent, | ||
1300 | ref_root, ref_generation, | ||
1301 | owner_objectid); | ||
1302 | BUG_ON(ret); | ||
1303 | finish_current_insert(trans, root->fs_info->extent_root, 0); | ||
1304 | del_pending_extents(trans, root->fs_info->extent_root, 0); | ||
1305 | |||
1306 | btrfs_free_path(path); | ||
1307 | return 0; | ||
1308 | } | ||
1309 | |||
1310 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | ||
1311 | struct btrfs_root *root, | ||
1312 | u64 bytenr, u64 num_bytes, u64 parent, | ||
1313 | u64 ref_root, u64 ref_generation, | ||
1314 | u64 owner_objectid) | ||
1315 | { | ||
1316 | int ret; | ||
1317 | if (ref_root == BTRFS_TREE_LOG_OBJECTID && | ||
1318 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) | ||
1319 | return 0; | ||
1320 | ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, | ||
1321 | 0, ref_root, 0, ref_generation, | ||
1322 | owner_objectid); | ||
1323 | return ret; | ||
1324 | } | ||
1325 | |||
1326 | int btrfs_extent_post_op(struct btrfs_trans_handle *trans, | ||
1327 | struct btrfs_root *root) | ||
1328 | { | ||
1329 | finish_current_insert(trans, root->fs_info->extent_root, 1); | ||
1330 | del_pending_extents(trans, root->fs_info->extent_root, 1); | ||
1331 | return 0; | ||
1332 | } | ||
1333 | |||
1334 | int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, | ||
1335 | struct btrfs_root *root, u64 bytenr, | ||
1336 | u64 num_bytes, u32 *refs) | ||
1337 | { | ||
1338 | struct btrfs_path *path; | ||
1339 | int ret; | ||
1340 | struct btrfs_key key; | ||
1341 | struct extent_buffer *l; | ||
1342 | struct btrfs_extent_item *item; | ||
1343 | |||
1344 | WARN_ON(num_bytes < root->sectorsize); | ||
1345 | path = btrfs_alloc_path(); | ||
1346 | path->reada = 1; | ||
1347 | key.objectid = bytenr; | ||
1348 | key.offset = num_bytes; | ||
1349 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); | ||
1350 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, | ||
1351 | 0, 0); | ||
1352 | if (ret < 0) | ||
1353 | goto out; | ||
1354 | if (ret != 0) { | ||
1355 | btrfs_print_leaf(root, path->nodes[0]); | ||
1356 | printk(KERN_INFO "btrfs failed to find block number %llu\n", | ||
1357 | (unsigned long long)bytenr); | ||
1358 | BUG(); | ||
1359 | } | ||
1360 | l = path->nodes[0]; | ||
1361 | item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); | ||
1362 | *refs = btrfs_extent_refs(l, item); | ||
1363 | out: | ||
1364 | btrfs_free_path(path); | ||
1365 | return 0; | ||
1366 | } | ||
1367 | |||
1368 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | ||
1369 | struct btrfs_root *root, u64 objectid, u64 bytenr) | ||
1370 | { | ||
1371 | struct btrfs_root *extent_root = root->fs_info->extent_root; | ||
1372 | struct btrfs_path *path; | ||
1373 | struct extent_buffer *leaf; | ||
1374 | struct btrfs_extent_ref *ref_item; | ||
1375 | struct btrfs_key key; | ||
1376 | struct btrfs_key found_key; | ||
1377 | u64 ref_root; | ||
1378 | u64 last_snapshot; | ||
1379 | u32 nritems; | ||
1380 | int ret; | ||
1381 | |||
1382 | key.objectid = bytenr; | ||
1383 | key.offset = (u64)-1; | ||
1384 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
1385 | |||
1386 | path = btrfs_alloc_path(); | ||
1387 | ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); | ||
1388 | if (ret < 0) | ||
1389 | goto out; | ||
1390 | BUG_ON(ret == 0); | ||
1391 | |||
1392 | ret = -ENOENT; | ||
1393 | if (path->slots[0] == 0) | ||
1394 | goto out; | ||
1395 | |||
1396 | path->slots[0]--; | ||
1397 | leaf = path->nodes[0]; | ||
1398 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
1399 | |||
1400 | if (found_key.objectid != bytenr || | ||
1401 | found_key.type != BTRFS_EXTENT_ITEM_KEY) | ||
1402 | goto out; | ||
1403 | |||
1404 | last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
1405 | while (1) { | ||
1406 | leaf = path->nodes[0]; | ||
1407 | nritems = btrfs_header_nritems(leaf); | ||
1408 | if (path->slots[0] >= nritems) { | ||
1409 | ret = btrfs_next_leaf(extent_root, path); | ||
1410 | if (ret < 0) | ||
1411 | goto out; | ||
1412 | if (ret == 0) | ||
1413 | continue; | ||
1414 | break; | ||
1415 | } | ||
1416 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
1417 | if (found_key.objectid != bytenr) | ||
1418 | break; | ||
1419 | |||
1420 | if (found_key.type != BTRFS_EXTENT_REF_KEY) { | ||
1421 | path->slots[0]++; | ||
1422 | continue; | ||
1423 | } | ||
1424 | |||
1425 | ref_item = btrfs_item_ptr(leaf, path->slots[0], | ||
1426 | struct btrfs_extent_ref); | ||
1427 | ref_root = btrfs_ref_root(leaf, ref_item); | ||
1428 | if ((ref_root != root->root_key.objectid && | ||
1429 | ref_root != BTRFS_TREE_LOG_OBJECTID) || | ||
1430 | objectid != btrfs_ref_objectid(leaf, ref_item)) { | ||
1431 | ret = 1; | ||
1432 | goto out; | ||
1433 | } | ||
1434 | if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { | ||
1435 | ret = 1; | ||
1436 | goto out; | ||
1437 | } | ||
1438 | |||
1439 | path->slots[0]++; | ||
1440 | } | ||
1441 | ret = 0; | ||
1442 | out: | ||
1443 | btrfs_free_path(path); | ||
1444 | return ret; | ||
1445 | } | ||
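
Stripped of the tree walk, the per-reference decision btrfs_cross_ref_exist() makes is simple: a back reference from another root or inode, or one no newer than the root's last snapshot, means the extent may be shared. A hedged sketch of just that predicate (struct ref_rec and cross_ref_exist() are invented for illustration; the tree-log special case is omitted):

#include <stdio.h>

/* hypothetical flattened back-reference record */
struct ref_rec {
    unsigned long long root;       /* owning tree */
    unsigned long long objectid;   /* inode holding the reference */
    unsigned long long generation; /* transaction that added it */
};

/* returns 1 if any ref suggests the extent is shared, 0 otherwise */
static int cross_ref_exist(const struct ref_rec *refs, int n,
                           unsigned long long my_root,
                           unsigned long long my_objectid,
                           unsigned long long last_snapshot)
{
    int i;

    for (i = 0; i < n; i++) {
        if (refs[i].root != my_root ||
            refs[i].objectid != my_objectid)
            return 1;
        if (refs[i].generation <= last_snapshot)
            return 1;
    }
    return 0;
}

int main(void)
{
    struct ref_rec refs[] = {
        { 5, 257, 12 }, /* our root/inode, newer than the snapshot */
        { 7, 257, 12 }, /* foreign root: shared */
    };

    printf("%d\n", cross_ref_exist(refs, 2, 5, 257, 10)); /* prints 1 */
    return 0;
}
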
1446 | |||
1447 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
1448 | struct extent_buffer *buf, u32 nr_extents) | ||
1449 | { | ||
1450 | struct btrfs_key key; | ||
1451 | struct btrfs_file_extent_item *fi; | ||
1452 | u64 root_gen; | ||
1453 | u32 nritems; | ||
1454 | int i; | ||
1455 | int level; | ||
1456 | int ret = 0; | ||
1457 | int shared = 0; | ||
1458 | |||
1459 | if (!root->ref_cows) | ||
1460 | return 0; | ||
1461 | |||
1462 | if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { | ||
1463 | shared = 0; | ||
1464 | root_gen = root->root_key.offset; | ||
1465 | } else { | ||
1466 | shared = 1; | ||
1467 | root_gen = trans->transid - 1; | ||
1468 | } | ||
1469 | |||
1470 | level = btrfs_header_level(buf); | ||
1471 | nritems = btrfs_header_nritems(buf); | ||
1472 | |||
1473 | if (level == 0) { | ||
1474 | struct btrfs_leaf_ref *ref; | ||
1475 | struct btrfs_extent_info *info; | ||
1476 | |||
1477 | ref = btrfs_alloc_leaf_ref(root, nr_extents); | ||
1478 | if (!ref) { | ||
1479 | ret = -ENOMEM; | ||
1480 | goto out; | ||
1481 | } | ||
1482 | |||
1483 | ref->root_gen = root_gen; | ||
1484 | ref->bytenr = buf->start; | ||
1485 | ref->owner = btrfs_header_owner(buf); | ||
1486 | ref->generation = btrfs_header_generation(buf); | ||
1487 | ref->nritems = nr_extents; | ||
1488 | info = ref->extents; | ||
1489 | |||
1490 | for (i = 0; nr_extents > 0 && i < nritems; i++) { | ||
1491 | u64 disk_bytenr; | ||
1492 | btrfs_item_key_to_cpu(buf, &key, i); | ||
1493 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
1494 | continue; | ||
1495 | fi = btrfs_item_ptr(buf, i, | ||
1496 | struct btrfs_file_extent_item); | ||
1497 | if (btrfs_file_extent_type(buf, fi) == | ||
1498 | BTRFS_FILE_EXTENT_INLINE) | ||
1499 | continue; | ||
1500 | disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); | ||
1501 | if (disk_bytenr == 0) | ||
1502 | continue; | ||
1503 | |||
1504 | info->bytenr = disk_bytenr; | ||
1505 | info->num_bytes = | ||
1506 | btrfs_file_extent_disk_num_bytes(buf, fi); | ||
1507 | info->objectid = key.objectid; | ||
1508 | info->offset = key.offset; | ||
1509 | info++; | ||
1510 | } | ||
1511 | |||
1512 | ret = btrfs_add_leaf_ref(root, ref, shared); | ||
1513 | if (ret == -EEXIST && shared) { | ||
1514 | struct btrfs_leaf_ref *old; | ||
1515 | old = btrfs_lookup_leaf_ref(root, ref->bytenr); | ||
1516 | BUG_ON(!old); | ||
1517 | btrfs_remove_leaf_ref(root, old); | ||
1518 | btrfs_free_leaf_ref(root, old); | ||
1519 | ret = btrfs_add_leaf_ref(root, ref, shared); | ||
1520 | } | ||
1521 | WARN_ON(ret); | ||
1522 | btrfs_free_leaf_ref(root, ref); | ||
1523 | } | ||
1524 | out: | ||
1525 | return ret; | ||
1526 | } | ||
1527 | |||
1528 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
1529 | struct extent_buffer *orig_buf, struct extent_buffer *buf, | ||
1530 | u32 *nr_extents) | ||
1531 | { | ||
1532 | u64 bytenr; | ||
1533 | u64 ref_root; | ||
1534 | u64 orig_root; | ||
1535 | u64 ref_generation; | ||
1536 | u64 orig_generation; | ||
1537 | u32 nritems; | ||
1538 | u32 nr_file_extents = 0; | ||
1539 | struct btrfs_key key; | ||
1540 | struct btrfs_file_extent_item *fi; | ||
1541 | int i; | ||
1542 | int level; | ||
1543 | int ret = 0; | ||
1544 | int faili = 0; | ||
1545 | int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, | ||
1546 | u64, u64, u64, u64, u64, u64, u64, u64); | ||
1547 | |||
1548 | ref_root = btrfs_header_owner(buf); | ||
1549 | ref_generation = btrfs_header_generation(buf); | ||
1550 | orig_root = btrfs_header_owner(orig_buf); | ||
1551 | orig_generation = btrfs_header_generation(orig_buf); | ||
1552 | |||
1553 | nritems = btrfs_header_nritems(buf); | ||
1554 | level = btrfs_header_level(buf); | ||
1555 | |||
1556 | if (root->ref_cows) { | ||
1557 | process_func = __btrfs_inc_extent_ref; | ||
1558 | } else { | ||
1559 | if (level == 0 && | ||
1560 | root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | ||
1561 | goto out; | ||
1562 | if (level != 0 && | ||
1563 | root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) | ||
1564 | goto out; | ||
1565 | process_func = __btrfs_update_extent_ref; | ||
1566 | } | ||
1567 | |||
1568 | for (i = 0; i < nritems; i++) { | ||
1569 | cond_resched(); | ||
1570 | if (level == 0) { | ||
1571 | btrfs_item_key_to_cpu(buf, &key, i); | ||
1572 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
1573 | continue; | ||
1574 | fi = btrfs_item_ptr(buf, i, | ||
1575 | struct btrfs_file_extent_item); | ||
1576 | if (btrfs_file_extent_type(buf, fi) == | ||
1577 | BTRFS_FILE_EXTENT_INLINE) | ||
1578 | continue; | ||
1579 | bytenr = btrfs_file_extent_disk_bytenr(buf, fi); | ||
1580 | if (bytenr == 0) | ||
1581 | continue; | ||
1582 | |||
1583 | nr_file_extents++; | ||
1584 | |||
1585 | ret = process_func(trans, root, bytenr, | ||
1586 | orig_buf->start, buf->start, | ||
1587 | orig_root, ref_root, | ||
1588 | orig_generation, ref_generation, | ||
1589 | key.objectid); | ||
1590 | |||
1591 | if (ret) { | ||
1592 | faili = i; | ||
1593 | WARN_ON(1); | ||
1594 | goto fail; | ||
1595 | } | ||
1596 | } else { | ||
1597 | bytenr = btrfs_node_blockptr(buf, i); | ||
1598 | ret = process_func(trans, root, bytenr, | ||
1599 | orig_buf->start, buf->start, | ||
1600 | orig_root, ref_root, | ||
1601 | orig_generation, ref_generation, | ||
1602 | level - 1); | ||
1603 | if (ret) { | ||
1604 | faili = i; | ||
1605 | WARN_ON(1); | ||
1606 | goto fail; | ||
1607 | } | ||
1608 | } | ||
1609 | } | ||
1610 | out: | ||
1611 | if (nr_extents) { | ||
1612 | if (level == 0) | ||
1613 | *nr_extents = nr_file_extents; | ||
1614 | else | ||
1615 | *nr_extents = nritems; | ||
1616 | } | ||
1617 | return 0; | ||
1618 | fail: | ||
1619 | WARN_ON(1); | ||
1620 | return ret; | ||
1621 | } | ||
1622 | |||
1623 | int btrfs_update_ref(struct btrfs_trans_handle *trans, | ||
1624 | struct btrfs_root *root, struct extent_buffer *orig_buf, | ||
1625 | struct extent_buffer *buf, int start_slot, int nr) | ||
1626 | |||
1627 | { | ||
1628 | u64 bytenr; | ||
1629 | u64 ref_root; | ||
1630 | u64 orig_root; | ||
1631 | u64 ref_generation; | ||
1632 | u64 orig_generation; | ||
1633 | struct btrfs_key key; | ||
1634 | struct btrfs_file_extent_item *fi; | ||
1635 | int i; | ||
1636 | int ret; | ||
1637 | int slot; | ||
1638 | int level; | ||
1639 | |||
1640 | BUG_ON(start_slot < 0); | ||
1641 | BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); | ||
1642 | |||
1643 | ref_root = btrfs_header_owner(buf); | ||
1644 | ref_generation = btrfs_header_generation(buf); | ||
1645 | orig_root = btrfs_header_owner(orig_buf); | ||
1646 | orig_generation = btrfs_header_generation(orig_buf); | ||
1647 | level = btrfs_header_level(buf); | ||
1648 | |||
1649 | if (!root->ref_cows) { | ||
1650 | if (level == 0 && | ||
1651 | root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | ||
1652 | return 0; | ||
1653 | if (level != 0 && | ||
1654 | root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) | ||
1655 | return 0; | ||
1656 | } | ||
1657 | |||
1658 | for (i = 0, slot = start_slot; i < nr; i++, slot++) { | ||
1659 | cond_resched(); | ||
1660 | if (level == 0) { | ||
1661 | btrfs_item_key_to_cpu(buf, &key, slot); | ||
1662 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
1663 | continue; | ||
1664 | fi = btrfs_item_ptr(buf, slot, | ||
1665 | struct btrfs_file_extent_item); | ||
1666 | if (btrfs_file_extent_type(buf, fi) == | ||
1667 | BTRFS_FILE_EXTENT_INLINE) | ||
1668 | continue; | ||
1669 | bytenr = btrfs_file_extent_disk_bytenr(buf, fi); | ||
1670 | if (bytenr == 0) | ||
1671 | continue; | ||
1672 | ret = __btrfs_update_extent_ref(trans, root, bytenr, | ||
1673 | orig_buf->start, buf->start, | ||
1674 | orig_root, ref_root, | ||
1675 | orig_generation, ref_generation, | ||
1676 | key.objectid); | ||
1677 | if (ret) | ||
1678 | goto fail; | ||
1679 | } else { | ||
1680 | bytenr = btrfs_node_blockptr(buf, slot); | ||
1681 | ret = __btrfs_update_extent_ref(trans, root, bytenr, | ||
1682 | orig_buf->start, buf->start, | ||
1683 | orig_root, ref_root, | ||
1684 | orig_generation, ref_generation, | ||
1685 | level - 1); | ||
1686 | if (ret) | ||
1687 | goto fail; | ||
1688 | } | ||
1689 | } | ||
1690 | return 0; | ||
1691 | fail: | ||
1692 | WARN_ON(1); | ||
1693 | return ret; | ||
1694 | } | ||
1695 | |||
1696 | static int write_one_cache_group(struct btrfs_trans_handle *trans, | ||
1697 | struct btrfs_root *root, | ||
1698 | struct btrfs_path *path, | ||
1699 | struct btrfs_block_group_cache *cache) | ||
1700 | { | ||
1701 | int ret; | ||
1702 | int pending_ret; | ||
1703 | struct btrfs_root *extent_root = root->fs_info->extent_root; | ||
1704 | unsigned long bi; | ||
1705 | struct extent_buffer *leaf; | ||
1706 | |||
1707 | ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); | ||
1708 | if (ret < 0) | ||
1709 | goto fail; | ||
1710 | BUG_ON(ret); | ||
1711 | |||
1712 | leaf = path->nodes[0]; | ||
1713 | bi = btrfs_item_ptr_offset(leaf, path->slots[0]); | ||
1714 | write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); | ||
1715 | btrfs_mark_buffer_dirty(leaf); | ||
1716 | btrfs_release_path(extent_root, path); | ||
1717 | fail: | ||
1718 | finish_current_insert(trans, extent_root, 0); | ||
1719 | pending_ret = del_pending_extents(trans, extent_root, 0); | ||
1720 | if (ret) | ||
1721 | return ret; | ||
1722 | if (pending_ret) | ||
1723 | return pending_ret; | ||
1724 | return 0; | ||
1725 | |||
1726 | } | ||
1727 | |||
1728 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | ||
1729 | struct btrfs_root *root) | ||
1730 | { | ||
1731 | struct btrfs_block_group_cache *cache, *entry; | ||
1732 | struct rb_node *n; | ||
1733 | int err = 0; | ||
1734 | int werr = 0; | ||
1735 | struct btrfs_path *path; | ||
1736 | u64 last = 0; | ||
1737 | |||
1738 | path = btrfs_alloc_path(); | ||
1739 | if (!path) | ||
1740 | return -ENOMEM; | ||
1741 | |||
1742 | while (1) { | ||
1743 | cache = NULL; | ||
1744 | spin_lock(&root->fs_info->block_group_cache_lock); | ||
1745 | for (n = rb_first(&root->fs_info->block_group_cache_tree); | ||
1746 | n; n = rb_next(n)) { | ||
1747 | entry = rb_entry(n, struct btrfs_block_group_cache, | ||
1748 | cache_node); | ||
1749 | if (entry->dirty) { | ||
1750 | cache = entry; | ||
1751 | break; | ||
1752 | } | ||
1753 | } | ||
1754 | spin_unlock(&root->fs_info->block_group_cache_lock); | ||
1755 | |||
1756 | if (!cache) | ||
1757 | break; | ||
1758 | |||
1759 | cache->dirty = 0; | ||
1760 | last += cache->key.offset; | ||
1761 | |||
1762 | err = write_one_cache_group(trans, root, | ||
1763 | path, cache); | ||
1764 | /* | ||
1765 | * if we fail to write the cache group, we want | ||
1766 | * to keep it marked dirty in hopes that a later | ||
1767 | * write will work | ||
1768 | */ | ||
1769 | if (err) { | ||
1770 | cache->dirty = 1; werr = err; | ||
1771 | continue; | ||
1772 | } | ||
1773 | } | ||
1774 | btrfs_free_path(path); | ||
1775 | return werr; | ||
1776 | } | ||
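
The loop above repeatedly takes the first dirty group under the lock, clears its dirty bit, and writes it out; per its comment, a failed write is supposed to leave the group dirty so a later pass retries it (hence the re-mark in the error branch). The same write-back pattern in a standalone sketch, with an invented write_group() that fails once:

#include <stdio.h>

struct group { int dirty; int fail_once; };

/* hypothetical writeback; fails the first attempt when asked to */
static int write_group(struct group *g)
{
    if (g->fail_once) {
        g->fail_once = 0;
        return -5; /* pretend -EIO */
    }
    return 0;
}

int main(void)
{
    struct group groups[] = { { 1, 0 }, { 1, 1 }, { 1, 0 } };
    int werr = 0;
    int progress = 1;

    while (progress) {
        progress = 0;
        for (int i = 0; i < 3; i++) {
            int err;

            if (!groups[i].dirty)
                continue;
            groups[i].dirty = 0;
            err = write_group(&groups[i]);
            if (err) {
                groups[i].dirty = 1; /* retry on a later sweep */
                werr = err;
                continue;
            }
            progress = 1;
        }
    }
    printf("last error seen: %d\n", werr); /* -5, but all groups clean */
    return 0;
}
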
1777 | |||
1778 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) | ||
1779 | { | ||
1780 | struct btrfs_block_group_cache *block_group; | ||
1781 | int readonly = 0; | ||
1782 | |||
1783 | block_group = btrfs_lookup_block_group(root->fs_info, bytenr); | ||
1784 | if (!block_group || block_group->ro) | ||
1785 | readonly = 1; | ||
1786 | if (block_group) | ||
1787 | put_block_group(block_group); | ||
1788 | return readonly; | ||
1789 | } | ||
1790 | |||
1791 | static int update_space_info(struct btrfs_fs_info *info, u64 flags, | ||
1792 | u64 total_bytes, u64 bytes_used, | ||
1793 | struct btrfs_space_info **space_info) | ||
1794 | { | ||
1795 | struct btrfs_space_info *found; | ||
1796 | |||
1797 | found = __find_space_info(info, flags); | ||
1798 | if (found) { | ||
1799 | spin_lock(&found->lock); | ||
1800 | found->total_bytes += total_bytes; | ||
1801 | found->bytes_used += bytes_used; | ||
1802 | found->full = 0; | ||
1803 | spin_unlock(&found->lock); | ||
1804 | *space_info = found; | ||
1805 | return 0; | ||
1806 | } | ||
1807 | found = kzalloc(sizeof(*found), GFP_NOFS); | ||
1808 | if (!found) | ||
1809 | return -ENOMEM; | ||
1810 | |||
1811 | list_add(&found->list, &info->space_info); | ||
1812 | INIT_LIST_HEAD(&found->block_groups); | ||
1813 | init_rwsem(&found->groups_sem); | ||
1814 | spin_lock_init(&found->lock); | ||
1815 | found->flags = flags; | ||
1816 | found->total_bytes = total_bytes; | ||
1817 | found->bytes_used = bytes_used; | ||
1818 | found->bytes_pinned = 0; | ||
1819 | found->bytes_reserved = 0; | ||
1820 | found->bytes_readonly = 0; | ||
1821 | found->full = 0; | ||
1822 | found->force_alloc = 0; | ||
1823 | *space_info = found; | ||
1824 | return 0; | ||
1825 | } | ||
1826 | |||
1827 | static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | ||
1828 | { | ||
1829 | u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | | ||
1830 | BTRFS_BLOCK_GROUP_RAID1 | | ||
1831 | BTRFS_BLOCK_GROUP_RAID10 | | ||
1832 | BTRFS_BLOCK_GROUP_DUP); | ||
1833 | if (extra_flags) { | ||
1834 | if (flags & BTRFS_BLOCK_GROUP_DATA) | ||
1835 | fs_info->avail_data_alloc_bits |= extra_flags; | ||
1836 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | ||
1837 | fs_info->avail_metadata_alloc_bits |= extra_flags; | ||
1838 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | ||
1839 | fs_info->avail_system_alloc_bits |= extra_flags; | ||
1840 | } | ||
1841 | } | ||
1842 | |||
1843 | static void set_block_group_readonly(struct btrfs_block_group_cache *cache) | ||
1844 | { | ||
1845 | spin_lock(&cache->space_info->lock); | ||
1846 | spin_lock(&cache->lock); | ||
1847 | if (!cache->ro) { | ||
1848 | cache->space_info->bytes_readonly += cache->key.offset - | ||
1849 | btrfs_block_group_used(&cache->item); | ||
1850 | cache->ro = 1; | ||
1851 | } | ||
1852 | spin_unlock(&cache->lock); | ||
1853 | spin_unlock(&cache->space_info->lock); | ||
1854 | } | ||
1855 | |||
1856 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | ||
1857 | { | ||
1858 | u64 num_devices = root->fs_info->fs_devices->rw_devices; | ||
1859 | |||
1860 | if (num_devices == 1) | ||
1861 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); | ||
1862 | if (num_devices < 4) | ||
1863 | flags &= ~BTRFS_BLOCK_GROUP_RAID10; | ||
1864 | |||
1865 | if ((flags & BTRFS_BLOCK_GROUP_DUP) && | ||
1866 | (flags & (BTRFS_BLOCK_GROUP_RAID1 | | ||
1867 | BTRFS_BLOCK_GROUP_RAID10))) { | ||
1868 | flags &= ~BTRFS_BLOCK_GROUP_DUP; | ||
1869 | } | ||
1870 | |||
1871 | if ((flags & BTRFS_BLOCK_GROUP_RAID1) && | ||
1872 | (flags & BTRFS_BLOCK_GROUP_RAID10)) { | ||
1873 | flags &= ~BTRFS_BLOCK_GROUP_RAID1; | ||
1874 | } | ||
1875 | |||
1876 | if ((flags & BTRFS_BLOCK_GROUP_RAID0) && | ||
1877 | ((flags & BTRFS_BLOCK_GROUP_RAID1) | | ||
1878 | (flags & BTRFS_BLOCK_GROUP_RAID10) | | ||
1879 | (flags & BTRFS_BLOCK_GROUP_DUP))) | ||
1880 | flags &= ~BTRFS_BLOCK_GROUP_RAID0; | ||
1881 | return flags; | ||
1882 | } | ||
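
btrfs_reduce_alloc_profile() collapses the requested RAID flags down to the single profile the device count can support, with a fixed precedence: RAID10 beats RAID1, mirrored profiles beat DUP, and anything redundant beats RAID0. The same masking can be exercised in isolation; the bit values below are arbitrary stand-ins, not the real BTRFS_BLOCK_GROUP_* constants:

#include <stdio.h>

#define BG_RAID0  (1ULL << 3)
#define BG_RAID1  (1ULL << 4)
#define BG_DUP    (1ULL << 5)
#define BG_RAID10 (1ULL << 6)

static unsigned long long reduce_profile(unsigned long long flags,
                                         unsigned long long num_devices)
{
    if (num_devices == 1)
        flags &= ~(BG_RAID1 | BG_RAID0);
    if (num_devices < 4)
        flags &= ~BG_RAID10;

    if ((flags & BG_DUP) && (flags & (BG_RAID1 | BG_RAID10)))
        flags &= ~BG_DUP;        /* mirrors beat DUP */
    if ((flags & BG_RAID1) && (flags & BG_RAID10))
        flags &= ~BG_RAID1;      /* RAID10 beats RAID1 */
    if ((flags & BG_RAID0) &&
        (flags & (BG_RAID1 | BG_RAID10 | BG_DUP)))
        flags &= ~BG_RAID0;      /* redundancy beats striping */
    return flags;
}

int main(void)
{
    /* RAID10|RAID1|DUP on a 2-device fs: only RAID1 survives */
    unsigned long long f = reduce_profile(BG_RAID10 | BG_RAID1 | BG_DUP, 2);

    printf("reduced flags: %#llx (RAID1 is %#llx)\n", f,
           (unsigned long long)BG_RAID1);
    return 0;
}
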
1883 | |||
1884 | static int do_chunk_alloc(struct btrfs_trans_handle *trans, | ||
1885 | struct btrfs_root *extent_root, u64 alloc_bytes, | ||
1886 | u64 flags, int force) | ||
1887 | { | ||
1888 | struct btrfs_space_info *space_info; | ||
1889 | u64 thresh; | ||
1890 | int ret = 0; | ||
1891 | |||
1892 | mutex_lock(&extent_root->fs_info->chunk_mutex); | ||
1893 | |||
1894 | flags = btrfs_reduce_alloc_profile(extent_root, flags); | ||
1895 | |||
1896 | space_info = __find_space_info(extent_root->fs_info, flags); | ||
1897 | if (!space_info) { | ||
1898 | ret = update_space_info(extent_root->fs_info, flags, | ||
1899 | 0, 0, &space_info); | ||
1900 | BUG_ON(ret); | ||
1901 | } | ||
1902 | BUG_ON(!space_info); | ||
1903 | |||
1904 | spin_lock(&space_info->lock); | ||
1905 | if (space_info->force_alloc) { | ||
1906 | force = 1; | ||
1907 | space_info->force_alloc = 0; | ||
1908 | } | ||
1909 | if (space_info->full) { | ||
1910 | spin_unlock(&space_info->lock); | ||
1911 | goto out; | ||
1912 | } | ||
1913 | |||
1914 | thresh = space_info->total_bytes - space_info->bytes_readonly; | ||
1915 | thresh = div_factor(thresh, 6); | ||
1916 | if (!force && | ||
1917 | (space_info->bytes_used + space_info->bytes_pinned + | ||
1918 | space_info->bytes_reserved + alloc_bytes) < thresh) { | ||
1919 | spin_unlock(&space_info->lock); | ||
1920 | goto out; | ||
1921 | } | ||
1922 | spin_unlock(&space_info->lock); | ||
1923 | |||
1924 | ret = btrfs_alloc_chunk(trans, extent_root, flags); | ||
1925 | if (ret) | ||
1926 | space_info->full = 1; | ||
1927 | out: | ||
1928 | mutex_unlock(&extent_root->fs_info->chunk_mutex); | ||
1929 | return ret; | ||
1930 | } | ||
1931 | |||
1932 | static int update_block_group(struct btrfs_trans_handle *trans, | ||
1933 | struct btrfs_root *root, | ||
1934 | u64 bytenr, u64 num_bytes, int alloc, | ||
1935 | int mark_free) | ||
1936 | { | ||
1937 | struct btrfs_block_group_cache *cache; | ||
1938 | struct btrfs_fs_info *info = root->fs_info; | ||
1939 | u64 total = num_bytes; | ||
1940 | u64 old_val; | ||
1941 | u64 byte_in_group; | ||
1942 | |||
1943 | while (total) { | ||
1944 | cache = btrfs_lookup_block_group(info, bytenr); | ||
1945 | if (!cache) | ||
1946 | return -1; | ||
1947 | byte_in_group = bytenr - cache->key.objectid; | ||
1948 | WARN_ON(byte_in_group > cache->key.offset); | ||
1949 | |||
1950 | spin_lock(&cache->space_info->lock); | ||
1951 | spin_lock(&cache->lock); | ||
1952 | cache->dirty = 1; | ||
1953 | old_val = btrfs_block_group_used(&cache->item); | ||
1954 | num_bytes = min(total, cache->key.offset - byte_in_group); | ||
1955 | if (alloc) { | ||
1956 | old_val += num_bytes; | ||
1957 | cache->space_info->bytes_used += num_bytes; | ||
1958 | if (cache->ro) | ||
1959 | cache->space_info->bytes_readonly -= num_bytes; | ||
1960 | btrfs_set_block_group_used(&cache->item, old_val); | ||
1961 | spin_unlock(&cache->lock); | ||
1962 | spin_unlock(&cache->space_info->lock); | ||
1963 | } else { | ||
1964 | old_val -= num_bytes; | ||
1965 | cache->space_info->bytes_used -= num_bytes; | ||
1966 | if (cache->ro) | ||
1967 | cache->space_info->bytes_readonly += num_bytes; | ||
1968 | btrfs_set_block_group_used(&cache->item, old_val); | ||
1969 | spin_unlock(&cache->lock); | ||
1970 | spin_unlock(&cache->space_info->lock); | ||
1971 | if (mark_free) { | ||
1972 | int ret; | ||
1973 | |||
1974 | ret = btrfs_discard_extent(root, bytenr, | ||
1975 | num_bytes); | ||
1976 | WARN_ON(ret); | ||
1977 | |||
1978 | ret = btrfs_add_free_space(cache, bytenr, | ||
1979 | num_bytes); | ||
1980 | WARN_ON(ret); | ||
1981 | } | ||
1982 | } | ||
1983 | put_block_group(cache); | ||
1984 | total -= num_bytes; | ||
1985 | bytenr += num_bytes; | ||
1986 | } | ||
1987 | return 0; | ||
1988 | } | ||
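
update_block_group() and btrfs_update_pinned_extents() share one shape: a byte range may straddle block groups, so each pass clamps the work to min(total, group end - offset), accounts it, and advances. A standalone sketch of that clamping walk, assuming (purely for the demo) power-of-two aligned 1 GiB groups so a mask can find the group start:

#include <stdio.h>

#define GROUP_SIZE (1ULL << 30)

static unsigned long long min_u64(unsigned long long a, unsigned long long b)
{
    return a < b ? a : b;
}

int main(void)
{
    /* a range that straddles a group boundary */
    unsigned long long bytenr = GROUP_SIZE - 4096;
    unsigned long long total = 12288;

    while (total) {
        unsigned long long group_start = bytenr & ~(GROUP_SIZE - 1);
        unsigned long long byte_in_group = bytenr - group_start;
        unsigned long long num = min_u64(total, GROUP_SIZE - byte_in_group);

        printf("group @%llu: account %llu bytes\n", group_start, num);
        total -= num;
        bytenr += num;
    }
    return 0;
}
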
1989 | |||
1990 | static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) | ||
1991 | { | ||
1992 | struct btrfs_block_group_cache *cache; | ||
1993 | u64 bytenr; | ||
1994 | |||
1995 | cache = btrfs_lookup_first_block_group(root->fs_info, search_start); | ||
1996 | if (!cache) | ||
1997 | return 0; | ||
1998 | |||
1999 | bytenr = cache->key.objectid; | ||
2000 | put_block_group(cache); | ||
2001 | |||
2002 | return bytenr; | ||
2003 | } | ||
2004 | |||
2005 | int btrfs_update_pinned_extents(struct btrfs_root *root, | ||
2006 | u64 bytenr, u64 num, int pin) | ||
2007 | { | ||
2008 | u64 len; | ||
2009 | struct btrfs_block_group_cache *cache; | ||
2010 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2011 | |||
2012 | WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); | ||
2013 | if (pin) { | ||
2014 | set_extent_dirty(&fs_info->pinned_extents, | ||
2015 | bytenr, bytenr + num - 1, GFP_NOFS); | ||
2016 | } else { | ||
2017 | clear_extent_dirty(&fs_info->pinned_extents, | ||
2018 | bytenr, bytenr + num - 1, GFP_NOFS); | ||
2019 | } | ||
2020 | while (num > 0) { | ||
2021 | cache = btrfs_lookup_block_group(fs_info, bytenr); | ||
2022 | BUG_ON(!cache); | ||
2023 | len = min(num, cache->key.offset - | ||
2024 | (bytenr - cache->key.objectid)); | ||
2025 | if (pin) { | ||
2026 | spin_lock(&cache->space_info->lock); | ||
2027 | spin_lock(&cache->lock); | ||
2028 | cache->pinned += len; | ||
2029 | cache->space_info->bytes_pinned += len; | ||
2030 | spin_unlock(&cache->lock); | ||
2031 | spin_unlock(&cache->space_info->lock); | ||
2032 | fs_info->total_pinned += len; | ||
2033 | } else { | ||
2034 | spin_lock(&cache->space_info->lock); | ||
2035 | spin_lock(&cache->lock); | ||
2036 | cache->pinned -= len; | ||
2037 | cache->space_info->bytes_pinned -= len; | ||
2038 | spin_unlock(&cache->lock); | ||
2039 | spin_unlock(&cache->space_info->lock); | ||
2040 | fs_info->total_pinned -= len; | ||
2041 | if (cache->cached) | ||
2042 | btrfs_add_free_space(cache, bytenr, len); | ||
2043 | } | ||
2044 | put_block_group(cache); | ||
2045 | bytenr += len; | ||
2046 | num -= len; | ||
2047 | } | ||
2048 | return 0; | ||
2049 | } | ||
2050 | |||
2051 | static int update_reserved_extents(struct btrfs_root *root, | ||
2052 | u64 bytenr, u64 num, int reserve) | ||
2053 | { | ||
2054 | u64 len; | ||
2055 | struct btrfs_block_group_cache *cache; | ||
2056 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2057 | |||
2058 | while (num > 0) { | ||
2059 | cache = btrfs_lookup_block_group(fs_info, bytenr); | ||
2060 | BUG_ON(!cache); | ||
2061 | len = min(num, cache->key.offset - | ||
2062 | (bytenr - cache->key.objectid)); | ||
2063 | |||
2064 | spin_lock(&cache->space_info->lock); | ||
2065 | spin_lock(&cache->lock); | ||
2066 | if (reserve) { | ||
2067 | cache->reserved += len; | ||
2068 | cache->space_info->bytes_reserved += len; | ||
2069 | } else { | ||
2070 | cache->reserved -= len; | ||
2071 | cache->space_info->bytes_reserved -= len; | ||
2072 | } | ||
2073 | spin_unlock(&cache->lock); | ||
2074 | spin_unlock(&cache->space_info->lock); | ||
2075 | put_block_group(cache); | ||
2076 | bytenr += len; | ||
2077 | num -= len; | ||
2078 | } | ||
2079 | return 0; | ||
2080 | } | ||
2081 | |||
2082 | int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) | ||
2083 | { | ||
2084 | u64 last = 0; | ||
2085 | u64 start; | ||
2086 | u64 end; | ||
2087 | struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; | ||
2088 | int ret; | ||
2089 | |||
2090 | mutex_lock(&root->fs_info->pinned_mutex); | ||
2091 | while (1) { | ||
2092 | ret = find_first_extent_bit(pinned_extents, last, | ||
2093 | &start, &end, EXTENT_DIRTY); | ||
2094 | if (ret) | ||
2095 | break; | ||
2096 | set_extent_dirty(copy, start, end, GFP_NOFS); | ||
2097 | last = end + 1; | ||
2098 | } | ||
2099 | mutex_unlock(&root->fs_info->pinned_mutex); | ||
2100 | return 0; | ||
2101 | } | ||
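
btrfs_copy_pinned() is an instance of the find_first_extent_bit() idiom used throughout this file: find the first matching range at or past a cursor, process it, and move the cursor to end + 1. Modeled on a sorted array of inclusive ranges (struct range and find_first_range() are illustrative stand-ins for the extent_io_tree API):

#include <stdio.h>

struct range { unsigned long long start, end; }; /* inclusive, sorted */

/* index of the first range ending at or past 'from'; -1 if none */
static int find_first_range(const struct range *r, int n,
                            unsigned long long from)
{
    int i;

    for (i = 0; i < n; i++)
        if (r[i].end >= from)
            return i;
    return -1;
}

int main(void)
{
    struct range pinned[] = { { 0, 4095 }, { 16384, 20479 } };
    unsigned long long last = 0;
    int i;

    while ((i = find_first_range(pinned, 2, last)) >= 0) {
        printf("copy [%llu, %llu]\n", pinned[i].start, pinned[i].end);
        last = pinned[i].end + 1; /* advance the cursor */
    }
    return 0;
}
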
2102 | |||
2103 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | ||
2104 | struct btrfs_root *root, | ||
2105 | struct extent_io_tree *unpin) | ||
2106 | { | ||
2107 | u64 start; | ||
2108 | u64 end; | ||
2109 | int ret; | ||
2110 | |||
2111 | mutex_lock(&root->fs_info->pinned_mutex); | ||
2112 | while (1) { | ||
2113 | ret = find_first_extent_bit(unpin, 0, &start, &end, | ||
2114 | EXTENT_DIRTY); | ||
2115 | if (ret) | ||
2116 | break; | ||
2117 | |||
2118 | ret = btrfs_discard_extent(root, start, end + 1 - start); | ||
2119 | |||
2120 | btrfs_update_pinned_extents(root, start, end + 1 - start, 0); | ||
2121 | clear_extent_dirty(unpin, start, end, GFP_NOFS); | ||
2122 | |||
2123 | if (need_resched()) { | ||
2124 | mutex_unlock(&root->fs_info->pinned_mutex); | ||
2125 | cond_resched(); | ||
2126 | mutex_lock(&root->fs_info->pinned_mutex); | ||
2127 | } | ||
2128 | } | ||
2129 | mutex_unlock(&root->fs_info->pinned_mutex); | ||
2130 | return ret; | ||
2131 | } | ||
2132 | |||
2133 | static int finish_current_insert(struct btrfs_trans_handle *trans, | ||
2134 | struct btrfs_root *extent_root, int all) | ||
2135 | { | ||
2136 | u64 start; | ||
2137 | u64 end; | ||
2138 | u64 priv; | ||
2139 | u64 search = 0; | ||
2140 | u64 skipped = 0; | ||
2141 | struct btrfs_fs_info *info = extent_root->fs_info; | ||
2142 | struct btrfs_path *path; | ||
2143 | struct pending_extent_op *extent_op, *tmp; | ||
2144 | struct list_head insert_list, update_list; | ||
2145 | int ret; | ||
2146 | int num_inserts = 0, max_inserts; | ||
2147 | |||
2148 | path = btrfs_alloc_path(); | ||
2149 | INIT_LIST_HEAD(&insert_list); | ||
2150 | INIT_LIST_HEAD(&update_list); | ||
2151 | |||
2152 | max_inserts = extent_root->leafsize / | ||
2153 | (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + | ||
2154 | sizeof(struct btrfs_extent_ref) + | ||
2155 | sizeof(struct btrfs_extent_item)); | ||
2156 | again: | ||
2157 | mutex_lock(&info->extent_ins_mutex); | ||
2158 | while (1) { | ||
2159 | ret = find_first_extent_bit(&info->extent_ins, search, &start, | ||
2160 | &end, EXTENT_WRITEBACK); | ||
2161 | if (ret) { | ||
2162 | if (skipped && all && !num_inserts) { | ||
2163 | skipped = 0; | ||
2164 | search = 0; | ||
2165 | continue; | ||
2166 | } | ||
2167 | mutex_unlock(&info->extent_ins_mutex); | ||
2168 | break; | ||
2169 | } | ||
2170 | |||
2171 | ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); | ||
2172 | if (!ret) { | ||
2173 | skipped = 1; | ||
2174 | search = end + 1; | ||
2175 | if (need_resched()) { | ||
2176 | mutex_unlock(&info->extent_ins_mutex); | ||
2177 | cond_resched(); | ||
2178 | mutex_lock(&info->extent_ins_mutex); | ||
2179 | } | ||
2180 | continue; | ||
2181 | } | ||
2182 | |||
2183 | ret = get_state_private(&info->extent_ins, start, &priv); | ||
2184 | BUG_ON(ret); | ||
2185 | extent_op = (struct pending_extent_op *)(unsigned long) priv; | ||
2186 | |||
2187 | if (extent_op->type == PENDING_EXTENT_INSERT) { | ||
2188 | num_inserts++; | ||
2189 | list_add_tail(&extent_op->list, &insert_list); | ||
2190 | search = end + 1; | ||
2191 | if (num_inserts == max_inserts) { | ||
2192 | mutex_unlock(&info->extent_ins_mutex); | ||
2193 | break; | ||
2194 | } | ||
2195 | } else if (extent_op->type == PENDING_BACKREF_UPDATE) { | ||
2196 | list_add_tail(&extent_op->list, &update_list); | ||
2197 | search = end + 1; | ||
2198 | } else { | ||
2199 | BUG(); | ||
2200 | } | ||
2201 | } | ||
2202 | |||
2203 | /* | ||
2204 | * process the update list, clear the writeback bit for it, and if | ||
2205 | * somebody marked this thing for deletion then just unlock it and be | ||
2206 | * done; free_extents will handle it | ||
2207 | */ | ||
2208 | mutex_lock(&info->extent_ins_mutex); | ||
2209 | list_for_each_entry_safe(extent_op, tmp, &update_list, list) { | ||
2210 | clear_extent_bits(&info->extent_ins, extent_op->bytenr, | ||
2211 | extent_op->bytenr + extent_op->num_bytes - 1, | ||
2212 | EXTENT_WRITEBACK, GFP_NOFS); | ||
2213 | if (extent_op->del) { | ||
2214 | list_del_init(&extent_op->list); | ||
2215 | unlock_extent(&info->extent_ins, extent_op->bytenr, | ||
2216 | extent_op->bytenr + extent_op->num_bytes | ||
2217 | - 1, GFP_NOFS); | ||
2218 | kfree(extent_op); | ||
2219 | } | ||
2220 | } | ||
2221 | mutex_unlock(&info->extent_ins_mutex); | ||
2222 | |||
2223 | /* | ||
2224 | * still have things left on the update list, go ahead and update | ||
2225 | * everything | ||
2226 | */ | ||
2227 | if (!list_empty(&update_list)) { | ||
2228 | ret = update_backrefs(trans, extent_root, path, &update_list); | ||
2229 | BUG_ON(ret); | ||
2230 | } | ||
2231 | |||
2232 | /* | ||
2233 | * if no inserts need to be done, but we skipped some extents and we | ||
2234 | * need to make sure everything is cleaned up, then reset everything and | ||
2235 | * go back to the beginning | ||
2236 | */ | ||
2237 | if (!num_inserts && all && skipped) { | ||
2238 | search = 0; | ||
2239 | skipped = 0; | ||
2240 | INIT_LIST_HEAD(&update_list); | ||
2241 | INIT_LIST_HEAD(&insert_list); | ||
2242 | goto again; | ||
2243 | } else if (!num_inserts) { | ||
2244 | goto out; | ||
2245 | } | ||
2246 | |||
2247 | /* | ||
2248 | * process the insert extents list. Again if we are deleting this | ||
2249 | * extent, then just unlock it, pin down the bytes if need be, and be | ||
2250 | * done with it. Saves us from having to actually insert the extent | ||
2251 | * into the tree and then subsequently come along and delete it | ||
2252 | */ | ||
2253 | mutex_lock(&info->extent_ins_mutex); | ||
2254 | list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { | ||
2255 | clear_extent_bits(&info->extent_ins, extent_op->bytenr, | ||
2256 | extent_op->bytenr + extent_op->num_bytes - 1, | ||
2257 | EXTENT_WRITEBACK, GFP_NOFS); | ||
2258 | if (extent_op->del) { | ||
2259 | u64 used; | ||
2260 | list_del_init(&extent_op->list); | ||
2261 | unlock_extent(&info->extent_ins, extent_op->bytenr, | ||
2262 | extent_op->bytenr + extent_op->num_bytes | ||
2263 | - 1, GFP_NOFS); | ||
2264 | |||
2265 | mutex_lock(&extent_root->fs_info->pinned_mutex); | ||
2266 | ret = pin_down_bytes(trans, extent_root, | ||
2267 | extent_op->bytenr, | ||
2268 | extent_op->num_bytes, 0); | ||
2269 | mutex_unlock(&extent_root->fs_info->pinned_mutex); | ||
2270 | |||
2271 | spin_lock(&info->delalloc_lock); | ||
2272 | used = btrfs_super_bytes_used(&info->super_copy); | ||
2273 | btrfs_set_super_bytes_used(&info->super_copy, | ||
2274 | used - extent_op->num_bytes); | ||
2275 | used = btrfs_root_used(&extent_root->root_item); | ||
2276 | btrfs_set_root_used(&extent_root->root_item, | ||
2277 | used - extent_op->num_bytes); | ||
2278 | spin_unlock(&info->delalloc_lock); | ||
2279 | |||
2280 | ret = update_block_group(trans, extent_root, | ||
2281 | extent_op->bytenr, | ||
2282 | extent_op->num_bytes, | ||
2283 | 0, ret > 0); | ||
2284 | BUG_ON(ret); | ||
2285 | kfree(extent_op); | ||
2286 | num_inserts--; | ||
2287 | } | ||
2288 | } | ||
2289 | mutex_unlock(&info->extent_ins_mutex); | ||
2290 | |||
2291 | ret = insert_extents(trans, extent_root, path, &insert_list, | ||
2292 | num_inserts); | ||
2293 | BUG_ON(ret); | ||
2294 | |||
2295 | /* | ||
2296 | * if we broke out of the loop in order to insert stuff because we hit | ||
2297 | * the maximum number of inserts at a time we can handle, then loop | ||
2298 | * back and pick up where we left off | ||
2299 | */ | ||
2300 | if (num_inserts == max_inserts) { | ||
2301 | INIT_LIST_HEAD(&insert_list); | ||
2302 | INIT_LIST_HEAD(&update_list); | ||
2303 | num_inserts = 0; | ||
2304 | goto again; | ||
2305 | } | ||
2306 | |||
2307 | /* | ||
2308 | * again, if we need to make absolutely sure there are no more pending | ||
2309 | * extent operations left and we know that we skipped some, go back to | ||
2310 | * the beginning and do it all again | ||
2311 | */ | ||
2312 | if (all && skipped) { | ||
2313 | INIT_LIST_HEAD(&insert_list); | ||
2314 | INIT_LIST_HEAD(&update_list); | ||
2315 | search = 0; | ||
2316 | skipped = 0; | ||
2317 | num_inserts = 0; | ||
2318 | goto again; | ||
2319 | } | ||
2320 | out: | ||
2321 | btrfs_free_path(path); | ||
2322 | return 0; | ||
2323 | } | ||
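
Control flow aside, finish_current_insert() is a bounded-batch drain: gather at most max_inserts pending operations under the mutex, drop the mutex for the heavy tree inserts, and goto again while a full batch suggests more work remains. Reduced to its skeleton (the counts are made up):

#include <stdio.h>

#define MAX_BATCH 3

int main(void)
{
    int pending = 8; /* pretend 8 queued extent inserts */
    int round = 0;
    int batch;

again:
    batch = 0;
    while (pending && batch < MAX_BATCH) {
        pending--; /* "insert" one extent */
        batch++;
    }
    printf("round %d: flushed %d, %d left\n", ++round, batch, pending);
    /*
     * like the num_inserts == max_inserts check above: a full batch
     * means there may be more queued, so rescan from the top
     */
    if (batch == MAX_BATCH)
        goto again;
    return 0;
}
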
2324 | |||
2325 | static int pin_down_bytes(struct btrfs_trans_handle *trans, | ||
2326 | struct btrfs_root *root, | ||
2327 | u64 bytenr, u64 num_bytes, int is_data) | ||
2328 | { | ||
2329 | int err = 0; | ||
2330 | struct extent_buffer *buf; | ||
2331 | |||
2332 | if (is_data) | ||
2333 | goto pinit; | ||
2334 | |||
2335 | buf = btrfs_find_tree_block(root, bytenr, num_bytes); | ||
2336 | if (!buf) | ||
2337 | goto pinit; | ||
2338 | |||
2339 | /* we can reuse a block if it hasn't been written | ||
2340 | * and it is from this transaction. We can't | ||
2341 | * reuse anything from the tree log root because | ||
2342 | * it has tiny sub-transactions. | ||
2343 | */ | ||
2344 | if (btrfs_buffer_uptodate(buf, 0) && | ||
2345 | btrfs_try_tree_lock(buf)) { | ||
2346 | u64 header_owner = btrfs_header_owner(buf); | ||
2347 | u64 header_transid = btrfs_header_generation(buf); | ||
2348 | if (header_owner != BTRFS_TREE_LOG_OBJECTID && | ||
2349 | header_owner != BTRFS_TREE_RELOC_OBJECTID && | ||
2350 | header_transid == trans->transid && | ||
2351 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { | ||
2352 | clean_tree_block(NULL, root, buf); | ||
2353 | btrfs_tree_unlock(buf); | ||
2354 | free_extent_buffer(buf); | ||
2355 | return 1; | ||
2356 | } | ||
2357 | btrfs_tree_unlock(buf); | ||
2358 | } | ||
2359 | free_extent_buffer(buf); | ||
2360 | pinit: | ||
2361 | btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); | ||
2362 | |||
2363 | BUG_ON(err < 0); | ||
2364 | return 0; | ||
2365 | } | ||
2366 | |||
2367 | /* | ||
2368 | * remove an extent from the root, returns 0 on success | ||
2369 | */ | ||
2370 | static int __free_extent(struct btrfs_trans_handle *trans, | ||
2371 | struct btrfs_root *root, | ||
2372 | u64 bytenr, u64 num_bytes, u64 parent, | ||
2373 | u64 root_objectid, u64 ref_generation, | ||
2374 | u64 owner_objectid, int pin, int mark_free) | ||
2375 | { | ||
2376 | struct btrfs_path *path; | ||
2377 | struct btrfs_key key; | ||
2378 | struct btrfs_fs_info *info = root->fs_info; | ||
2379 | struct btrfs_root *extent_root = info->extent_root; | ||
2380 | struct extent_buffer *leaf; | ||
2381 | int ret; | ||
2382 | int extent_slot = 0; | ||
2383 | int found_extent = 0; | ||
2384 | int num_to_del = 1; | ||
2385 | struct btrfs_extent_item *ei; | ||
2386 | u32 refs; | ||
2387 | |||
2388 | key.objectid = bytenr; | ||
2389 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); | ||
2390 | key.offset = num_bytes; | ||
2391 | path = btrfs_alloc_path(); | ||
2392 | if (!path) | ||
2393 | return -ENOMEM; | ||
2394 | |||
2395 | path->reada = 1; | ||
2396 | ret = lookup_extent_backref(trans, extent_root, path, | ||
2397 | bytenr, parent, root_objectid, | ||
2398 | ref_generation, owner_objectid, 1); | ||
2399 | if (ret == 0) { | ||
2400 | struct btrfs_key found_key; | ||
2401 | extent_slot = path->slots[0]; | ||
2402 | while (extent_slot > 0) { | ||
2403 | extent_slot--; | ||
2404 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
2405 | extent_slot); | ||
2406 | if (found_key.objectid != bytenr) | ||
2407 | break; | ||
2408 | if (found_key.type == BTRFS_EXTENT_ITEM_KEY && | ||
2409 | found_key.offset == num_bytes) { | ||
2410 | found_extent = 1; | ||
2411 | break; | ||
2412 | } | ||
2413 | if (path->slots[0] - extent_slot > 5) | ||
2414 | break; | ||
2415 | } | ||
2416 | if (!found_extent) { | ||
2417 | ret = remove_extent_backref(trans, extent_root, path); | ||
2418 | BUG_ON(ret); | ||
2419 | btrfs_release_path(extent_root, path); | ||
2420 | ret = btrfs_search_slot(trans, extent_root, | ||
2421 | &key, path, -1, 1); | ||
2422 | if (ret) { | ||
2423 | printk(KERN_ERR "umm, got %d back from search" | ||
2424 | ", was looking for %llu\n", ret, | ||
2425 | (unsigned long long)bytenr); | ||
2426 | btrfs_print_leaf(extent_root, path->nodes[0]); | ||
2427 | } | ||
2428 | BUG_ON(ret); | ||
2429 | extent_slot = path->slots[0]; | ||
2430 | } | ||
2431 | } else { | ||
2432 | btrfs_print_leaf(extent_root, path->nodes[0]); | ||
2433 | WARN_ON(1); | ||
2434 | printk(KERN_ERR "btrfs unable to find ref byte nr %llu " | ||
2435 | "root %llu gen %llu owner %llu\n", | ||
2436 | (unsigned long long)bytenr, | ||
2437 | (unsigned long long)root_objectid, | ||
2438 | (unsigned long long)ref_generation, | ||
2439 | (unsigned long long)owner_objectid); | ||
2440 | } | ||
2441 | |||
2442 | leaf = path->nodes[0]; | ||
2443 | ei = btrfs_item_ptr(leaf, extent_slot, | ||
2444 | struct btrfs_extent_item); | ||
2445 | refs = btrfs_extent_refs(leaf, ei); | ||
2446 | BUG_ON(refs == 0); | ||
2447 | refs -= 1; | ||
2448 | btrfs_set_extent_refs(leaf, ei, refs); | ||
2449 | |||
2450 | btrfs_mark_buffer_dirty(leaf); | ||
2451 | |||
2452 | if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { | ||
2453 | struct btrfs_extent_ref *ref; | ||
2454 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
2455 | struct btrfs_extent_ref); | ||
2456 | BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); | ||
2457 | /* if the back ref and the extent are next to each other | ||
2458 | * they get deleted below in one shot | ||
2459 | */ | ||
2460 | path->slots[0] = extent_slot; | ||
2461 | num_to_del = 2; | ||
2462 | } else if (found_extent) { | ||
2463 | /* otherwise delete the extent back ref */ | ||
2464 | ret = remove_extent_backref(trans, extent_root, path); | ||
2465 | BUG_ON(ret); | ||
2466 | /* if refs are 0, we need to setup the path for deletion */ | ||
2467 | if (refs == 0) { | ||
2468 | btrfs_release_path(extent_root, path); | ||
2469 | ret = btrfs_search_slot(trans, extent_root, &key, path, | ||
2470 | -1, 1); | ||
2471 | BUG_ON(ret); | ||
2472 | } | ||
2473 | } | ||
2474 | |||
2475 | if (refs == 0) { | ||
2476 | u64 super_used; | ||
2477 | u64 root_used; | ||
2478 | |||
2479 | if (pin) { | ||
2480 | mutex_lock(&root->fs_info->pinned_mutex); | ||
2481 | ret = pin_down_bytes(trans, root, bytenr, num_bytes, | ||
2482 | owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); | ||
2483 | mutex_unlock(&root->fs_info->pinned_mutex); | ||
2484 | if (ret > 0) | ||
2485 | mark_free = 1; | ||
2486 | BUG_ON(ret < 0); | ||
2487 | } | ||
2488 | /* block accounting for super block */ | ||
2489 | spin_lock(&info->delalloc_lock); | ||
2490 | super_used = btrfs_super_bytes_used(&info->super_copy); | ||
2491 | btrfs_set_super_bytes_used(&info->super_copy, | ||
2492 | super_used - num_bytes); | ||
2493 | |||
2494 | /* block accounting for root item */ | ||
2495 | root_used = btrfs_root_used(&root->root_item); | ||
2496 | btrfs_set_root_used(&root->root_item, | ||
2497 | root_used - num_bytes); | ||
2498 | spin_unlock(&info->delalloc_lock); | ||
2499 | ret = btrfs_del_items(trans, extent_root, path, path->slots[0], | ||
2500 | num_to_del); | ||
2501 | BUG_ON(ret); | ||
2502 | btrfs_release_path(extent_root, path); | ||
2503 | |||
2504 | if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { | ||
2505 | ret = btrfs_del_csums(trans, root, bytenr, num_bytes); | ||
2506 | BUG_ON(ret); | ||
2507 | } | ||
2508 | |||
2509 | ret = update_block_group(trans, root, bytenr, num_bytes, 0, | ||
2510 | mark_free); | ||
2511 | BUG_ON(ret); | ||
2512 | } | ||
2513 | btrfs_free_path(path); | ||
2514 | finish_current_insert(trans, extent_root, 0); | ||
2515 | return ret; | ||
2516 | } | ||
2517 | |||
2518 | /* | ||
2519 | * find all the blocks marked as pending in the radix tree and remove | ||
2520 | * them from the extent map | ||
2521 | */ | ||
2522 | static int del_pending_extents(struct btrfs_trans_handle *trans, | ||
2523 | struct btrfs_root *extent_root, int all) | ||
2524 | { | ||
2525 | int ret; | ||
2526 | int err = 0; | ||
2527 | u64 start; | ||
2528 | u64 end; | ||
2529 | u64 priv; | ||
2530 | u64 search = 0; | ||
2531 | int nr = 0, skipped = 0; | ||
2532 | struct extent_io_tree *pending_del; | ||
2533 | struct extent_io_tree *extent_ins; | ||
2534 | struct pending_extent_op *extent_op; | ||
2535 | struct btrfs_fs_info *info = extent_root->fs_info; | ||
2536 | struct list_head delete_list; | ||
2537 | |||
2538 | INIT_LIST_HEAD(&delete_list); | ||
2539 | extent_ins = &extent_root->fs_info->extent_ins; | ||
2540 | pending_del = &extent_root->fs_info->pending_del; | ||
2541 | |||
2542 | again: | ||
2543 | mutex_lock(&info->extent_ins_mutex); | ||
2544 | while (1) { | ||
2545 | ret = find_first_extent_bit(pending_del, search, &start, &end, | ||
2546 | EXTENT_WRITEBACK); | ||
2547 | if (ret) { | ||
2548 | if (all && skipped && !nr) { | ||
2549 | skipped = 0; search = 0; | ||
2550 | continue; | ||
2551 | } | ||
2552 | mutex_unlock(&info->extent_ins_mutex); | ||
2553 | break; | ||
2554 | } | ||
2555 | |||
2556 | ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); | ||
2557 | if (!ret) { | ||
2558 | search = end+1; | ||
2559 | skipped = 1; | ||
2560 | |||
2561 | if (need_resched()) { | ||
2562 | mutex_unlock(&info->extent_ins_mutex); | ||
2563 | cond_resched(); | ||
2564 | mutex_lock(&info->extent_ins_mutex); | ||
2565 | } | ||
2566 | |||
2567 | continue; | ||
2568 | } | ||
2569 | BUG_ON(ret < 0); | ||
2570 | |||
2571 | ret = get_state_private(pending_del, start, &priv); | ||
2572 | BUG_ON(ret); | ||
2573 | extent_op = (struct pending_extent_op *)(unsigned long)priv; | ||
2574 | |||
2575 | clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, | ||
2576 | GFP_NOFS); | ||
2577 | if (!test_range_bit(extent_ins, start, end, | ||
2578 | EXTENT_WRITEBACK, 0)) { | ||
2579 | list_add_tail(&extent_op->list, &delete_list); | ||
2580 | nr++; | ||
2581 | } else { | ||
2582 | kfree(extent_op); | ||
2583 | |||
2584 | ret = get_state_private(&info->extent_ins, start, | ||
2585 | &priv); | ||
2586 | BUG_ON(ret); | ||
2587 | extent_op = (struct pending_extent_op *) | ||
2588 | (unsigned long)priv; | ||
2589 | |||
2590 | clear_extent_bits(&info->extent_ins, start, end, | ||
2591 | EXTENT_WRITEBACK, GFP_NOFS); | ||
2592 | |||
2593 | if (extent_op->type == PENDING_BACKREF_UPDATE) { | ||
2594 | list_add_tail(&extent_op->list, &delete_list); | ||
2595 | search = end + 1; | ||
2596 | nr++; | ||
2597 | continue; | ||
2598 | } | ||
2599 | |||
2600 | mutex_lock(&extent_root->fs_info->pinned_mutex); | ||
2601 | ret = pin_down_bytes(trans, extent_root, start, | ||
2602 | end + 1 - start, 0); | ||
2603 | mutex_unlock(&extent_root->fs_info->pinned_mutex); | ||
2604 | |||
2605 | ret = update_block_group(trans, extent_root, start, | ||
2606 | end + 1 - start, 0, ret > 0); | ||
2607 | |||
2608 | unlock_extent(extent_ins, start, end, GFP_NOFS); | ||
2609 | BUG_ON(ret); | ||
2610 | kfree(extent_op); | ||
2611 | } | ||
2612 | if (ret) | ||
2613 | err = ret; | ||
2614 | |||
2615 | search = end + 1; | ||
2616 | |||
2617 | if (need_resched()) { | ||
2618 | mutex_unlock(&info->extent_ins_mutex); | ||
2619 | cond_resched(); | ||
2620 | mutex_lock(&info->extent_ins_mutex); | ||
2621 | } | ||
2622 | } | ||
2623 | |||
2624 | if (nr) { | ||
2625 | ret = free_extents(trans, extent_root, &delete_list); | ||
2626 | BUG_ON(ret); | ||
2627 | } | ||
2628 | |||
2629 | if (all && skipped) { | ||
2630 | INIT_LIST_HEAD(&delete_list); | ||
2631 | search = 0; | ||
2632 | nr = 0; | ||
2633 | goto again; | ||
2634 | } | ||
2635 | |||
2636 | return err; | ||
2637 | } | ||
2638 | |||
2639 | /* | ||
2640 | * remove an extent from the root, returns 0 on success | ||
2641 | */ | ||
2642 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | ||
2643 | struct btrfs_root *root, | ||
2644 | u64 bytenr, u64 num_bytes, u64 parent, | ||
2645 | u64 root_objectid, u64 ref_generation, | ||
2646 | u64 owner_objectid, int pin) | ||
2647 | { | ||
2648 | struct btrfs_root *extent_root = root->fs_info->extent_root; | ||
2649 | int pending_ret; | ||
2650 | int ret; | ||
2651 | |||
2652 | WARN_ON(num_bytes < root->sectorsize); | ||
2653 | if (root == extent_root) { | ||
2654 | struct pending_extent_op *extent_op = NULL; | ||
2655 | |||
2656 | mutex_lock(&root->fs_info->extent_ins_mutex); | ||
2657 | if (test_range_bit(&root->fs_info->extent_ins, bytenr, | ||
2658 | bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { | ||
2659 | u64 priv; | ||
2660 | ret = get_state_private(&root->fs_info->extent_ins, | ||
2661 | bytenr, &priv); | ||
2662 | BUG_ON(ret); | ||
2663 | extent_op = (struct pending_extent_op *) | ||
2664 | (unsigned long)priv; | ||
2665 | |||
2666 | extent_op->del = 1; | ||
2667 | if (extent_op->type == PENDING_EXTENT_INSERT) { | ||
2668 | mutex_unlock(&root->fs_info->extent_ins_mutex); | ||
2669 | return 0; | ||
2670 | } | ||
2671 | } | ||
2672 | |||
2673 | if (extent_op) { | ||
2674 | ref_generation = extent_op->orig_generation; | ||
2675 | parent = extent_op->orig_parent; | ||
2676 | } | ||
2677 | |||
2678 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | ||
2679 | BUG_ON(!extent_op); | ||
2680 | |||
2681 | extent_op->type = PENDING_EXTENT_DELETE; | ||
2682 | extent_op->bytenr = bytenr; | ||
2683 | extent_op->num_bytes = num_bytes; | ||
2684 | extent_op->parent = parent; | ||
2685 | extent_op->orig_parent = parent; | ||
2686 | extent_op->generation = ref_generation; | ||
2687 | extent_op->orig_generation = ref_generation; | ||
2688 | extent_op->level = (int)owner_objectid; | ||
2689 | INIT_LIST_HEAD(&extent_op->list); | ||
2690 | extent_op->del = 0; | ||
2691 | |||
2692 | set_extent_bits(&root->fs_info->pending_del, | ||
2693 | bytenr, bytenr + num_bytes - 1, | ||
2694 | EXTENT_WRITEBACK, GFP_NOFS); | ||
2695 | set_state_private(&root->fs_info->pending_del, | ||
2696 | bytenr, (unsigned long)extent_op); | ||
2697 | mutex_unlock(&root->fs_info->extent_ins_mutex); | ||
2698 | return 0; | ||
2699 | } | ||
2700 | /* if metadata always pin */ | ||
2701 | if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { | ||
2702 | if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { | ||
2703 | struct btrfs_block_group_cache *cache; | ||
2704 | |||
2705 | /* btrfs_free_reserved_extent */ | ||
2706 | cache = btrfs_lookup_block_group(root->fs_info, bytenr); | ||
2707 | BUG_ON(!cache); | ||
2708 | btrfs_add_free_space(cache, bytenr, num_bytes); | ||
2709 | put_block_group(cache); | ||
2710 | update_reserved_extents(root, bytenr, num_bytes, 0); | ||
2711 | return 0; | ||
2712 | } | ||
2713 | pin = 1; | ||
2714 | } | ||
2715 | |||
2716 | /* if data pin when any transaction has committed this */ | ||
2717 | if (ref_generation != trans->transid) | ||
2718 | pin = 1; | ||
2719 | |||
2720 | ret = __free_extent(trans, root, bytenr, num_bytes, parent, | ||
2721 | root_objectid, ref_generation, | ||
2722 | owner_objectid, pin, pin == 0); | ||
2723 | |||
2724 | finish_current_insert(trans, root->fs_info->extent_root, 0); | ||
2725 | pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); | ||
2726 | return ret ? ret : pending_ret; | ||
2727 | } | ||
2728 | |||
2729 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | ||
2730 | struct btrfs_root *root, | ||
2731 | u64 bytenr, u64 num_bytes, u64 parent, | ||
2732 | u64 root_objectid, u64 ref_generation, | ||
2733 | u64 owner_objectid, int pin) | ||
2734 | { | ||
2735 | int ret; | ||
2736 | |||
2737 | ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, | ||
2738 | root_objectid, ref_generation, | ||
2739 | owner_objectid, pin); | ||
2740 | return ret; | ||
2741 | } | ||
2742 | |||
2743 | static u64 stripe_align(struct btrfs_root *root, u64 val) | ||
2744 | { | ||
2745 | u64 mask = ((u64)root->stripesize - 1); | ||
2746 | u64 ret = (val + mask) & ~mask; | ||
2747 | return ret; | ||
2748 | } | ||
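
stripe_align() is the usual round-up-to-a-power-of-two trick: add align - 1, then mask the low bits off. It quietly assumes stripesize is a power of two; this sketch makes that assumption explicit with an assert:

#include <assert.h>
#include <stdio.h>

/* round val up to the next multiple of align (a power of two) */
static unsigned long long align_up(unsigned long long val,
                                   unsigned long long align)
{
    unsigned long long mask = align - 1;

    assert(align && (align & mask) == 0); /* power-of-two only */
    return (val + mask) & ~mask;
}

int main(void)
{
    printf("%llu\n", align_up(5000, 4096)); /* 8192 */
    printf("%llu\n", align_up(8192, 4096)); /* already aligned: 8192 */
    return 0;
}
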
2749 | |||
2750 | /* | ||
2751 | * walks the btree of allocated extents and finds a hole of a given size. | ||
2752 | * The key ins is changed to record the hole: | ||
2753 | * ins->objectid == block start | ||
2754 | * ins->flags = BTRFS_EXTENT_ITEM_KEY | ||
2755 | * ins->offset == number of blocks | ||
2756 | * Any available blocks before search_start are skipped. | ||
2757 | */ | ||
2758 | static noinline int find_free_extent(struct btrfs_trans_handle *trans, | ||
2759 | struct btrfs_root *orig_root, | ||
2760 | u64 num_bytes, u64 empty_size, | ||
2761 | u64 search_start, u64 search_end, | ||
2762 | u64 hint_byte, struct btrfs_key *ins, | ||
2763 | u64 exclude_start, u64 exclude_nr, | ||
2764 | int data) | ||
2765 | { | ||
2766 | int ret = 0; | ||
2767 | struct btrfs_root *root = orig_root->fs_info->extent_root; | ||
2768 | u64 total_needed = num_bytes; | ||
2769 | u64 *last_ptr = NULL; | ||
2770 | u64 last_wanted = 0; | ||
2771 | struct btrfs_block_group_cache *block_group = NULL; | ||
2772 | int chunk_alloc_done = 0; | ||
2773 | int empty_cluster = 2 * 1024 * 1024; | ||
2774 | int allowed_chunk_alloc = 0; | ||
2775 | struct list_head *head = NULL, *cur = NULL; | ||
2776 | int loop = 0; | ||
2777 | int extra_loop = 0; | ||
2778 | struct btrfs_space_info *space_info; | ||
2779 | |||
2780 | WARN_ON(num_bytes < root->sectorsize); | ||
2781 | btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); | ||
2782 | ins->objectid = 0; | ||
2783 | ins->offset = 0; | ||
2784 | |||
2785 | if (orig_root->ref_cows || empty_size) | ||
2786 | allowed_chunk_alloc = 1; | ||
2787 | |||
2788 | if (data & BTRFS_BLOCK_GROUP_METADATA) { | ||
2789 | last_ptr = &root->fs_info->last_alloc; | ||
2790 | empty_cluster = 64 * 1024; | ||
2791 | } | ||
2792 | |||
2793 | if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) | ||
2794 | last_ptr = &root->fs_info->last_data_alloc; | ||
2795 | |||
2796 | if (last_ptr) { | ||
2797 | if (*last_ptr) { | ||
2798 | hint_byte = *last_ptr; | ||
2799 | last_wanted = *last_ptr; | ||
2800 | } else | ||
2801 | empty_size += empty_cluster; | ||
2802 | } else { | ||
2803 | empty_cluster = 0; | ||
2804 | } | ||
2805 | search_start = max(search_start, first_logical_byte(root, 0)); | ||
2806 | search_start = max(search_start, hint_byte); | ||
2807 | |||
2808 | if (last_wanted && search_start != last_wanted) { | ||
2809 | last_wanted = 0; | ||
2810 | empty_size += empty_cluster; | ||
2811 | } | ||
2812 | |||
2813 | total_needed += empty_size; | ||
2814 | block_group = btrfs_lookup_block_group(root->fs_info, search_start); | ||
2815 | if (!block_group) | ||
2816 | block_group = btrfs_lookup_first_block_group(root->fs_info, | ||
2817 | search_start); | ||
2818 | space_info = __find_space_info(root->fs_info, data); | ||
2819 | |||
2820 | down_read(&space_info->groups_sem); | ||
2821 | while (1) { | ||
2822 | struct btrfs_free_space *free_space; | ||
2823 | /* | ||
2824 | * the only way this happens is if our hint points to a block | ||
2825 | * group that's not of the proper type; while looping, this | ||
2826 | * should never happen | ||
2827 | */ | ||
2828 | if (empty_size) | ||
2829 | extra_loop = 1; | ||
2830 | |||
2831 | if (!block_group) | ||
2832 | goto new_group_no_lock; | ||
2833 | |||
2834 | if (unlikely(!block_group->cached)) { | ||
2835 | mutex_lock(&block_group->cache_mutex); | ||
2836 | ret = cache_block_group(root, block_group); | ||
2837 | mutex_unlock(&block_group->cache_mutex); | ||
2838 | if (ret) | ||
2839 | break; | ||
2840 | } | ||
2841 | |||
2842 | mutex_lock(&block_group->alloc_mutex); | ||
2843 | if (unlikely(!block_group_bits(block_group, data))) | ||
2844 | goto new_group; | ||
2845 | |||
2846 | if (unlikely(block_group->ro)) | ||
2847 | goto new_group; | ||
2848 | |||
2849 | free_space = btrfs_find_free_space(block_group, search_start, | ||
2850 | total_needed); | ||
2851 | if (free_space) { | ||
2852 | u64 start = block_group->key.objectid; | ||
2853 | u64 end = block_group->key.objectid + | ||
2854 | block_group->key.offset; | ||
2855 | |||
2856 | search_start = stripe_align(root, free_space->offset); | ||
2857 | |||
2858 | /* move on to the next group */ | ||
2859 | if (search_start + num_bytes >= search_end) | ||
2860 | goto new_group; | ||
2861 | |||
2862 | /* move on to the next group */ | ||
2863 | if (search_start + num_bytes > end) | ||
2864 | goto new_group; | ||
2865 | |||
2866 | if (last_wanted && search_start != last_wanted) { | ||
2867 | total_needed += empty_cluster; | ||
2868 | empty_size += empty_cluster; | ||
2869 | last_wanted = 0; | ||
2870 | /* | ||
2871 | * if search_start is still in this block group | ||
2872 | * then we just re-search this block group | ||
2873 | */ | ||
2874 | if (search_start >= start && | ||
2875 | search_start < end) { | ||
2876 | mutex_unlock(&block_group->alloc_mutex); | ||
2877 | continue; | ||
2878 | } | ||
2879 | |||
2880 | /* else we go to the next block group */ | ||
2881 | goto new_group; | ||
2882 | } | ||
2883 | |||
2884 | if (exclude_nr > 0 && | ||
2885 | (search_start + num_bytes > exclude_start && | ||
2886 | search_start < exclude_start + exclude_nr)) { | ||
2887 | search_start = exclude_start + exclude_nr; | ||
2888 | /* | ||
2889 | * if search_start is still in this block group | ||
2890 | * then we just re-search this block group | ||
2891 | */ | ||
2892 | if (search_start >= start && | ||
2893 | search_start < end) { | ||
2894 | mutex_unlock(&block_group->alloc_mutex); | ||
2895 | last_wanted = 0; | ||
2896 | continue; | ||
2897 | } | ||
2898 | |||
2899 | /* else we go to the next block group */ | ||
2900 | goto new_group; | ||
2901 | } | ||
2902 | |||
2903 | ins->objectid = search_start; | ||
2904 | ins->offset = num_bytes; | ||
2905 | |||
2906 | btrfs_remove_free_space_lock(block_group, search_start, | ||
2907 | num_bytes); | ||
2908 | /* we are all good, lets return */ | ||
2909 | mutex_unlock(&block_group->alloc_mutex); | ||
2910 | break; | ||
2911 | } | ||
2912 | new_group: | ||
2913 | mutex_unlock(&block_group->alloc_mutex); | ||
2914 | put_block_group(block_group); | ||
2915 | block_group = NULL; | ||
2916 | new_group_no_lock: | ||
2917 | /* don't try to compare new allocations against the | ||
2918 | * last allocation any more | ||
2919 | */ | ||
2920 | last_wanted = 0; | ||
2921 | |||
2922 | /* | ||
2923 | * Here's how this works. | ||
2924 | * loop == 0: we were searching a block group via a hint | ||
2925 | * and didn't find anything, so we start at | ||
2926 | * the head of the block groups and keep searching | ||
2927 | * loop == 1: we're searching through all of the block groups | ||
2928 | * if we hit the head again we have searched | ||
2929 | * all of the block groups for this space and we | ||
2930 | * need to try to allocate; if we can't, error out. | ||
2931 | * loop == 2: we allocated more space and are looping through | ||
2932 | * all of the block groups again. | ||
2933 | */ | ||
2934 | if (loop == 0) { | ||
2935 | head = &space_info->block_groups; | ||
2936 | cur = head->next; | ||
2937 | loop++; | ||
2938 | } else if (loop == 1 && cur == head) { | ||
2939 | int keep_going; | ||
2940 | |||
2941 | /* at this point we give up on the empty_size | ||
2942 | * allocations and just try to allocate the min | ||
2943 | * space. | ||
2944 | * | ||
2945 | * The extra_loop field was set if an empty_size | ||
2946 | * allocation was attempted above, and if this | ||
2947 | * is set we need to try the loop again without | ||
2948 | * the additional empty_size. | ||
2949 | */ | ||
2950 | total_needed -= empty_size; | ||
2951 | empty_size = 0; | ||
2952 | keep_going = extra_loop; | ||
2953 | loop++; | ||
2954 | |||
2955 | if (allowed_chunk_alloc && !chunk_alloc_done) { | ||
2956 | up_read(&space_info->groups_sem); | ||
2957 | ret = do_chunk_alloc(trans, root, num_bytes + | ||
2958 | 2 * 1024 * 1024, data, 1); | ||
2959 | down_read(&space_info->groups_sem); | ||
2960 | if (ret < 0) | ||
2961 | goto loop_check; | ||
2962 | head = &space_info->block_groups; | ||
2963 | /* | ||
2964 | * we've allocated a new chunk, keep | ||
2965 | * trying | ||
2966 | */ | ||
2967 | keep_going = 1; | ||
2968 | chunk_alloc_done = 1; | ||
2969 | } else if (!allowed_chunk_alloc) { | ||
2970 | space_info->force_alloc = 1; | ||
2971 | } | ||
2972 | loop_check: | ||
2973 | if (keep_going) { | ||
2974 | cur = head->next; | ||
2975 | extra_loop = 0; | ||
2976 | } else { | ||
2977 | break; | ||
2978 | } | ||
2979 | } else if (cur == head) { | ||
2980 | break; | ||
2981 | } | ||
2982 | |||
2983 | block_group = list_entry(cur, struct btrfs_block_group_cache, | ||
2984 | list); | ||
2985 | atomic_inc(&block_group->count); | ||
2986 | |||
2987 | search_start = block_group->key.objectid; | ||
2988 | cur = cur->next; | ||
2989 | } | ||
2990 | |||
2991 | /* we found what we needed */ | ||
2992 | if (ins->objectid) { | ||
2993 | if (!(data & BTRFS_BLOCK_GROUP_DATA)) | ||
2994 | trans->block_group = block_group->key.objectid; | ||
2995 | |||
2996 | if (last_ptr) | ||
2997 | *last_ptr = ins->objectid + ins->offset; | ||
2998 | ret = 0; | ||
2999 | } else if (!ret) { | ||
3000 | printk(KERN_ERR "btrfs searching for %llu bytes, " | ||
3001 | "num_bytes %llu, loop %d, allowed_alloc %d\n", | ||
3002 | (unsigned long long)total_needed, | ||
3003 | (unsigned long long)num_bytes, | ||
3004 | loop, allowed_chunk_alloc); | ||
3005 | ret = -ENOSPC; | ||
3006 | } | ||
3007 | if (block_group) | ||
3008 | put_block_group(block_group); | ||
3009 | |||
3010 | up_read(&space_info->groups_sem); | ||
3011 | return ret; | ||
3012 | } | ||
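The three-phase loop above is easy to lose among the locking. Below is a minimal userspace sketch of the same search shape over a circular list of block groups; the names, the grow_pool callback (standing in for do_chunk_alloc), and the omission of the hint-only phase and the empty_size relaxation are all ours, not btrfs API:

#include <stddef.h>

struct group {
    struct group *next;   /* circular list; 'head' is a sentinel */
    int has_space;
};

/*
 * loop == 1: one full pass over the list.
 * loop == 2: one more full pass, taken only if the pool could be grown
 *            (a chunk allocation in the code above).
 */
static struct group *search(struct group *head, int (*grow_pool)(void))
{
    struct group *cur = head->next;
    int loop = 1;

    for (;; cur = cur->next) {
        if (cur == head) {                 /* wrapped: pass finished */
            if (loop == 2 || !grow_pool())
                return NULL;               /* genuinely out of space */
            loop = 2;
            cur = head->next;
            if (cur == head)
                return NULL;               /* empty list */
        }
        if (cur->has_space)
            return cur;
    }
}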
3013 | |||
3014 | static void dump_space_info(struct btrfs_space_info *info, u64 bytes) | ||
3015 | { | ||
3016 | struct btrfs_block_group_cache *cache; | ||
3017 | struct list_head *l; | ||
3018 | |||
3019 | printk(KERN_INFO "space_info has %llu free, is %sfull\n", | ||
3020 | (unsigned long long)(info->total_bytes - info->bytes_used - | ||
3021 | info->bytes_pinned - info->bytes_reserved), | ||
3022 | (info->full) ? "" : "not "); | ||
3023 | |||
3024 | down_read(&info->groups_sem); | ||
3025 | list_for_each(l, &info->block_groups) { | ||
3026 | cache = list_entry(l, struct btrfs_block_group_cache, list); | ||
3027 | spin_lock(&cache->lock); | ||
3028 | printk(KERN_INFO "block group %llu has %llu bytes, %llu used " | ||
3029 | "%llu pinned %llu reserved\n", | ||
3030 | (unsigned long long)cache->key.objectid, | ||
3031 | (unsigned long long)cache->key.offset, | ||
3032 | (unsigned long long)btrfs_block_group_used(&cache->item), | ||
3033 | (unsigned long long)cache->pinned, | ||
3034 | (unsigned long long)cache->reserved); | ||
3035 | btrfs_dump_free_space(cache, bytes); | ||
3036 | spin_unlock(&cache->lock); | ||
3037 | } | ||
3038 | up_read(&info->groups_sem); | ||
3039 | } | ||
3040 | |||
3041 | static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, | ||
3042 | struct btrfs_root *root, | ||
3043 | u64 num_bytes, u64 min_alloc_size, | ||
3044 | u64 empty_size, u64 hint_byte, | ||
3045 | u64 search_end, struct btrfs_key *ins, | ||
3046 | u64 data) | ||
3047 | { | ||
3048 | int ret; | ||
3049 | u64 search_start = 0; | ||
3050 | u64 alloc_profile; | ||
3051 | struct btrfs_fs_info *info = root->fs_info; | ||
3052 | |||
3053 | if (data) { | ||
3054 | alloc_profile = info->avail_data_alloc_bits & | ||
3055 | info->data_alloc_profile; | ||
3056 | data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; | ||
3057 | } else if (root == root->fs_info->chunk_root) { | ||
3058 | alloc_profile = info->avail_system_alloc_bits & | ||
3059 | info->system_alloc_profile; | ||
3060 | data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; | ||
3061 | } else { | ||
3062 | alloc_profile = info->avail_metadata_alloc_bits & | ||
3063 | info->metadata_alloc_profile; | ||
3064 | data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; | ||
3065 | } | ||
3066 | again: | ||
3067 | data = btrfs_reduce_alloc_profile(root, data); | ||
3068 | /* | ||
3069 | * the only place that sets empty_size is btrfs_realloc_node, which | ||
3070 | * is not called recursively on allocations | ||
3071 | */ | ||
3072 | if (empty_size || root->ref_cows) { | ||
3073 | if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { | ||
3074 | ret = do_chunk_alloc(trans, root->fs_info->extent_root, | ||
3075 | 2 * 1024 * 1024, | ||
3076 | BTRFS_BLOCK_GROUP_METADATA | | ||
3077 | (info->metadata_alloc_profile & | ||
3078 | info->avail_metadata_alloc_bits), 0); | ||
3079 | } | ||
3080 | ret = do_chunk_alloc(trans, root->fs_info->extent_root, | ||
3081 | num_bytes + 2 * 1024 * 1024, data, 0); | ||
3082 | } | ||
3083 | |||
3084 | WARN_ON(num_bytes < root->sectorsize); | ||
3085 | ret = find_free_extent(trans, root, num_bytes, empty_size, | ||
3086 | search_start, search_end, hint_byte, ins, | ||
3087 | trans->alloc_exclude_start, | ||
3088 | trans->alloc_exclude_nr, data); | ||
3089 | |||
3090 | if (ret == -ENOSPC && num_bytes > min_alloc_size) { | ||
3091 | num_bytes = num_bytes >> 1; | ||
3092 | num_bytes = num_bytes & ~(root->sectorsize - 1); | ||
3093 | num_bytes = max(num_bytes, min_alloc_size); | ||
3094 | do_chunk_alloc(trans, root->fs_info->extent_root, | ||
3095 | num_bytes, data, 1); | ||
3096 | goto again; | ||
3097 | } | ||
3098 | if (ret) { | ||
3099 | struct btrfs_space_info *sinfo; | ||
3100 | |||
3101 | sinfo = __find_space_info(root->fs_info, data); | ||
3102 | printk(KERN_ERR "btrfs allocation failed flags %llu, " | ||
3103 | "wanted %llu\n", (unsigned long long)data, | ||
3104 | (unsigned long long)num_bytes); | ||
3105 | dump_space_info(sinfo, num_bytes); | ||
3106 | BUG(); | ||
3107 | } | ||
3108 | |||
3109 | return ret; | ||
3110 | } | ||
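On -ENOSPC with num_bytes still above min_alloc_size, the function above halves the request, keeps it sector-aligned, and retries. A standalone model of that back-off (the function name is ours):

#include <stdint.h>
#include <stdio.h>

static uint64_t backoff(uint64_t num_bytes, uint64_t min_alloc_size,
                        uint64_t sectorsize)
{
    num_bytes >>= 1;                     /* halve the request */
    num_bytes &= ~(sectorsize - 1);      /* sectorsize: power of two */
    if (num_bytes < min_alloc_size)
        num_bytes = min_alloc_size;      /* never below the caller's floor */
    return num_bytes;
}

int main(void)
{
    /* a 1 MiB request on 4 KiB sectors with a 64 KiB floor shrinks
     * 512 KiB -> 256 KiB -> 128 KiB -> 64 KiB before giving up */
    uint64_t n = 1024 * 1024;

    while (n > 64 * 1024) {
        n = backoff(n, 64 * 1024, 4096);
        printf("%llu\n", (unsigned long long)n);
    }
    return 0;
}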
3111 | |||
3112 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) | ||
3113 | { | ||
3114 | struct btrfs_block_group_cache *cache; | ||
3115 | int ret = 0; | ||
3116 | |||
3117 | cache = btrfs_lookup_block_group(root->fs_info, start); | ||
3118 | if (!cache) { | ||
3119 | printk(KERN_ERR "Unable to find block group for %llu\n", | ||
3120 | (unsigned long long)start); | ||
3121 | return -ENOSPC; | ||
3122 | } | ||
3123 | |||
3124 | ret = btrfs_discard_extent(root, start, len); | ||
3125 | |||
3126 | btrfs_add_free_space(cache, start, len); | ||
3127 | put_block_group(cache); | ||
3128 | update_reserved_extents(root, start, len, 0); | ||
3129 | |||
3130 | return ret; | ||
3131 | } | ||
3132 | |||
3133 | int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | ||
3134 | struct btrfs_root *root, | ||
3135 | u64 num_bytes, u64 min_alloc_size, | ||
3136 | u64 empty_size, u64 hint_byte, | ||
3137 | u64 search_end, struct btrfs_key *ins, | ||
3138 | u64 data) | ||
3139 | { | ||
3140 | int ret; | ||
3141 | ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, | ||
3142 | empty_size, hint_byte, search_end, ins, | ||
3143 | data); | ||
3144 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | ||
3145 | return ret; | ||
3146 | } | ||
3147 | |||
3148 | static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | ||
3149 | struct btrfs_root *root, u64 parent, | ||
3150 | u64 root_objectid, u64 ref_generation, | ||
3151 | u64 owner, struct btrfs_key *ins) | ||
3152 | { | ||
3153 | int ret; | ||
3154 | int pending_ret; | ||
3155 | u64 super_used; | ||
3156 | u64 root_used; | ||
3157 | u64 num_bytes = ins->offset; | ||
3158 | u32 sizes[2]; | ||
3159 | struct btrfs_fs_info *info = root->fs_info; | ||
3160 | struct btrfs_root *extent_root = info->extent_root; | ||
3161 | struct btrfs_extent_item *extent_item; | ||
3162 | struct btrfs_extent_ref *ref; | ||
3163 | struct btrfs_path *path; | ||
3164 | struct btrfs_key keys[2]; | ||
3165 | |||
3166 | if (parent == 0) | ||
3167 | parent = ins->objectid; | ||
3168 | |||
3169 | /* block accounting for super block */ | ||
3170 | spin_lock(&info->delalloc_lock); | ||
3171 | super_used = btrfs_super_bytes_used(&info->super_copy); | ||
3172 | btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); | ||
3173 | |||
3174 | /* block accounting for root item */ | ||
3175 | root_used = btrfs_root_used(&root->root_item); | ||
3176 | btrfs_set_root_used(&root->root_item, root_used + num_bytes); | ||
3177 | spin_unlock(&info->delalloc_lock); | ||
3178 | |||
3179 | if (root == extent_root) { | ||
3180 | struct pending_extent_op *extent_op; | ||
3181 | |||
3182 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | ||
3183 | BUG_ON(!extent_op); | ||
3184 | |||
3185 | extent_op->type = PENDING_EXTENT_INSERT; | ||
3186 | extent_op->bytenr = ins->objectid; | ||
3187 | extent_op->num_bytes = ins->offset; | ||
3188 | extent_op->parent = parent; | ||
3189 | extent_op->orig_parent = 0; | ||
3190 | extent_op->generation = ref_generation; | ||
3191 | extent_op->orig_generation = 0; | ||
3192 | extent_op->level = (int)owner; | ||
3193 | INIT_LIST_HEAD(&extent_op->list); | ||
3194 | extent_op->del = 0; | ||
3195 | |||
3196 | mutex_lock(&root->fs_info->extent_ins_mutex); | ||
3197 | set_extent_bits(&root->fs_info->extent_ins, ins->objectid, | ||
3198 | ins->objectid + ins->offset - 1, | ||
3199 | EXTENT_WRITEBACK, GFP_NOFS); | ||
3200 | set_state_private(&root->fs_info->extent_ins, | ||
3201 | ins->objectid, (unsigned long)extent_op); | ||
3202 | mutex_unlock(&root->fs_info->extent_ins_mutex); | ||
3203 | goto update_block; | ||
3204 | } | ||
3205 | |||
3206 | memcpy(&keys[0], ins, sizeof(*ins)); | ||
3207 | keys[1].objectid = ins->objectid; | ||
3208 | keys[1].type = BTRFS_EXTENT_REF_KEY; | ||
3209 | keys[1].offset = parent; | ||
3210 | sizes[0] = sizeof(*extent_item); | ||
3211 | sizes[1] = sizeof(*ref); | ||
3212 | |||
3213 | path = btrfs_alloc_path(); | ||
3214 | BUG_ON(!path); | ||
3215 | |||
3216 | ret = btrfs_insert_empty_items(trans, extent_root, path, keys, | ||
3217 | sizes, 2); | ||
3218 | BUG_ON(ret); | ||
3219 | |||
3220 | extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
3221 | struct btrfs_extent_item); | ||
3222 | btrfs_set_extent_refs(path->nodes[0], extent_item, 1); | ||
3223 | ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, | ||
3224 | struct btrfs_extent_ref); | ||
3225 | |||
3226 | btrfs_set_ref_root(path->nodes[0], ref, root_objectid); | ||
3227 | btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); | ||
3228 | btrfs_set_ref_objectid(path->nodes[0], ref, owner); | ||
3229 | btrfs_set_ref_num_refs(path->nodes[0], ref, 1); | ||
3230 | |||
3231 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
3232 | |||
3233 | trans->alloc_exclude_start = 0; | ||
3234 | trans->alloc_exclude_nr = 0; | ||
3235 | btrfs_free_path(path); | ||
3236 | finish_current_insert(trans, extent_root, 0); | ||
3237 | pending_ret = del_pending_extents(trans, extent_root, 0); | ||
3238 | |||
3239 | if (ret) | ||
3240 | goto out; | ||
3241 | if (pending_ret) { | ||
3242 | ret = pending_ret; | ||
3243 | goto out; | ||
3244 | } | ||
3245 | |||
3246 | update_block: | ||
3247 | ret = update_block_group(trans, root, ins->objectid, | ||
3248 | ins->offset, 1, 0); | ||
3249 | if (ret) { | ||
3250 | printk(KERN_ERR "btrfs update block group failed for %llu " | ||
3251 | "%llu\n", (unsigned long long)ins->objectid, | ||
3252 | (unsigned long long)ins->offset); | ||
3253 | BUG(); | ||
3254 | } | ||
3255 | out: | ||
3256 | return ret; | ||
3257 | } | ||
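A design note on the double insert above: btrfs keys sort by (objectid, type, offset), so the extent item keyed by ins and its first backref at (bytenr, EXTENT_REF, parent) share an objectid and land next to each other in the extent tree. That adjacency is what lets a single btrfs_insert_empty_items call reserve both slots in one leaf. A userspace comparator showing the ordering (struct name is ours):

#include <stdint.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_cmp(const struct key *a, const struct key *b)
{
    if (a->objectid != b->objectid)
        return a->objectid < b->objectid ? -1 : 1;
    if (a->type != b->type)
        return a->type < b->type ? -1 : 1;
    if (a->offset != b->offset)
        return a->offset < b->offset ? -1 : 1;
    return 0;
}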
3258 | |||
3259 | int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | ||
3260 | struct btrfs_root *root, u64 parent, | ||
3261 | u64 root_objectid, u64 ref_generation, | ||
3262 | u64 owner, struct btrfs_key *ins) | ||
3263 | { | ||
3264 | int ret; | ||
3265 | |||
3266 | if (root_objectid == BTRFS_TREE_LOG_OBJECTID) | ||
3267 | return 0; | ||
3268 | ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, | ||
3269 | ref_generation, owner, ins); | ||
3270 | update_reserved_extents(root, ins->objectid, ins->offset, 0); | ||
3271 | return ret; | ||
3272 | } | ||
3273 | |||
3274 | /* | ||
3275 | * this is used by the tree logging recovery code. It records that | ||
3276 | * an extent has been allocated and makes sure to clear the free | ||
3277 | * space cache bits as well | ||
3278 | */ | ||
3279 | int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | ||
3280 | struct btrfs_root *root, u64 parent, | ||
3281 | u64 root_objectid, u64 ref_generation, | ||
3282 | u64 owner, struct btrfs_key *ins) | ||
3283 | { | ||
3284 | int ret; | ||
3285 | struct btrfs_block_group_cache *block_group; | ||
3286 | |||
3287 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); | ||
3288 | mutex_lock(&block_group->cache_mutex); | ||
3289 | cache_block_group(root, block_group); | ||
3290 | mutex_unlock(&block_group->cache_mutex); | ||
3291 | |||
3292 | ret = btrfs_remove_free_space(block_group, ins->objectid, | ||
3293 | ins->offset); | ||
3294 | BUG_ON(ret); | ||
3295 | put_block_group(block_group); | ||
3296 | ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, | ||
3297 | ref_generation, owner, ins); | ||
3298 | return ret; | ||
3299 | } | ||
3300 | |||
3301 | /* | ||
3302 | * finds a free extent and does all the dirty work required for | ||
3303 | * allocation. The key for the allocated extent is returned | ||
3304 | * through ins. | ||
3305 | * | ||
3306 | * returns 0 if everything worked, non-zero otherwise. | ||
3307 | */ | ||
3308 | int btrfs_alloc_extent(struct btrfs_trans_handle *trans, | ||
3309 | struct btrfs_root *root, | ||
3310 | u64 num_bytes, u64 parent, u64 min_alloc_size, | ||
3311 | u64 root_objectid, u64 ref_generation, | ||
3312 | u64 owner_objectid, u64 empty_size, u64 hint_byte, | ||
3313 | u64 search_end, struct btrfs_key *ins, u64 data) | ||
3314 | { | ||
3315 | int ret; | ||
3316 | |||
3317 | ret = __btrfs_reserve_extent(trans, root, num_bytes, | ||
3318 | min_alloc_size, empty_size, hint_byte, | ||
3319 | search_end, ins, data); | ||
3320 | BUG_ON(ret); | ||
3321 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { | ||
3322 | ret = __btrfs_alloc_reserved_extent(trans, root, parent, | ||
3323 | root_objectid, ref_generation, | ||
3324 | owner_objectid, ins); | ||
3325 | BUG_ON(ret); | ||
3326 | |||
3327 | } else { | ||
3328 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | ||
3329 | } | ||
3330 | return ret; | ||
3331 | } | ||
3332 | |||
3333 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, | ||
3334 | struct btrfs_root *root, | ||
3335 | u64 bytenr, u32 blocksize) | ||
3336 | { | ||
3337 | struct extent_buffer *buf; | ||
3338 | |||
3339 | buf = btrfs_find_create_tree_block(root, bytenr, blocksize); | ||
3340 | if (!buf) | ||
3341 | return ERR_PTR(-ENOMEM); | ||
3342 | btrfs_set_header_generation(buf, trans->transid); | ||
3343 | btrfs_tree_lock(buf); | ||
3344 | clean_tree_block(trans, root, buf); | ||
3345 | btrfs_set_buffer_uptodate(buf); | ||
3346 | if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { | ||
3347 | set_extent_dirty(&root->dirty_log_pages, buf->start, | ||
3348 | buf->start + buf->len - 1, GFP_NOFS); | ||
3349 | } else { | ||
3350 | set_extent_dirty(&trans->transaction->dirty_pages, buf->start, | ||
3351 | buf->start + buf->len - 1, GFP_NOFS); | ||
3352 | } | ||
3353 | trans->blocks_used++; | ||
3354 | return buf; | ||
3355 | } | ||
3356 | |||
3357 | /* | ||
3358 | * helper function to allocate a block for a given tree. | ||
3359 | * returns the tree buffer or an ERR_PTR on failure. | ||
3360 | */ | ||
3361 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | ||
3362 | struct btrfs_root *root, | ||
3363 | u32 blocksize, u64 parent, | ||
3364 | u64 root_objectid, | ||
3365 | u64 ref_generation, | ||
3366 | int level, | ||
3367 | u64 hint, | ||
3368 | u64 empty_size) | ||
3369 | { | ||
3370 | struct btrfs_key ins; | ||
3371 | int ret; | ||
3372 | struct extent_buffer *buf; | ||
3373 | |||
3374 | ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, | ||
3375 | root_objectid, ref_generation, level, | ||
3376 | empty_size, hint, (u64)-1, &ins, 0); | ||
3377 | if (ret) { | ||
3378 | BUG_ON(ret > 0); | ||
3379 | return ERR_PTR(ret); | ||
3380 | } | ||
3381 | |||
3382 | buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); | ||
3383 | return buf; | ||
3384 | } | ||
3385 | |||
3386 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | ||
3387 | struct btrfs_root *root, struct extent_buffer *leaf) | ||
3388 | { | ||
3389 | u64 leaf_owner; | ||
3390 | u64 leaf_generation; | ||
3391 | struct btrfs_key key; | ||
3392 | struct btrfs_file_extent_item *fi; | ||
3393 | int i; | ||
3394 | int nritems; | ||
3395 | int ret; | ||
3396 | |||
3397 | BUG_ON(!btrfs_is_leaf(leaf)); | ||
3398 | nritems = btrfs_header_nritems(leaf); | ||
3399 | leaf_owner = btrfs_header_owner(leaf); | ||
3400 | leaf_generation = btrfs_header_generation(leaf); | ||
3401 | |||
3402 | for (i = 0; i < nritems; i++) { | ||
3403 | u64 disk_bytenr; | ||
3404 | cond_resched(); | ||
3405 | |||
3406 | btrfs_item_key_to_cpu(leaf, &key, i); | ||
3407 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
3408 | continue; | ||
3409 | fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); | ||
3410 | if (btrfs_file_extent_type(leaf, fi) == | ||
3411 | BTRFS_FILE_EXTENT_INLINE) | ||
3412 | continue; | ||
3413 | /* | ||
3414 | * FIXME make sure to insert a trans record that | ||
3415 | * repeats the snapshot del on crash | ||
3416 | */ | ||
3417 | disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
3418 | if (disk_bytenr == 0) | ||
3419 | continue; | ||
3420 | |||
3421 | ret = __btrfs_free_extent(trans, root, disk_bytenr, | ||
3422 | btrfs_file_extent_disk_num_bytes(leaf, fi), | ||
3423 | leaf->start, leaf_owner, leaf_generation, | ||
3424 | key.objectid, 0); | ||
3425 | BUG_ON(ret); | ||
3426 | |||
3427 | atomic_inc(&root->fs_info->throttle_gen); | ||
3428 | wake_up(&root->fs_info->transaction_throttle); | ||
3429 | cond_resched(); | ||
3430 | } | ||
3431 | return 0; | ||
3432 | } | ||
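btrfs_drop_leaf_ref above is a filter loop over leaf items: skip items that are not file extents, skip inline extents, skip holes (disk_bytenr == 0), and drop a reference on everything else. A toy of the same shape over a plain array (the item layout and the free_extent callback are our simplifications, not btrfs API; 108 should be the on-disk EXTENT_DATA key type, and inline extents are modeled here as bytenr 0):

#include <stddef.h>

struct item {
    int type;                       /* 108 == EXTENT_DATA, assumed above */
    unsigned long long disk_bytenr; /* 0 models a hole (and, here, inline) */
};

static void drop_leaf_refs(const struct item *items, int nritems,
                           void (*free_extent)(unsigned long long bytenr))
{
    int i;

    for (i = 0; i < nritems; i++) {
        if (items[i].type != 108)
            continue;               /* not a file extent item */
        if (items[i].disk_bytenr == 0)
            continue;               /* nothing on disk to free */
        free_extent(items[i].disk_bytenr);
    }
}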
3433 | |||
3434 | static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, | ||
3435 | struct btrfs_root *root, | ||
3436 | struct btrfs_leaf_ref *ref) | ||
3437 | { | ||
3438 | int i; | ||
3439 | int ret; | ||
3440 | struct btrfs_extent_info *info = ref->extents; | ||
3441 | |||
3442 | for (i = 0; i < ref->nritems; i++) { | ||
3443 | ret = __btrfs_free_extent(trans, root, info->bytenr, | ||
3444 | info->num_bytes, ref->bytenr, | ||
3445 | ref->owner, ref->generation, | ||
3446 | info->objectid, 0); | ||
3447 | |||
3448 | atomic_inc(&root->fs_info->throttle_gen); | ||
3449 | wake_up(&root->fs_info->transaction_throttle); | ||
3450 | cond_resched(); | ||
3451 | |||
3452 | BUG_ON(ret); | ||
3453 | info++; | ||
3454 | } | ||
3455 | |||
3456 | return 0; | ||
3457 | } | ||
3458 | |||
3459 | static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, | ||
3460 | u64 len, u32 *refs) | ||
3461 | { | ||
3462 | int ret; | ||
3463 | |||
3464 | ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); | ||
3465 | BUG_ON(ret); | ||
3466 | |||
3467 | #if 0 /* some debugging code in case we see problems here */ | ||
3468 | /* if the refs count is one, it won't get increased again. But | ||
3469 | * if the ref count is > 1, someone may be decreasing it at | ||
3470 | * the same time we are. | ||
3471 | */ | ||
3472 | if (*refs != 1) { | ||
3473 | struct extent_buffer *eb = NULL; | ||
3474 | eb = btrfs_find_create_tree_block(root, start, len); | ||
3475 | if (eb) | ||
3476 | btrfs_tree_lock(eb); | ||
3477 | |||
3478 | mutex_lock(&root->fs_info->alloc_mutex); | ||
3479 | ret = lookup_extent_ref(NULL, root, start, len, refs); | ||
3480 | BUG_ON(ret); | ||
3481 | mutex_unlock(&root->fs_info->alloc_mutex); | ||
3482 | |||
3483 | if (eb) { | ||
3484 | btrfs_tree_unlock(eb); | ||
3485 | free_extent_buffer(eb); | ||
3486 | } | ||
3487 | if (*refs == 1) { | ||
3488 | printk(KERN_ERR "btrfs block %llu went down to one " | ||
3489 | "during drop_snap\n", (unsigned long long)start); | ||
3490 | } | ||
3491 | |||
3492 | } | ||
3493 | #endif | ||
3494 | |||
3495 | cond_resched(); | ||
3496 | return ret; | ||
3497 | } | ||
3498 | |||
3499 | /* | ||
3500 | * helper function for drop_snapshot; this walks down the tree dropping ref | ||
3501 | * counts as it goes. | ||
3502 | */ | ||
3503 | static noinline int walk_down_tree(struct btrfs_trans_handle *trans, | ||
3504 | struct btrfs_root *root, | ||
3505 | struct btrfs_path *path, int *level) | ||
3506 | { | ||
3507 | u64 root_owner; | ||
3508 | u64 root_gen; | ||
3509 | u64 bytenr; | ||
3510 | u64 ptr_gen; | ||
3511 | struct extent_buffer *next; | ||
3512 | struct extent_buffer *cur; | ||
3513 | struct extent_buffer *parent; | ||
3514 | struct btrfs_leaf_ref *ref; | ||
3515 | u32 blocksize; | ||
3516 | int ret; | ||
3517 | u32 refs; | ||
3518 | |||
3519 | WARN_ON(*level < 0); | ||
3520 | WARN_ON(*level >= BTRFS_MAX_LEVEL); | ||
3521 | ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, | ||
3522 | path->nodes[*level]->len, &refs); | ||
3523 | BUG_ON(ret); | ||
3524 | if (refs > 1) | ||
3525 | goto out; | ||
3526 | |||
3527 | /* | ||
3528 | * walk down to the last node level and free all the leaves | ||
3529 | */ | ||
3530 | while (*level >= 0) { | ||
3531 | WARN_ON(*level < 0); | ||
3532 | WARN_ON(*level >= BTRFS_MAX_LEVEL); | ||
3533 | cur = path->nodes[*level]; | ||
3534 | |||
3535 | WARN_ON(btrfs_header_level(cur) != *level); | ||
3537 | |||
3538 | if (path->slots[*level] >= | ||
3539 | btrfs_header_nritems(cur)) | ||
3540 | break; | ||
3541 | if (*level == 0) { | ||
3542 | ret = btrfs_drop_leaf_ref(trans, root, cur); | ||
3543 | BUG_ON(ret); | ||
3544 | break; | ||
3545 | } | ||
3546 | bytenr = btrfs_node_blockptr(cur, path->slots[*level]); | ||
3547 | ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); | ||
3548 | blocksize = btrfs_level_size(root, *level - 1); | ||
3549 | |||
3550 | ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); | ||
3551 | BUG_ON(ret); | ||
3552 | if (refs != 1) { | ||
3553 | parent = path->nodes[*level]; | ||
3554 | root_owner = btrfs_header_owner(parent); | ||
3555 | root_gen = btrfs_header_generation(parent); | ||
3556 | path->slots[*level]++; | ||
3557 | |||
3558 | ret = __btrfs_free_extent(trans, root, bytenr, | ||
3559 | blocksize, parent->start, | ||
3560 | root_owner, root_gen, | ||
3561 | *level - 1, 1); | ||
3562 | BUG_ON(ret); | ||
3563 | |||
3564 | atomic_inc(&root->fs_info->throttle_gen); | ||
3565 | wake_up(&root->fs_info->transaction_throttle); | ||
3566 | cond_resched(); | ||
3567 | |||
3568 | continue; | ||
3569 | } | ||
3570 | /* | ||
3571 | * at this point, we have a single ref, and since the | ||
3572 | * only place referencing this extent is a dead root, | ||
3573 | * the reference count should never go higher, so we | ||
3574 | * don't need to check it again. | ||
3575 | */ | ||
3576 | if (*level == 1) { | ||
3577 | ref = btrfs_lookup_leaf_ref(root, bytenr); | ||
3578 | if (ref && ref->generation != ptr_gen) { | ||
3579 | btrfs_free_leaf_ref(root, ref); | ||
3580 | ref = NULL; | ||
3581 | } | ||
3582 | if (ref) { | ||
3583 | ret = cache_drop_leaf_ref(trans, root, ref); | ||
3584 | BUG_ON(ret); | ||
3585 | btrfs_remove_leaf_ref(root, ref); | ||
3586 | btrfs_free_leaf_ref(root, ref); | ||
3587 | *level = 0; | ||
3588 | break; | ||
3589 | } | ||
3590 | } | ||
3591 | next = btrfs_find_tree_block(root, bytenr, blocksize); | ||
3592 | if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { | ||
3593 | free_extent_buffer(next); | ||
3594 | |||
3595 | next = read_tree_block(root, bytenr, blocksize, | ||
3596 | ptr_gen); | ||
3597 | cond_resched(); | ||
3598 | #if 0 | ||
3599 | /* | ||
3600 | * this is a debugging check and can go away; | ||
3601 | * the ref should never go all the way down to 1 | ||
3602 | * at this point | ||
3603 | */ | ||
3604 | ret = lookup_extent_ref(NULL, root, bytenr, blocksize, | ||
3605 | &refs); | ||
3606 | BUG_ON(ret); | ||
3607 | WARN_ON(refs != 1); | ||
3608 | #endif | ||
3609 | } | ||
3610 | WARN_ON(*level <= 0); | ||
3611 | if (path->nodes[*level-1]) | ||
3612 | free_extent_buffer(path->nodes[*level-1]); | ||
3613 | path->nodes[*level-1] = next; | ||
3614 | *level = btrfs_header_level(next); | ||
3615 | path->slots[*level] = 0; | ||
3616 | cond_resched(); | ||
3617 | } | ||
3618 | out: | ||
3619 | WARN_ON(*level < 0); | ||
3620 | WARN_ON(*level >= BTRFS_MAX_LEVEL); | ||
3621 | |||
3622 | if (path->nodes[*level] == root->node) { | ||
3623 | parent = path->nodes[*level]; | ||
3624 | bytenr = path->nodes[*level]->start; | ||
3625 | } else { | ||
3626 | parent = path->nodes[*level + 1]; | ||
3627 | bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); | ||
3628 | } | ||
3629 | |||
3630 | blocksize = btrfs_level_size(root, *level); | ||
3631 | root_owner = btrfs_header_owner(parent); | ||
3632 | root_gen = btrfs_header_generation(parent); | ||
3633 | |||
3634 | ret = __btrfs_free_extent(trans, root, bytenr, blocksize, | ||
3635 | parent->start, root_owner, root_gen, | ||
3636 | *level, 1); | ||
3637 | free_extent_buffer(path->nodes[*level]); | ||
3638 | path->nodes[*level] = NULL; | ||
3639 | *level += 1; | ||
3640 | BUG_ON(ret); | ||
3641 | |||
3642 | cond_resched(); | ||
3643 | return 0; | ||
3644 | } | ||
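The descent above never recurses; it keeps one (node, slot) pair per level and either drops a ref on a shared child in place or steps into an exclusively owned one. A userspace model of that control flow (the node layout and ref handling are simplified stand-ins for the extent-tree calls):

#define MAX_LEVEL 8

struct node {
    int nritems;
    int refs;
    struct node **child;    /* only meaningful above level 0 */
};

static void walk_down(struct node *path[MAX_LEVEL], int slots[MAX_LEVEL],
                      int *level)
{
    while (*level >= 0) {
        struct node *cur = path[*level];
        struct node *next;

        if (slots[*level] >= cur->nritems)
            break;                   /* finished this node: walk back up */
        if (*level == 0)
            break;                   /* leaf: items handled by the caller */

        next = cur->child[slots[*level]];
        if (next->refs > 1) {
            next->refs--;            /* shared below here: drop one ref */
            slots[*level]++;
            continue;
        }
        (*level)--;                  /* exclusively owned: descend into it */
        path[*level] = next;
        slots[*level] = 0;
    }
}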
3645 | |||
3646 | /* | ||
3647 | * helper function for drop_subtree; this function is similar to | ||
3648 | * walk_down_tree. The main difference is that it checks reference | ||
3649 | * counts while tree blocks are locked. | ||
3650 | */ | ||
3651 | static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | ||
3652 | struct btrfs_root *root, | ||
3653 | struct btrfs_path *path, int *level) | ||
3654 | { | ||
3655 | struct extent_buffer *next; | ||
3656 | struct extent_buffer *cur; | ||
3657 | struct extent_buffer *parent; | ||
3658 | u64 bytenr; | ||
3659 | u64 ptr_gen; | ||
3660 | u32 blocksize; | ||
3661 | u32 refs; | ||
3662 | int ret; | ||
3663 | |||
3664 | cur = path->nodes[*level]; | ||
3665 | ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, | ||
3666 | &refs); | ||
3667 | BUG_ON(ret); | ||
3668 | if (refs > 1) | ||
3669 | goto out; | ||
3670 | |||
3671 | while (*level >= 0) { | ||
3672 | cur = path->nodes[*level]; | ||
3673 | if (*level == 0) { | ||
3674 | ret = btrfs_drop_leaf_ref(trans, root, cur); | ||
3675 | BUG_ON(ret); | ||
3676 | clean_tree_block(trans, root, cur); | ||
3677 | break; | ||
3678 | } | ||
3679 | if (path->slots[*level] >= btrfs_header_nritems(cur)) { | ||
3680 | clean_tree_block(trans, root, cur); | ||
3681 | break; | ||
3682 | } | ||
3683 | |||
3684 | bytenr = btrfs_node_blockptr(cur, path->slots[*level]); | ||
3685 | blocksize = btrfs_level_size(root, *level - 1); | ||
3686 | ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); | ||
3687 | |||
3688 | next = read_tree_block(root, bytenr, blocksize, ptr_gen); | ||
3689 | btrfs_tree_lock(next); | ||
3690 | |||
3691 | ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, | ||
3692 | &refs); | ||
3693 | BUG_ON(ret); | ||
3694 | if (refs > 1) { | ||
3695 | parent = path->nodes[*level]; | ||
3696 | ret = btrfs_free_extent(trans, root, bytenr, | ||
3697 | blocksize, parent->start, | ||
3698 | btrfs_header_owner(parent), | ||
3699 | btrfs_header_generation(parent), | ||
3700 | *level - 1, 1); | ||
3701 | BUG_ON(ret); | ||
3702 | path->slots[*level]++; | ||
3703 | btrfs_tree_unlock(next); | ||
3704 | free_extent_buffer(next); | ||
3705 | continue; | ||
3706 | } | ||
3707 | |||
3708 | *level = btrfs_header_level(next); | ||
3709 | path->nodes[*level] = next; | ||
3710 | path->slots[*level] = 0; | ||
3711 | path->locks[*level] = 1; | ||
3712 | cond_resched(); | ||
3713 | } | ||
3714 | out: | ||
3715 | parent = path->nodes[*level + 1]; | ||
3716 | bytenr = path->nodes[*level]->start; | ||
3717 | blocksize = path->nodes[*level]->len; | ||
3718 | |||
3719 | ret = btrfs_free_extent(trans, root, bytenr, blocksize, | ||
3720 | parent->start, btrfs_header_owner(parent), | ||
3721 | btrfs_header_generation(parent), *level, 1); | ||
3722 | BUG_ON(ret); | ||
3723 | |||
3724 | if (path->locks[*level]) { | ||
3725 | btrfs_tree_unlock(path->nodes[*level]); | ||
3726 | path->locks[*level] = 0; | ||
3727 | } | ||
3728 | free_extent_buffer(path->nodes[*level]); | ||
3729 | path->nodes[*level] = NULL; | ||
3730 | *level += 1; | ||
3731 | cond_resched(); | ||
3732 | return 0; | ||
3733 | } | ||
3734 | |||
3735 | /* | ||
3736 | * helper for dropping snapshots. This walks back up the tree in the path | ||
3737 | * to find the first node higher up where we haven't yet gone through | ||
3738 | * all the slots | ||
3739 | */ | ||
3740 | static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | ||
3741 | struct btrfs_root *root, | ||
3742 | struct btrfs_path *path, | ||
3743 | int *level, int max_level) | ||
3744 | { | ||
3745 | u64 root_owner; | ||
3746 | u64 root_gen; | ||
3747 | struct btrfs_root_item *root_item = &root->root_item; | ||
3748 | int i; | ||
3749 | int slot; | ||
3750 | int ret; | ||
3751 | |||
3752 | for (i = *level; i < max_level && path->nodes[i]; i++) { | ||
3753 | slot = path->slots[i]; | ||
3754 | if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { | ||
3755 | struct extent_buffer *node; | ||
3756 | struct btrfs_disk_key disk_key; | ||
3757 | node = path->nodes[i]; | ||
3758 | path->slots[i]++; | ||
3759 | *level = i; | ||
3760 | WARN_ON(*level == 0); | ||
3761 | btrfs_node_key(node, &disk_key, path->slots[i]); | ||
3762 | memcpy(&root_item->drop_progress, | ||
3763 | &disk_key, sizeof(disk_key)); | ||
3764 | root_item->drop_level = i; | ||
3765 | return 0; | ||
3766 | } else { | ||
3767 | struct extent_buffer *parent; | ||
3768 | if (path->nodes[*level] == root->node) | ||
3769 | parent = path->nodes[*level]; | ||
3770 | else | ||
3771 | parent = path->nodes[*level + 1]; | ||
3772 | |||
3773 | root_owner = btrfs_header_owner(parent); | ||
3774 | root_gen = btrfs_header_generation(parent); | ||
3775 | |||
3776 | clean_tree_block(trans, root, path->nodes[*level]); | ||
3777 | ret = btrfs_free_extent(trans, root, | ||
3778 | path->nodes[*level]->start, | ||
3779 | path->nodes[*level]->len, | ||
3780 | parent->start, root_owner, | ||
3781 | root_gen, *level, 1); | ||
3782 | BUG_ON(ret); | ||
3783 | if (path->locks[*level]) { | ||
3784 | btrfs_tree_unlock(path->nodes[*level]); | ||
3785 | path->locks[*level] = 0; | ||
3786 | } | ||
3787 | free_extent_buffer(path->nodes[*level]); | ||
3788 | path->nodes[*level] = NULL; | ||
3789 | *level = i + 1; | ||
3790 | } | ||
3791 | } | ||
3792 | return 1; | ||
3793 | } | ||
3794 | |||
3795 | /* | ||
3796 | * drop the reference count on the tree rooted at 'root'. This traverses | ||
3797 | * the tree freeing any blocks that have a ref count of zero after being | ||
3798 | * decremented. | ||
3799 | */ | ||
3800 | int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | ||
3801 | *root) | ||
3802 | { | ||
3803 | int ret = 0; | ||
3804 | int wret; | ||
3805 | int level; | ||
3806 | struct btrfs_path *path; | ||
3807 | int i; | ||
3808 | int orig_level; | ||
3809 | struct btrfs_root_item *root_item = &root->root_item; | ||
3810 | |||
3811 | WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); | ||
3812 | path = btrfs_alloc_path(); | ||
3813 | BUG_ON(!path); | ||
3814 | |||
3815 | level = btrfs_header_level(root->node); | ||
3816 | orig_level = level; | ||
3817 | if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { | ||
3818 | path->nodes[level] = root->node; | ||
3819 | extent_buffer_get(root->node); | ||
3820 | path->slots[level] = 0; | ||
3821 | } else { | ||
3822 | struct btrfs_key key; | ||
3823 | struct btrfs_disk_key found_key; | ||
3824 | struct extent_buffer *node; | ||
3825 | |||
3826 | btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); | ||
3827 | level = root_item->drop_level; | ||
3828 | path->lowest_level = level; | ||
3829 | wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
3830 | if (wret < 0) { | ||
3831 | ret = wret; | ||
3832 | goto out; | ||
3833 | } | ||
3834 | node = path->nodes[level]; | ||
3835 | btrfs_node_key(node, &found_key, path->slots[level]); | ||
3836 | WARN_ON(memcmp(&found_key, &root_item->drop_progress, | ||
3837 | sizeof(found_key))); | ||
3838 | /* | ||
3839 | * unlock our path; this is safe because only this | ||
3840 | * function is allowed to delete this snapshot | ||
3841 | */ | ||
3842 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) { | ||
3843 | if (path->nodes[i] && path->locks[i]) { | ||
3844 | path->locks[i] = 0; | ||
3845 | btrfs_tree_unlock(path->nodes[i]); | ||
3846 | } | ||
3847 | } | ||
3848 | } | ||
3849 | while (1) { | ||
3850 | wret = walk_down_tree(trans, root, path, &level); | ||
3851 | if (wret > 0) | ||
3852 | break; | ||
3853 | if (wret < 0) | ||
3854 | ret = wret; | ||
3855 | |||
3856 | wret = walk_up_tree(trans, root, path, &level, | ||
3857 | BTRFS_MAX_LEVEL); | ||
3858 | if (wret > 0) | ||
3859 | break; | ||
3860 | if (wret < 0) | ||
3861 | ret = wret; | ||
3862 | if (trans->transaction->in_commit) { | ||
3863 | ret = -EAGAIN; | ||
3864 | break; | ||
3865 | } | ||
3866 | atomic_inc(&root->fs_info->throttle_gen); | ||
3867 | wake_up(&root->fs_info->transaction_throttle); | ||
3868 | } | ||
3869 | for (i = 0; i <= orig_level; i++) { | ||
3870 | if (path->nodes[i]) { | ||
3871 | free_extent_buffer(path->nodes[i]); | ||
3872 | path->nodes[i] = NULL; | ||
3873 | } | ||
3874 | } | ||
3875 | out: | ||
3876 | btrfs_free_path(path); | ||
3877 | return ret; | ||
3878 | } | ||
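The loop above alternates walking down (freeing) with walking up (advancing to the next unvisited slot) until the walk-up runs off the root, and bails out with -EAGAIN when a commit is pending, since drop_progress lets a later transaction resume. A toy driver with stub walkers showing just that control flow (all names and the 'pending' counter are ours, not btrfs API):

#include <errno.h>

static int pending = 3;                     /* pretend amount of subtree work */

static int walk_down(void) { if (pending) pending--; return 0; }
static int walk_up(void)   { return pending == 0; } /* 1: walked off the root */
static int commit_pending(void) { return 0; }

static int drop_snapshot(void)
{
    for (;;) {
        if (walk_down() > 0)
            break;
        if (walk_up() > 0)
            break;                          /* nothing left to visit */
        if (commit_pending())
            return -EAGAIN;                 /* resume later via drop_progress */
    }
    return 0;
}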
3879 | |||
3880 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | ||
3881 | struct btrfs_root *root, | ||
3882 | struct extent_buffer *node, | ||
3883 | struct extent_buffer *parent) | ||
3884 | { | ||
3885 | struct btrfs_path *path; | ||
3886 | int level; | ||
3887 | int parent_level; | ||
3888 | int ret = 0; | ||
3889 | int wret; | ||
3890 | |||
3891 | path = btrfs_alloc_path(); | ||
3892 | BUG_ON(!path); | ||
3893 | |||
3894 | BUG_ON(!btrfs_tree_locked(parent)); | ||
3895 | parent_level = btrfs_header_level(parent); | ||
3896 | extent_buffer_get(parent); | ||
3897 | path->nodes[parent_level] = parent; | ||
3898 | path->slots[parent_level] = btrfs_header_nritems(parent); | ||
3899 | |||
3900 | BUG_ON(!btrfs_tree_locked(node)); | ||
3901 | level = btrfs_header_level(node); | ||
3902 | extent_buffer_get(node); | ||
3903 | path->nodes[level] = node; | ||
3904 | path->slots[level] = 0; | ||
3905 | |||
3906 | while (1) { | ||
3907 | wret = walk_down_subtree(trans, root, path, &level); | ||
3908 | if (wret < 0) | ||
3909 | ret = wret; | ||
3910 | if (wret != 0) | ||
3911 | break; | ||
3912 | |||
3913 | wret = walk_up_tree(trans, root, path, &level, parent_level); | ||
3914 | if (wret < 0) | ||
3915 | ret = wret; | ||
3916 | if (wret != 0) | ||
3917 | break; | ||
3918 | } | ||
3919 | |||
3920 | btrfs_free_path(path); | ||
3921 | return ret; | ||
3922 | } | ||
3923 | |||
3924 | static unsigned long calc_ra(unsigned long start, unsigned long last, | ||
3925 | unsigned long nr) | ||
3926 | { | ||
3927 | return min(last, start + nr - 1); | ||
3928 | } | ||
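Concretely: with ra->ra_pages == 32, a readahead kicked off at page index i == 100 while last_index == 110 gets a window end of min(110, 100 + 32 - 1) = 110, so the window is clipped to the range being relocated rather than the full 32 pages.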
3929 | |||
3930 | static noinline int relocate_inode_pages(struct inode *inode, u64 start, | ||
3931 | u64 len) | ||
3932 | { | ||
3933 | u64 page_start; | ||
3934 | u64 page_end; | ||
3935 | unsigned long first_index; | ||
3936 | unsigned long last_index; | ||
3937 | unsigned long i; | ||
3938 | struct page *page; | ||
3939 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
3940 | struct file_ra_state *ra; | ||
3941 | struct btrfs_ordered_extent *ordered; | ||
3942 | unsigned int total_read = 0; | ||
3943 | unsigned int total_dirty = 0; | ||
3944 | int ret = 0; | ||
3945 | |||
3946 | ra = kzalloc(sizeof(*ra), GFP_NOFS); | ||
 | if (!ra) | ||
 | return -ENOMEM; | ||
3947 | |||
3948 | mutex_lock(&inode->i_mutex); | ||
3949 | first_index = start >> PAGE_CACHE_SHIFT; | ||
3950 | last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; | ||
3951 | |||
3952 | /* make sure the dirty trick played by the caller works */ | ||
3953 | ret = invalidate_inode_pages2_range(inode->i_mapping, | ||
3954 | first_index, last_index); | ||
3955 | if (ret) | ||
3956 | goto out_unlock; | ||
3957 | |||
3958 | file_ra_state_init(ra, inode->i_mapping); | ||
3959 | |||
3960 | for (i = first_index; i <= last_index; i++) { | ||
3961 | if (total_read % ra->ra_pages == 0) { | ||
3962 | btrfs_force_ra(inode->i_mapping, ra, NULL, i, | ||
3963 | calc_ra(i, last_index, ra->ra_pages)); | ||
3964 | } | ||
3965 | total_read++; | ||
3966 | again: | ||
3967 | BUG_ON(((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)); | ||
3969 | page = grab_cache_page(inode->i_mapping, i); | ||
3970 | if (!page) { | ||
3971 | ret = -ENOMEM; | ||
3972 | goto out_unlock; | ||
3973 | } | ||
3974 | if (!PageUptodate(page)) { | ||
3975 | btrfs_readpage(NULL, page); | ||
3976 | lock_page(page); | ||
3977 | if (!PageUptodate(page)) { | ||
3978 | unlock_page(page); | ||
3979 | page_cache_release(page); | ||
3980 | ret = -EIO; | ||
3981 | goto out_unlock; | ||
3982 | } | ||
3983 | } | ||
3984 | wait_on_page_writeback(page); | ||
3985 | |||
3986 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
3987 | page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
3988 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
3989 | |||
3990 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | ||
3991 | if (ordered) { | ||
3992 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
3993 | unlock_page(page); | ||
3994 | page_cache_release(page); | ||
3995 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
3996 | btrfs_put_ordered_extent(ordered); | ||
3997 | goto again; | ||
3998 | } | ||
3999 | set_page_extent_mapped(page); | ||
4000 | |||
4001 | if (i == first_index) | ||
4002 | set_extent_bits(io_tree, page_start, page_end, | ||
4003 | EXTENT_BOUNDARY, GFP_NOFS); | ||
4004 | btrfs_set_extent_delalloc(inode, page_start, page_end); | ||
4005 | |||
4006 | set_page_dirty(page); | ||
4007 | total_dirty++; | ||
4008 | |||
4009 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
4010 | unlock_page(page); | ||
4011 | page_cache_release(page); | ||
4012 | } | ||
4013 | |||
4014 | out_unlock: | ||
4015 | kfree(ra); | ||
4016 | mutex_unlock(&inode->i_mutex); | ||
4017 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); | ||
4018 | return ret; | ||
4019 | } | ||
4020 | |||
4021 | static noinline int relocate_data_extent(struct inode *reloc_inode, | ||
4022 | struct btrfs_key *extent_key, | ||
4023 | u64 offset) | ||
4024 | { | ||
4025 | struct btrfs_root *root = BTRFS_I(reloc_inode)->root; | ||
4026 | struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; | ||
4027 | struct extent_map *em; | ||
4028 | u64 start = extent_key->objectid - offset; | ||
4029 | u64 end = start + extent_key->offset - 1; | ||
4030 | |||
4031 | em = alloc_extent_map(GFP_NOFS); | ||
4032 | BUG_ON(!em || IS_ERR(em)); | ||
4033 | |||
4034 | em->start = start; | ||
4035 | em->len = extent_key->offset; | ||
4036 | em->block_len = extent_key->offset; | ||
4037 | em->block_start = extent_key->objectid; | ||
4038 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
4039 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
4040 | |||
4041 | /* set up an extent map to cheat btrfs_readpage */ | ||
4042 | lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); | ||
4043 | while (1) { | ||
4044 | int ret; | ||
4045 | spin_lock(&em_tree->lock); | ||
4046 | ret = add_extent_mapping(em_tree, em); | ||
4047 | spin_unlock(&em_tree->lock); | ||
4048 | if (ret != -EEXIST) { | ||
4049 | free_extent_map(em); | ||
4050 | break; | ||
4051 | } | ||
4052 | btrfs_drop_extent_cache(reloc_inode, start, end, 0); | ||
4053 | } | ||
4054 | unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); | ||
4055 | |||
4056 | return relocate_inode_pages(reloc_inode, start, extent_key->offset); | ||
4057 | } | ||
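The add/drop loop above is an insert-or-evict pattern: try to publish the new mapping, and on -EEXIST evict whatever cached extent overlaps, then try again. The same shape with a toy one-slot cache (names are ours):

#include <errno.h>

struct slot { unsigned long start, len; int used; };

static int try_insert(struct slot *s, unsigned long start, unsigned long len)
{
    if (s->used)
        return -EEXIST;          /* something is already cached there */
    s->start = start;
    s->len = len;
    s->used = 1;
    return 0;
}

static void insert_mapping(struct slot *s, unsigned long start,
                           unsigned long len)
{
    while (try_insert(s, start, len) == -EEXIST)
        s->used = 0;             /* evict the stale entry, then retry */
}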
4058 | |||
4059 | struct btrfs_ref_path { | ||
4060 | u64 extent_start; | ||
4061 | u64 nodes[BTRFS_MAX_LEVEL]; | ||
4062 | u64 root_objectid; | ||
4063 | u64 root_generation; | ||
4064 | u64 owner_objectid; | ||
4065 | u32 num_refs; | ||
4066 | int lowest_level; | ||
4067 | int current_level; | ||
4068 | int shared_level; | ||
4069 | |||
4070 | struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; | ||
4071 | u64 new_nodes[BTRFS_MAX_LEVEL]; | ||
4072 | }; | ||
4073 | |||
4074 | struct disk_extent { | ||
4075 | u64 ram_bytes; | ||
4076 | u64 disk_bytenr; | ||
4077 | u64 disk_num_bytes; | ||
4078 | u64 offset; | ||
4079 | u64 num_bytes; | ||
4080 | u8 compression; | ||
4081 | u8 encryption; | ||
4082 | u16 other_encoding; | ||
4083 | }; | ||
4084 | |||
4085 | static int is_cowonly_root(u64 root_objectid) | ||
4086 | { | ||
4087 | if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || | ||
4088 | root_objectid == BTRFS_EXTENT_TREE_OBJECTID || | ||
4089 | root_objectid == BTRFS_CHUNK_TREE_OBJECTID || | ||
4090 | root_objectid == BTRFS_DEV_TREE_OBJECTID || | ||
4091 | root_objectid == BTRFS_TREE_LOG_OBJECTID || | ||
4092 | root_objectid == BTRFS_CSUM_TREE_OBJECTID) | ||
4093 | return 1; | ||
4094 | return 0; | ||
4095 | } | ||
4096 | |||
4097 | static noinline int __next_ref_path(struct btrfs_trans_handle *trans, | ||
4098 | struct btrfs_root *extent_root, | ||
4099 | struct btrfs_ref_path *ref_path, | ||
4100 | int first_time) | ||
4101 | { | ||
4102 | struct extent_buffer *leaf; | ||
4103 | struct btrfs_path *path; | ||
4104 | struct btrfs_extent_ref *ref; | ||
4105 | struct btrfs_key key; | ||
4106 | struct btrfs_key found_key; | ||
4107 | u64 bytenr; | ||
4108 | u32 nritems; | ||
4109 | int level; | ||
4110 | int ret = 1; | ||
4111 | |||
4112 | path = btrfs_alloc_path(); | ||
4113 | if (!path) | ||
4114 | return -ENOMEM; | ||
4115 | |||
4116 | if (first_time) { | ||
4117 | ref_path->lowest_level = -1; | ||
4118 | ref_path->current_level = -1; | ||
4119 | ref_path->shared_level = -1; | ||
4120 | goto walk_up; | ||
4121 | } | ||
4122 | walk_down: | ||
4123 | level = ref_path->current_level - 1; | ||
4124 | while (level >= -1) { | ||
4125 | u64 parent; | ||
4126 | if (level < ref_path->lowest_level) | ||
4127 | break; | ||
4128 | |||
4129 | if (level >= 0) | ||
4130 | bytenr = ref_path->nodes[level]; | ||
4131 | else | ||
4132 | bytenr = ref_path->extent_start; | ||
4133 | BUG_ON(bytenr == 0); | ||
4134 | |||
4135 | parent = ref_path->nodes[level + 1]; | ||
4136 | ref_path->nodes[level + 1] = 0; | ||
4137 | ref_path->current_level = level; | ||
4138 | BUG_ON(parent == 0); | ||
4139 | |||
4140 | key.objectid = bytenr; | ||
4141 | key.offset = parent + 1; | ||
4142 | key.type = BTRFS_EXTENT_REF_KEY; | ||
4143 | |||
4144 | ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); | ||
4145 | if (ret < 0) | ||
4146 | goto out; | ||
4147 | BUG_ON(ret == 0); | ||
4148 | |||
4149 | leaf = path->nodes[0]; | ||
4150 | nritems = btrfs_header_nritems(leaf); | ||
4151 | if (path->slots[0] >= nritems) { | ||
4152 | ret = btrfs_next_leaf(extent_root, path); | ||
4153 | if (ret < 0) | ||
4154 | goto out; | ||
4155 | if (ret > 0) | ||
4156 | goto next; | ||
4157 | leaf = path->nodes[0]; | ||
4158 | } | ||
4159 | |||
4160 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
4161 | if (found_key.objectid == bytenr && | ||
4162 | found_key.type == BTRFS_EXTENT_REF_KEY) { | ||
4163 | if (level < ref_path->shared_level) | ||
4164 | ref_path->shared_level = level; | ||
4165 | goto found; | ||
4166 | } | ||
4167 | next: | ||
4168 | level--; | ||
4169 | btrfs_release_path(extent_root, path); | ||
4170 | cond_resched(); | ||
4171 | } | ||
4172 | /* reached lowest level */ | ||
4173 | ret = 1; | ||
4174 | goto out; | ||
4175 | walk_up: | ||
4176 | level = ref_path->current_level; | ||
4177 | while (level < BTRFS_MAX_LEVEL - 1) { | ||
4178 | u64 ref_objectid; | ||
4179 | |||
4180 | if (level >= 0) | ||
4181 | bytenr = ref_path->nodes[level]; | ||
4182 | else | ||
4183 | bytenr = ref_path->extent_start; | ||
4184 | |||
4185 | BUG_ON(bytenr == 0); | ||
4186 | |||
4187 | key.objectid = bytenr; | ||
4188 | key.offset = 0; | ||
4189 | key.type = BTRFS_EXTENT_REF_KEY; | ||
4190 | |||
4191 | ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); | ||
4192 | if (ret < 0) | ||
4193 | goto out; | ||
4194 | |||
4195 | leaf = path->nodes[0]; | ||
4196 | nritems = btrfs_header_nritems(leaf); | ||
4197 | if (path->slots[0] >= nritems) { | ||
4198 | ret = btrfs_next_leaf(extent_root, path); | ||
4199 | if (ret < 0) | ||
4200 | goto out; | ||
4201 | if (ret > 0) { | ||
4202 | /* the extent was freed by someone */ | ||
4203 | if (ref_path->lowest_level == level) | ||
4204 | goto out; | ||
4205 | btrfs_release_path(extent_root, path); | ||
4206 | goto walk_down; | ||
4207 | } | ||
4208 | leaf = path->nodes[0]; | ||
4209 | } | ||
4210 | |||
4211 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
4212 | if (found_key.objectid != bytenr || | ||
4213 | found_key.type != BTRFS_EXTENT_REF_KEY) { | ||
4214 | /* the extent was freed by someone */ | ||
4215 | if (ref_path->lowest_level == level) { | ||
4216 | ret = 1; | ||
4217 | goto out; | ||
4218 | } | ||
4219 | btrfs_release_path(extent_root, path); | ||
4220 | goto walk_down; | ||
4221 | } | ||
4222 | found: | ||
4223 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
4224 | struct btrfs_extent_ref); | ||
4225 | ref_objectid = btrfs_ref_objectid(leaf, ref); | ||
4226 | if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { | ||
4227 | if (first_time) { | ||
4228 | level = (int)ref_objectid; | ||
4229 | BUG_ON(level >= BTRFS_MAX_LEVEL); | ||
4230 | ref_path->lowest_level = level; | ||
4231 | ref_path->current_level = level; | ||
4232 | ref_path->nodes[level] = bytenr; | ||
4233 | } else { | ||
4234 | WARN_ON(ref_objectid != level); | ||
4235 | } | ||
4236 | } else { | ||
4237 | WARN_ON(level != -1); | ||
4238 | } | ||
4239 | first_time = 0; | ||
4240 | |||
4241 | if (ref_path->lowest_level == level) { | ||
4242 | ref_path->owner_objectid = ref_objectid; | ||
4243 | ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); | ||
4244 | } | ||
4245 | |||
4246 | /* | ||
4247 | * the block is a tree root, or the block isn't in a | ||
4248 | * reference counted tree. | ||
4249 | */ | ||
4250 | if (found_key.objectid == found_key.offset || | ||
4251 | is_cowonly_root(btrfs_ref_root(leaf, ref))) { | ||
4252 | ref_path->root_objectid = btrfs_ref_root(leaf, ref); | ||
4253 | ref_path->root_generation = | ||
4254 | btrfs_ref_generation(leaf, ref); | ||
4255 | if (level < 0) { | ||
4256 | /* special reference from the tree log */ | ||
4257 | ref_path->nodes[0] = found_key.offset; | ||
4258 | ref_path->current_level = 0; | ||
4259 | } | ||
4260 | ret = 0; | ||
4261 | goto out; | ||
4262 | } | ||
4263 | |||
4264 | level++; | ||
4265 | BUG_ON(ref_path->nodes[level] != 0); | ||
4266 | ref_path->nodes[level] = found_key.offset; | ||
4267 | ref_path->current_level = level; | ||
4268 | |||
4269 | /* | ||
4270 | * the reference was created in the running transaction, so | ||
4271 | * there is no need to continue walking up. | ||
4272 | */ | ||
4273 | if (btrfs_ref_generation(leaf, ref) == trans->transid) { | ||
4274 | ref_path->root_objectid = btrfs_ref_root(leaf, ref); | ||
4275 | ref_path->root_generation = | ||
4276 | btrfs_ref_generation(leaf, ref); | ||
4277 | ret = 0; | ||
4278 | goto out; | ||
4279 | } | ||
4280 | |||
4281 | btrfs_release_path(extent_root, path); | ||
4282 | cond_resched(); | ||
4283 | } | ||
4284 | /* reached max tree level, but no tree root found. */ | ||
4285 | BUG(); | ||
4286 | out: | ||
4287 | btrfs_free_path(path); | ||
4288 | return ret; | ||
4289 | } | ||
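Both the walk_down and walk_up halves above rely on the standard b-tree cursor idiom: to visit the next backref of bytenr after parent, search for the smallest key >= (bytenr, BTRFS_EXTENT_REF_KEY, parent + 1), which is where btrfs_search_slot leaves the path when the exact key is absent. The same idea over a sorted array:

#include <stddef.h>
#include <stdint.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_less(const struct key *a, const struct key *b)
{
    if (a->objectid != b->objectid) return a->objectid < b->objectid;
    if (a->type != b->type)         return a->type < b->type;
    return a->offset < b->offset;
}

/* smallest index whose key is >= *want */
static size_t lower_bound(const struct key *keys, size_t n,
                          const struct key *want)
{
    size_t lo = 0, hi = n;

    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (key_less(&keys[mid], want))
            lo = mid + 1;
        else
            hi = mid;
    }
    return lo;
}

Advancing the cursor is then just lower_bound with offset = parent + 1, followed by a check that the found key still has the expected objectid and type, mirroring the found_key checks above.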
4290 | |||
4291 | static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, | ||
4292 | struct btrfs_root *extent_root, | ||
4293 | struct btrfs_ref_path *ref_path, | ||
4294 | u64 extent_start) | ||
4295 | { | ||
4296 | memset(ref_path, 0, sizeof(*ref_path)); | ||
4297 | ref_path->extent_start = extent_start; | ||
4298 | |||
4299 | return __next_ref_path(trans, extent_root, ref_path, 1); | ||
4300 | } | ||
4301 | |||
4302 | static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, | ||
4303 | struct btrfs_root *extent_root, | ||
4304 | struct btrfs_ref_path *ref_path) | ||
4305 | { | ||
4306 | return __next_ref_path(trans, extent_root, ref_path, 0); | ||
4307 | } | ||
4308 | |||
4309 | static noinline int get_new_locations(struct inode *reloc_inode, | ||
4310 | struct btrfs_key *extent_key, | ||
4311 | u64 offset, int no_fragment, | ||
4312 | struct disk_extent **extents, | ||
4313 | int *nr_extents) | ||
4314 | { | ||
4315 | struct btrfs_root *root = BTRFS_I(reloc_inode)->root; | ||
4316 | struct btrfs_path *path; | ||
4317 | struct btrfs_file_extent_item *fi; | ||
4318 | struct extent_buffer *leaf; | ||
4319 | struct disk_extent *exts = *extents; | ||
4320 | struct btrfs_key found_key; | ||
4321 | u64 cur_pos; | ||
4322 | u64 last_byte; | ||
4323 | u32 nritems; | ||
4324 | int nr = 0; | ||
4325 | int max = *nr_extents; | ||
4326 | int ret; | ||
4327 | |||
4328 | WARN_ON(!no_fragment && *extents); | ||
4329 | if (!exts) { | ||
4330 | max = 1; | ||
4331 | exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); | ||
4332 | if (!exts) | ||
4333 | return -ENOMEM; | ||
4334 | } | ||
4335 | |||
4336 | path = btrfs_alloc_path(); | ||
4337 | BUG_ON(!path); | ||
4338 | |||
4339 | cur_pos = extent_key->objectid - offset; | ||
4340 | last_byte = extent_key->objectid + extent_key->offset; | ||
4341 | ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, | ||
4342 | cur_pos, 0); | ||
4343 | if (ret < 0) | ||
4344 | goto out; | ||
4345 | if (ret > 0) { | ||
4346 | ret = -ENOENT; | ||
4347 | goto out; | ||
4348 | } | ||
4349 | |||
4350 | while (1) { | ||
4351 | leaf = path->nodes[0]; | ||
4352 | nritems = btrfs_header_nritems(leaf); | ||
4353 | if (path->slots[0] >= nritems) { | ||
4354 | ret = btrfs_next_leaf(root, path); | ||
4355 | if (ret < 0) | ||
4356 | goto out; | ||
4357 | if (ret > 0) | ||
4358 | break; | ||
4359 | leaf = path->nodes[0]; | ||
4360 | } | ||
4361 | |||
4362 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
4363 | if (found_key.offset != cur_pos || | ||
4364 | found_key.type != BTRFS_EXTENT_DATA_KEY || | ||
4365 | found_key.objectid != reloc_inode->i_ino) | ||
4366 | break; | ||
4367 | |||
4368 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
4369 | struct btrfs_file_extent_item); | ||
4370 | if (btrfs_file_extent_type(leaf, fi) != | ||
4371 | BTRFS_FILE_EXTENT_REG || | ||
4372 | btrfs_file_extent_disk_bytenr(leaf, fi) == 0) | ||
4373 | break; | ||
4374 | |||
4375 | if (nr == max) { | ||
4376 | struct disk_extent *old = exts; | ||
4377 | max *= 2; | ||
4378 | exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); | ||
 | if (!exts) { | ||
 | exts = old; | ||
 | ret = -ENOMEM; | ||
 | goto out; | ||
 | } | ||
4379 | memcpy(exts, old, sizeof(*exts) * nr); | ||
4380 | if (old != *extents) | ||
4381 | kfree(old); | ||
4382 | } | ||
4383 | |||
4384 | exts[nr].disk_bytenr = | ||
4385 | btrfs_file_extent_disk_bytenr(leaf, fi); | ||
4386 | exts[nr].disk_num_bytes = | ||
4387 | btrfs_file_extent_disk_num_bytes(leaf, fi); | ||
4388 | exts[nr].offset = btrfs_file_extent_offset(leaf, fi); | ||
4389 | exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); | ||
4390 | exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); | ||
4391 | exts[nr].compression = btrfs_file_extent_compression(leaf, fi); | ||
4392 | exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); | ||
4393 | exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, | ||
4394 | fi); | ||
4395 | BUG_ON(exts[nr].offset > 0); | ||
4396 | BUG_ON(exts[nr].compression || exts[nr].encryption); | ||
4397 | BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); | ||
4398 | |||
4399 | cur_pos += exts[nr].num_bytes; | ||
4400 | nr++; | ||
4401 | |||
4402 | if (cur_pos + offset >= last_byte) | ||
4403 | break; | ||
4404 | |||
4405 | if (no_fragment) { | ||
4406 | ret = 1; | ||
4407 | goto out; | ||
4408 | } | ||
4409 | path->slots[0]++; | ||
4410 | } | ||
4411 | |||
4412 | BUG_ON(cur_pos + offset > last_byte); | ||
4413 | if (cur_pos + offset < last_byte) { | ||
4414 | ret = -ENOENT; | ||
4415 | goto out; | ||
4416 | } | ||
4417 | ret = 0; | ||
4418 | out: | ||
4419 | btrfs_free_path(path); | ||
4420 | if (ret) { | ||
4421 | if (exts != *extents) | ||
4422 | kfree(exts); | ||
4423 | } else { | ||
4424 | *extents = exts; | ||
4425 | *nr_extents = nr; | ||
4426 | } | ||
4427 | return ret; | ||
4428 | } | ||
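The reallocation above uses geometric (doubling) growth, which keeps the amortized copy cost per stored extent constant. A userspace equivalent built on realloc (the function name is ours):

#include <stdlib.h>

static void *grow(void *buf, size_t *cap, size_t elem_size)
{
    size_t ncap = *cap ? *cap * 2 : 1;
    void *nbuf = realloc(buf, ncap * elem_size);

    if (!nbuf)
        return NULL;             /* caller still owns and must free buf */
    *cap = ncap;
    return nbuf;
}

The kernel version keeps the caller's original array distinct so that it only frees buffers it allocated itself, which is what the old != *extents and exts != *extents checks are for.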
4429 | |||
4430 | static noinline int replace_one_extent(struct btrfs_trans_handle *trans, | ||
4431 | struct btrfs_root *root, | ||
4432 | struct btrfs_path *path, | ||
4433 | struct btrfs_key *extent_key, | ||
4434 | struct btrfs_key *leaf_key, | ||
4435 | struct btrfs_ref_path *ref_path, | ||
4436 | struct disk_extent *new_extents, | ||
4437 | int nr_extents) | ||
4438 | { | ||
4439 | struct extent_buffer *leaf; | ||
4440 | struct btrfs_file_extent_item *fi; | ||
4441 | struct inode *inode = NULL; | ||
4442 | struct btrfs_key key; | ||
4443 | u64 lock_start = 0; | ||
4444 | u64 lock_end = 0; | ||
4445 | u64 num_bytes; | ||
4446 | u64 ext_offset; | ||
4447 | u64 first_pos; | ||
4448 | u32 nritems; | ||
4449 | int nr_scaned = 0; | ||
4450 | int extent_locked = 0; | ||
4451 | int extent_type; | ||
4452 | int ret; | ||
4453 | |||
4454 | memcpy(&key, leaf_key, sizeof(key)); | ||
4455 | first_pos = INT_LIMIT(loff_t) - extent_key->offset; | ||
4456 | if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { | ||
4457 | if (key.objectid < ref_path->owner_objectid || | ||
4458 | (key.objectid == ref_path->owner_objectid && | ||
4459 | key.type < BTRFS_EXTENT_DATA_KEY)) { | ||
4460 | key.objectid = ref_path->owner_objectid; | ||
4461 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
4462 | key.offset = 0; | ||
4463 | } | ||
4464 | } | ||
4465 | |||
4466 | while (1) { | ||
4467 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | ||
4468 | if (ret < 0) | ||
4469 | goto out; | ||
4470 | |||
4471 | leaf = path->nodes[0]; | ||
4472 | nritems = btrfs_header_nritems(leaf); | ||
4473 | next: | ||
4474 | if (extent_locked && ret > 0) { | ||
4475 | /* | ||
4476 | * the file extent item was modified by someone | ||
4477 | * before the extent got locked. | ||
4478 | */ | ||
4479 | unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, | ||
4480 | lock_end, GFP_NOFS); | ||
4481 | extent_locked = 0; | ||
4482 | } | ||
4483 | |||
4484 | if (path->slots[0] >= nritems) { | ||
4485 | if (++nr_scaned > 2) | ||
4486 | break; | ||
4487 | |||
4488 | BUG_ON(extent_locked); | ||
4489 | ret = btrfs_next_leaf(root, path); | ||
4490 | if (ret < 0) | ||
4491 | goto out; | ||
4492 | if (ret > 0) | ||
4493 | break; | ||
4494 | leaf = path->nodes[0]; | ||
4495 | nritems = btrfs_header_nritems(leaf); | ||
4496 | } | ||
4497 | |||
4498 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
4499 | |||
4500 | if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { | ||
4501 | if ((key.objectid > ref_path->owner_objectid) || | ||
4502 | (key.objectid == ref_path->owner_objectid && | ||
4503 | key.type > BTRFS_EXTENT_DATA_KEY) || | ||
4504 | (key.offset >= first_pos + extent_key->offset)) | ||
4505 | break; | ||
4506 | } | ||
4507 | |||
4508 | if (inode && key.objectid != inode->i_ino) { | ||
4509 | BUG_ON(extent_locked); | ||
4510 | btrfs_release_path(root, path); | ||
4511 | mutex_unlock(&inode->i_mutex); | ||
4512 | iput(inode); | ||
4513 | inode = NULL; | ||
4514 | continue; | ||
4515 | } | ||
4516 | |||
4517 | if (key.type != BTRFS_EXTENT_DATA_KEY) { | ||
4518 | path->slots[0]++; | ||
4519 | ret = 1; | ||
4520 | goto next; | ||
4521 | } | ||
4522 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
4523 | struct btrfs_file_extent_item); | ||
4524 | extent_type = btrfs_file_extent_type(leaf, fi); | ||
4525 | if ((extent_type != BTRFS_FILE_EXTENT_REG && | ||
4526 | extent_type != BTRFS_FILE_EXTENT_PREALLOC) || | ||
4527 | (btrfs_file_extent_disk_bytenr(leaf, fi) != | ||
4528 | extent_key->objectid)) { | ||
4529 | path->slots[0]++; | ||
4530 | ret = 1; | ||
4531 | goto next; | ||
4532 | } | ||
4533 | |||
4534 | num_bytes = btrfs_file_extent_num_bytes(leaf, fi); | ||
4535 | ext_offset = btrfs_file_extent_offset(leaf, fi); | ||
4536 | |||
4537 | if (first_pos > key.offset - ext_offset) | ||
4538 | first_pos = key.offset - ext_offset; | ||
4539 | |||
4540 | if (!extent_locked) { | ||
4541 | lock_start = key.offset; | ||
4542 | lock_end = lock_start + num_bytes - 1; | ||
4543 | } else { | ||
4544 | if (lock_start > key.offset || | ||
4545 | lock_end + 1 < key.offset + num_bytes) { | ||
4546 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
4547 | lock_start, lock_end, GFP_NOFS); | ||
4548 | extent_locked = 0; | ||
4549 | } | ||
4550 | } | ||
4551 | |||
4552 | if (!inode) { | ||
4553 | btrfs_release_path(root, path); | ||
4554 | |||
4555 | inode = btrfs_iget_locked(root->fs_info->sb, | ||
4556 | key.objectid, root); | ||
4557 | if (inode->i_state & I_NEW) { | ||
4558 | BTRFS_I(inode)->root = root; | ||
4559 | BTRFS_I(inode)->location.objectid = | ||
4560 | key.objectid; | ||
4561 | BTRFS_I(inode)->location.type = | ||
4562 | BTRFS_INODE_ITEM_KEY; | ||
4563 | BTRFS_I(inode)->location.offset = 0; | ||
4564 | btrfs_read_locked_inode(inode); | ||
4565 | unlock_new_inode(inode); | ||
4566 | } | ||
4567 | /* | ||
4568 | * some code calls btrfs_commit_transaction while | ||
4569 | * holding the i_mutex, so we can't use mutex_lock | ||
4570 | * here. | ||
4571 | */ | ||
4572 | if (is_bad_inode(inode) || | ||
4573 | !mutex_trylock(&inode->i_mutex)) { | ||
4574 | iput(inode); | ||
4575 | inode = NULL; | ||
4576 | key.offset = (u64)-1; | ||
4577 | goto skip; | ||
4578 | } | ||
4579 | } | ||
4580 | |||
4581 | if (!extent_locked) { | ||
4582 | struct btrfs_ordered_extent *ordered; | ||
4583 | |||
4584 | btrfs_release_path(root, path); | ||
4585 | |||
4586 | lock_extent(&BTRFS_I(inode)->io_tree, lock_start, | ||
4587 | lock_end, GFP_NOFS); | ||
4588 | ordered = btrfs_lookup_first_ordered_extent(inode, | ||
4589 | lock_end); | ||
4590 | if (ordered && | ||
4591 | ordered->file_offset <= lock_end && | ||
4592 | ordered->file_offset + ordered->len > lock_start) { | ||
4593 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
4594 | lock_start, lock_end, GFP_NOFS); | ||
4595 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
4596 | btrfs_put_ordered_extent(ordered); | ||
4597 | key.offset += num_bytes; | ||
4598 | goto skip; | ||
4599 | } | ||
4600 | if (ordered) | ||
4601 | btrfs_put_ordered_extent(ordered); | ||
4602 | |||
4603 | extent_locked = 1; | ||
4604 | continue; | ||
4605 | } | ||
4606 | |||
4607 | if (nr_extents == 1) { | ||
4608 | /* update extent pointer in place */ | ||
4609 | btrfs_set_file_extent_disk_bytenr(leaf, fi, | ||
4610 | new_extents[0].disk_bytenr); | ||
4611 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, | ||
4612 | new_extents[0].disk_num_bytes); | ||
4613 | btrfs_mark_buffer_dirty(leaf); | ||
4614 | |||
4615 | btrfs_drop_extent_cache(inode, key.offset, | ||
4616 | key.offset + num_bytes - 1, 0); | ||
4617 | |||
4618 | ret = btrfs_inc_extent_ref(trans, root, | ||
4619 | new_extents[0].disk_bytenr, | ||
4620 | new_extents[0].disk_num_bytes, | ||
4621 | leaf->start, | ||
4622 | root->root_key.objectid, | ||
4623 | trans->transid, | ||
4624 | key.objectid); | ||
4625 | BUG_ON(ret); | ||
4626 | |||
4627 | ret = btrfs_free_extent(trans, root, | ||
4628 | extent_key->objectid, | ||
4629 | extent_key->offset, | ||
4630 | leaf->start, | ||
4631 | btrfs_header_owner(leaf), | ||
4632 | btrfs_header_generation(leaf), | ||
4633 | key.objectid, 0); | ||
4634 | BUG_ON(ret); | ||
4635 | |||
4636 | btrfs_release_path(root, path); | ||
4637 | key.offset += num_bytes; | ||
4638 | } else { | ||
4639 | BUG_ON(1); | ||
4640 | #if 0 | ||
4641 | u64 alloc_hint; | ||
4642 | u64 extent_len; | ||
4643 | int i; | ||
4644 | /* | ||
4645 | * drop the old extent pointer first, then insert the | ||
4646 | * new pointers one by one | ||
4647 | */ | ||
4648 | btrfs_release_path(root, path); | ||
4649 | ret = btrfs_drop_extents(trans, root, inode, key.offset, | ||
4650 | key.offset + num_bytes, | ||
4651 | key.offset, &alloc_hint); | ||
4652 | BUG_ON(ret); | ||
4653 | |||
4654 | for (i = 0; i < nr_extents; i++) { | ||
4655 | if (ext_offset >= new_extents[i].num_bytes) { | ||
4656 | ext_offset -= new_extents[i].num_bytes; | ||
4657 | continue; | ||
4658 | } | ||
4659 | extent_len = min(new_extents[i].num_bytes - | ||
4660 | ext_offset, num_bytes); | ||
4661 | |||
4662 | ret = btrfs_insert_empty_item(trans, root, | ||
4663 | path, &key, | ||
4664 | sizeof(*fi)); | ||
4665 | BUG_ON(ret); | ||
4666 | |||
4667 | leaf = path->nodes[0]; | ||
4668 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
4669 | struct btrfs_file_extent_item); | ||
4670 | btrfs_set_file_extent_generation(leaf, fi, | ||
4671 | trans->transid); | ||
4672 | btrfs_set_file_extent_type(leaf, fi, | ||
4673 | BTRFS_FILE_EXTENT_REG); | ||
4674 | btrfs_set_file_extent_disk_bytenr(leaf, fi, | ||
4675 | new_extents[i].disk_bytenr); | ||
4676 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, | ||
4677 | new_extents[i].disk_num_bytes); | ||
4678 | btrfs_set_file_extent_ram_bytes(leaf, fi, | ||
4679 | new_extents[i].ram_bytes); | ||
4680 | |||
4681 | btrfs_set_file_extent_compression(leaf, fi, | ||
4682 | new_extents[i].compression); | ||
4683 | btrfs_set_file_extent_encryption(leaf, fi, | ||
4684 | new_extents[i].encryption); | ||
4685 | btrfs_set_file_extent_other_encoding(leaf, fi, | ||
4686 | new_extents[i].other_encoding); | ||
4687 | |||
4688 | btrfs_set_file_extent_num_bytes(leaf, fi, | ||
4689 | extent_len); | ||
4690 | ext_offset += new_extents[i].offset; | ||
4691 | btrfs_set_file_extent_offset(leaf, fi, | ||
4692 | ext_offset); | ||
4693 | btrfs_mark_buffer_dirty(leaf); | ||
4694 | |||
4695 | btrfs_drop_extent_cache(inode, key.offset, | ||
4696 | key.offset + extent_len - 1, 0); | ||
4697 | |||
4698 | ret = btrfs_inc_extent_ref(trans, root, | ||
4699 | new_extents[i].disk_bytenr, | ||
4700 | new_extents[i].disk_num_bytes, | ||
4701 | leaf->start, | ||
4702 | root->root_key.objectid, | ||
4703 | trans->transid, key.objectid); | ||
4704 | BUG_ON(ret); | ||
4705 | btrfs_release_path(root, path); | ||
4706 | |||
4707 | inode_add_bytes(inode, extent_len); | ||
4708 | |||
4709 | ext_offset = 0; | ||
4710 | num_bytes -= extent_len; | ||
4711 | key.offset += extent_len; | ||
4712 | |||
4713 | if (num_bytes == 0) | ||
4714 | break; | ||
4715 | } | ||
4716 | BUG_ON(i >= nr_extents); | ||
4717 | #endif | ||
4718 | } | ||
4719 | |||
4720 | if (extent_locked) { | ||
4721 | unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, | ||
4722 | lock_end, GFP_NOFS); | ||
4723 | extent_locked = 0; | ||
4724 | } | ||
4725 | skip: | ||
4726 | if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && | ||
4727 | key.offset >= first_pos + extent_key->offset) | ||
4728 | break; | ||
4729 | |||
4730 | cond_resched(); | ||
4731 | } | ||
4732 | ret = 0; | ||
4733 | out: | ||
4734 | btrfs_release_path(root, path); | ||
4735 | if (inode) { | ||
4736 | mutex_unlock(&inode->i_mutex); | ||
4737 | if (extent_locked) { | ||
4738 | unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, | ||
4739 | lock_end, GFP_NOFS); | ||
4740 | } | ||
4741 | iput(inode); | ||
4742 | } | ||
4743 | return ret; | ||
4744 | } | ||
4745 | |||
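The mutex_trylock in replace_one_extent above is worth dwelling on: other btrfs paths commit transactions while holding an inode's i_mutex, so blocking on i_mutex from inside a running transaction could deadlock; the relocation code therefore only trylocks and skips the inode when it loses the race. A minimal user-space sketch of the same skip-on-contention idea, assuming pthreads (process_or_skip and struct object are invented for illustration):

#include <pthread.h>
#include <stdio.h>

/* Hypothetical per-object state standing in for a btrfs inode. */
struct object {
	pthread_mutex_t lock;
	int id;
};

/* Try to process one object; skip it if someone else holds the lock,
 * instead of blocking and risking lock-order inversion. Returns 1 if
 * processed, 0 if skipped so the caller can retry later. */
static int process_or_skip(struct object *obj)
{
	if (pthread_mutex_trylock(&obj->lock) != 0)
		return 0;            /* contended: defer, don't deadlock */
	printf("processing object %d\n", obj->id);
	pthread_mutex_unlock(&obj->lock);
	return 1;
}

int main(void)
{
	struct object o = { PTHREAD_MUTEX_INITIALIZER, 42 };
	if (!process_or_skip(&o))
		printf("object %d busy, will retry\n", o.id);
	return 0;
}
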
4746 | int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, | ||
4747 | struct btrfs_root *root, | ||
4748 | struct extent_buffer *buf, u64 orig_start) | ||
4749 | { | ||
4750 | int level; | ||
4751 | int ret; | ||
4752 | |||
4753 | BUG_ON(btrfs_header_generation(buf) != trans->transid); | ||
4754 | BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); | ||
4755 | |||
4756 | level = btrfs_header_level(buf); | ||
4757 | if (level == 0) { | ||
4758 | struct btrfs_leaf_ref *ref; | ||
4759 | struct btrfs_leaf_ref *orig_ref; | ||
4760 | |||
4761 | orig_ref = btrfs_lookup_leaf_ref(root, orig_start); | ||
4762 | if (!orig_ref) | ||
4763 | return -ENOENT; | ||
4764 | |||
4765 | ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); | ||
4766 | if (!ref) { | ||
4767 | btrfs_free_leaf_ref(root, orig_ref); | ||
4768 | return -ENOMEM; | ||
4769 | } | ||
4770 | |||
4771 | ref->nritems = orig_ref->nritems; | ||
4772 | memcpy(ref->extents, orig_ref->extents, | ||
4773 | sizeof(ref->extents[0]) * ref->nritems); | ||
4774 | |||
4775 | btrfs_free_leaf_ref(root, orig_ref); | ||
4776 | |||
4777 | ref->root_gen = trans->transid; | ||
4778 | ref->bytenr = buf->start; | ||
4779 | ref->owner = btrfs_header_owner(buf); | ||
4780 | ref->generation = btrfs_header_generation(buf); | ||
4781 | ret = btrfs_add_leaf_ref(root, ref, 0); | ||
4782 | WARN_ON(ret); | ||
4783 | btrfs_free_leaf_ref(root, ref); | ||
4784 | } | ||
4785 | return 0; | ||
4786 | } | ||
4787 | |||
4788 | static noinline int invalidate_extent_cache(struct btrfs_root *root, | ||
4789 | struct extent_buffer *leaf, | ||
4790 | struct btrfs_block_group_cache *group, | ||
4791 | struct btrfs_root *target_root) | ||
4792 | { | ||
4793 | struct btrfs_key key; | ||
4794 | struct inode *inode = NULL; | ||
4795 | struct btrfs_file_extent_item *fi; | ||
4796 | u64 num_bytes; | ||
4797 | u64 skip_objectid = 0; | ||
4798 | u32 nritems; | ||
4799 | u32 i; | ||
4800 | |||
4801 | nritems = btrfs_header_nritems(leaf); | ||
4802 | for (i = 0; i < nritems; i++) { | ||
4803 | btrfs_item_key_to_cpu(leaf, &key, i); | ||
4804 | if (key.objectid == skip_objectid || | ||
4805 | key.type != BTRFS_EXTENT_DATA_KEY) | ||
4806 | continue; | ||
4807 | fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); | ||
4808 | if (btrfs_file_extent_type(leaf, fi) == | ||
4809 | BTRFS_FILE_EXTENT_INLINE) | ||
4810 | continue; | ||
4811 | if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) | ||
4812 | continue; | ||
4813 | if (!inode || inode->i_ino != key.objectid) { | ||
4814 | iput(inode); | ||
4815 | inode = btrfs_ilookup(target_root->fs_info->sb, | ||
4816 | key.objectid, target_root, 1); | ||
4817 | } | ||
4818 | if (!inode) { | ||
4819 | skip_objectid = key.objectid; | ||
4820 | continue; | ||
4821 | } | ||
4822 | num_bytes = btrfs_file_extent_num_bytes(leaf, fi); | ||
4823 | |||
4824 | lock_extent(&BTRFS_I(inode)->io_tree, key.offset, | ||
4825 | key.offset + num_bytes - 1, GFP_NOFS); | ||
4826 | btrfs_drop_extent_cache(inode, key.offset, | ||
4827 | key.offset + num_bytes - 1, 1); | ||
4828 | unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, | ||
4829 | key.offset + num_bytes - 1, GFP_NOFS); | ||
4830 | cond_resched(); | ||
4831 | } | ||
4832 | iput(inode); | ||
4833 | return 0; | ||
4834 | } | ||
4835 | |||
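invalidate_extent_cache above visits every item in a leaf, but the items for one inode are adjacent, so it caches the last inode it looked up and records failed objectids in skip_objectid to avoid repeating a lookup that cannot succeed. A hedged sketch of that amortized-lookup loop over a sorted item array (expensive_lookup and the item layout are made up):

#include <stddef.h>
#include <stdio.h>

struct item { unsigned long objectid; };

/* Stand-in for an expensive lookup (btrfs_ilookup in the diff);
 * returns NULL for odd ids to model lookup failure. */
static int *expensive_lookup(unsigned long id)
{
	static int dummy;
	return (id & 1) ? NULL : &dummy;
}

int main(void)
{
	struct item items[] = { {2}, {2}, {3}, {3}, {4} };
	unsigned long cached_id = 0, skip_id = 0;
	int *cached = NULL;

	for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
		unsigned long id = items[i].objectid;
		if (id == skip_id)
			continue;           /* lookup already failed */
		if (!cached || cached_id != id) {
			cached = expensive_lookup(id);
			cached_id = id;
			if (!cached) {
				skip_id = id;   /* remember the failure */
				continue;
			}
		}
		printf("work on item %zu of object %lu\n", i, id);
	}
	return 0;
}
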
4836 | static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, | ||
4837 | struct btrfs_root *root, | ||
4838 | struct extent_buffer *leaf, | ||
4839 | struct btrfs_block_group_cache *group, | ||
4840 | struct inode *reloc_inode) | ||
4841 | { | ||
4842 | struct btrfs_key key; | ||
4843 | struct btrfs_key extent_key; | ||
4844 | struct btrfs_file_extent_item *fi; | ||
4845 | struct btrfs_leaf_ref *ref; | ||
4846 | struct disk_extent *new_extent; | ||
4847 | u64 bytenr; | ||
4848 | u64 num_bytes; | ||
4849 | u32 nritems; | ||
4850 | u32 i; | ||
4851 | int ext_index; | ||
4852 | int nr_extent; | ||
4853 | int ret; | ||
4854 | |||
4855 | new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); | ||
4856 | BUG_ON(!new_extent); | ||
4857 | |||
4858 | ref = btrfs_lookup_leaf_ref(root, leaf->start); | ||
4859 | BUG_ON(!ref); | ||
4860 | |||
4861 | ext_index = -1; | ||
4862 | nritems = btrfs_header_nritems(leaf); | ||
4863 | for (i = 0; i < nritems; i++) { | ||
4864 | btrfs_item_key_to_cpu(leaf, &key, i); | ||
4865 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
4866 | continue; | ||
4867 | fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); | ||
4868 | if (btrfs_file_extent_type(leaf, fi) == | ||
4869 | BTRFS_FILE_EXTENT_INLINE) | ||
4870 | continue; | ||
4871 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
4872 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); | ||
4873 | if (bytenr == 0) | ||
4874 | continue; | ||
4875 | |||
4876 | ext_index++; | ||
4877 | if (bytenr >= group->key.objectid + group->key.offset || | ||
4878 | bytenr + num_bytes <= group->key.objectid) | ||
4879 | continue; | ||
4880 | |||
4881 | extent_key.objectid = bytenr; | ||
4882 | extent_key.offset = num_bytes; | ||
4883 | extent_key.type = BTRFS_EXTENT_ITEM_KEY; | ||
4884 | nr_extent = 1; | ||
4885 | ret = get_new_locations(reloc_inode, &extent_key, | ||
4886 | group->key.objectid, 1, | ||
4887 | &new_extent, &nr_extent); | ||
4888 | if (ret > 0) | ||
4889 | continue; | ||
4890 | BUG_ON(ret < 0); | ||
4891 | |||
4892 | BUG_ON(ref->extents[ext_index].bytenr != bytenr); | ||
4893 | BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); | ||
4894 | ref->extents[ext_index].bytenr = new_extent->disk_bytenr; | ||
4895 | ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; | ||
4896 | |||
4897 | btrfs_set_file_extent_disk_bytenr(leaf, fi, | ||
4898 | new_extent->disk_bytenr); | ||
4899 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, | ||
4900 | new_extent->disk_num_bytes); | ||
4901 | btrfs_mark_buffer_dirty(leaf); | ||
4902 | |||
4903 | ret = btrfs_inc_extent_ref(trans, root, | ||
4904 | new_extent->disk_bytenr, | ||
4905 | new_extent->disk_num_bytes, | ||
4906 | leaf->start, | ||
4907 | root->root_key.objectid, | ||
4908 | trans->transid, key.objectid); | ||
4909 | BUG_ON(ret); | ||
4910 | ret = btrfs_free_extent(trans, root, | ||
4911 | bytenr, num_bytes, leaf->start, | ||
4912 | btrfs_header_owner(leaf), | ||
4913 | btrfs_header_generation(leaf), | ||
4914 | key.objectid, 0); | ||
4915 | BUG_ON(ret); | ||
4916 | cond_resched(); | ||
4917 | } | ||
4918 | kfree(new_extent); | ||
4919 | BUG_ON(ext_index + 1 != ref->nritems); | ||
4920 | btrfs_free_leaf_ref(root, ref); | ||
4921 | return 0; | ||
4922 | } | ||
4923 | |||
4924 | int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, | ||
4925 | struct btrfs_root *root) | ||
4926 | { | ||
4927 | struct btrfs_root *reloc_root; | ||
4928 | int ret; | ||
4929 | |||
4930 | if (root->reloc_root) { | ||
4931 | reloc_root = root->reloc_root; | ||
4932 | root->reloc_root = NULL; | ||
4933 | list_add(&reloc_root->dead_list, | ||
4934 | &root->fs_info->dead_reloc_roots); | ||
4935 | |||
4936 | btrfs_set_root_bytenr(&reloc_root->root_item, | ||
4937 | reloc_root->node->start); | ||
4938 | btrfs_set_root_level(&reloc_root->root_item, | ||
4939 | btrfs_header_level(reloc_root->node)); | ||
4940 | memset(&reloc_root->root_item.drop_progress, 0, | ||
4941 | sizeof(struct btrfs_disk_key)); | ||
4942 | reloc_root->root_item.drop_level = 0; | ||
4943 | |||
4944 | ret = btrfs_update_root(trans, root->fs_info->tree_root, | ||
4945 | &reloc_root->root_key, | ||
4946 | &reloc_root->root_item); | ||
4947 | BUG_ON(ret); | ||
4948 | } | ||
4949 | return 0; | ||
4950 | } | ||
4951 | |||
4952 | int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) | ||
4953 | { | ||
4954 | struct btrfs_trans_handle *trans; | ||
4955 | struct btrfs_root *reloc_root; | ||
4956 | struct btrfs_root *prev_root = NULL; | ||
4957 | struct list_head dead_roots; | ||
4958 | int ret; | ||
4959 | unsigned long nr; | ||
4960 | |||
4961 | INIT_LIST_HEAD(&dead_roots); | ||
4962 | list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); | ||
4963 | |||
4964 | while (!list_empty(&dead_roots)) { | ||
4965 | reloc_root = list_entry(dead_roots.prev, | ||
4966 | struct btrfs_root, dead_list); | ||
4967 | list_del_init(&reloc_root->dead_list); | ||
4968 | |||
4969 | BUG_ON(reloc_root->commit_root != NULL); | ||
4970 | while (1) { | ||
4971 | trans = btrfs_join_transaction(root, 1); | ||
4972 | BUG_ON(!trans); | ||
4973 | |||
4974 | mutex_lock(&root->fs_info->drop_mutex); | ||
4975 | ret = btrfs_drop_snapshot(trans, reloc_root); | ||
4976 | if (ret != -EAGAIN) | ||
4977 | break; | ||
4978 | mutex_unlock(&root->fs_info->drop_mutex); | ||
4979 | |||
4980 | nr = trans->blocks_used; | ||
4981 | ret = btrfs_end_transaction(trans, root); | ||
4982 | BUG_ON(ret); | ||
4983 | btrfs_btree_balance_dirty(root, nr); | ||
4984 | } | ||
4985 | |||
4986 | free_extent_buffer(reloc_root->node); | ||
4987 | |||
4988 | ret = btrfs_del_root(trans, root->fs_info->tree_root, | ||
4989 | &reloc_root->root_key); | ||
4990 | BUG_ON(ret); | ||
4991 | mutex_unlock(&root->fs_info->drop_mutex); | ||
4992 | |||
4993 | nr = trans->blocks_used; | ||
4994 | ret = btrfs_end_transaction(trans, root); | ||
4995 | BUG_ON(ret); | ||
4996 | btrfs_btree_balance_dirty(root, nr); | ||
4997 | |||
4998 | kfree(prev_root); | ||
4999 | prev_root = reloc_root; | ||
5000 | } | ||
5001 | if (prev_root) { | ||
5002 | btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); | ||
5003 | kfree(prev_root); | ||
5004 | } | ||
5005 | return 0; | ||
5006 | } | ||
5007 | |||
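btrfs_drop_dead_reloc_roots tears each dead reloc root down a chunk at a time: btrfs_drop_snapshot returns -EAGAIN while work remains, and each chunk runs in its own short transaction so the rest of the filesystem can make progress. A compact model of that chunked-teardown loop (drop_some and its work counter are invented; only the -EAGAIN convention mirrors the diff):

#include <errno.h>
#include <stdio.h>

/* Model a teardown that can only make bounded progress per call,
 * like btrfs_drop_snapshot: returns -EAGAIN until nothing is left. */
static int drop_some(int *remaining)
{
	if (*remaining > 0) {
		*remaining -= 3;          /* bounded work per transaction */
		if (*remaining > 0)
			return -EAGAIN;
	}
	return 0;
}

int main(void)
{
	int remaining = 10;
	int ret;

	do {
		/* each iteration stands for one short transaction */
		ret = drop_some(&remaining);
		printf("pass done, remaining=%d ret=%d\n", remaining, ret);
	} while (ret == -EAGAIN);
	return 0;
}
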
5008 | int btrfs_add_dead_reloc_root(struct btrfs_root *root) | ||
5009 | { | ||
5010 | list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); | ||
5011 | return 0; | ||
5012 | } | ||
5013 | |||
5014 | int btrfs_cleanup_reloc_trees(struct btrfs_root *root) | ||
5015 | { | ||
5016 | struct btrfs_root *reloc_root; | ||
5017 | struct btrfs_trans_handle *trans; | ||
5018 | struct btrfs_key location; | ||
5019 | int found; | ||
5020 | int ret; | ||
5021 | |||
5022 | mutex_lock(&root->fs_info->tree_reloc_mutex); | ||
5023 | ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); | ||
5024 | BUG_ON(ret); | ||
5025 | found = !list_empty(&root->fs_info->dead_reloc_roots); | ||
5026 | mutex_unlock(&root->fs_info->tree_reloc_mutex); | ||
5027 | |||
5028 | if (found) { | ||
5029 | trans = btrfs_start_transaction(root, 1); | ||
5030 | BUG_ON(!trans); | ||
5031 | ret = btrfs_commit_transaction(trans, root); | ||
5032 | BUG_ON(ret); | ||
5033 | } | ||
5034 | |||
5035 | location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; | ||
5036 | location.offset = (u64)-1; | ||
5037 | location.type = BTRFS_ROOT_ITEM_KEY; | ||
5038 | |||
5039 | reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); | ||
5040 | BUG_ON(!reloc_root); | ||
5041 | btrfs_orphan_cleanup(reloc_root); | ||
5042 | return 0; | ||
5043 | } | ||
5044 | |||
5045 | static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, | ||
5046 | struct btrfs_root *root) | ||
5047 | { | ||
5048 | struct btrfs_root *reloc_root; | ||
5049 | struct extent_buffer *eb; | ||
5050 | struct btrfs_root_item *root_item; | ||
5051 | struct btrfs_key root_key; | ||
5052 | int ret; | ||
5053 | |||
5054 | BUG_ON(!root->ref_cows); | ||
5055 | if (root->reloc_root) | ||
5056 | return 0; | ||
5057 | |||
5058 | root_item = kmalloc(sizeof(*root_item), GFP_NOFS); | ||
5059 | BUG_ON(!root_item); | ||
5060 | |||
5061 | ret = btrfs_copy_root(trans, root, root->commit_root, | ||
5062 | &eb, BTRFS_TREE_RELOC_OBJECTID); | ||
5063 | BUG_ON(ret); | ||
5064 | |||
5065 | root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; | ||
5066 | root_key.offset = root->root_key.objectid; | ||
5067 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
5068 | |||
5069 | memcpy(root_item, &root->root_item, sizeof(*root_item)); | ||
5070 | btrfs_set_root_refs(root_item, 0); | ||
5071 | btrfs_set_root_bytenr(root_item, eb->start); | ||
5072 | btrfs_set_root_level(root_item, btrfs_header_level(eb)); | ||
5073 | btrfs_set_root_generation(root_item, trans->transid); | ||
5074 | |||
5075 | btrfs_tree_unlock(eb); | ||
5076 | free_extent_buffer(eb); | ||
5077 | |||
5078 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, | ||
5079 | &root_key, root_item); | ||
5080 | BUG_ON(ret); | ||
5081 | kfree(root_item); | ||
5082 | |||
5083 | reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, | ||
5084 | &root_key); | ||
5085 | BUG_ON(!reloc_root); | ||
5086 | reloc_root->last_trans = trans->transid; | ||
5087 | reloc_root->commit_root = NULL; | ||
5088 | reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; | ||
5089 | |||
5090 | root->reloc_root = reloc_root; | ||
5091 | return 0; | ||
5092 | } | ||
5093 | |||
5094 | /* | ||
5095 | * Core function of space balance. | ||
5096 | * | ||
5097 | * The idea is to use reloc trees to relocate tree blocks in reference | ||
5098 | * counted roots. There is one reloc tree for each subvol, and all | ||
5099 | * reloc trees share the same root key objectid. Reloc trees are | ||
5100 | * snapshots of the latest committed roots of subvols (root->commit_root). | ||
5101 | * | ||
5102 | * To relocate a tree block referenced by a subvol, there are two steps: | ||
5103 | * COW the block through the subvol's reloc tree, then update the block | ||
5104 | * pointer in the subvol to point to the new block. Since all reloc trees | ||
5105 | * share the same root key objectid, special handling for tree blocks | ||
5106 | * owned by them is easy. Once a tree block has been COWed in one reloc | ||
5107 | * tree, we can use the resulting new block directly when the same block | ||
5108 | * needs to be COWed again through another reloc tree. In this way, | ||
5109 | * relocated tree blocks are shared between reloc trees, so they are | ||
5110 | * also shared between subvols. | ||
5111 | */ | ||
5112 | static noinline int relocate_one_path(struct btrfs_trans_handle *trans, | ||
5113 | struct btrfs_root *root, | ||
5114 | struct btrfs_path *path, | ||
5115 | struct btrfs_key *first_key, | ||
5116 | struct btrfs_ref_path *ref_path, | ||
5117 | struct btrfs_block_group_cache *group, | ||
5118 | struct inode *reloc_inode) | ||
5119 | { | ||
5120 | struct btrfs_root *reloc_root; | ||
5121 | struct extent_buffer *eb = NULL; | ||
5122 | struct btrfs_key *keys; | ||
5123 | u64 *nodes; | ||
5124 | int level; | ||
5125 | int shared_level; | ||
5126 | int lowest_level = 0; | ||
5127 | int ret; | ||
5128 | |||
5129 | if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) | ||
5130 | lowest_level = ref_path->owner_objectid; | ||
5131 | |||
5132 | if (!root->ref_cows) { | ||
5133 | path->lowest_level = lowest_level; | ||
5134 | ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); | ||
5135 | BUG_ON(ret < 0); | ||
5136 | path->lowest_level = 0; | ||
5137 | btrfs_release_path(root, path); | ||
5138 | return 0; | ||
5139 | } | ||
5140 | |||
5141 | mutex_lock(&root->fs_info->tree_reloc_mutex); | ||
5142 | ret = init_reloc_tree(trans, root); | ||
5143 | BUG_ON(ret); | ||
5144 | reloc_root = root->reloc_root; | ||
5145 | |||
5146 | shared_level = ref_path->shared_level; | ||
5147 | ref_path->shared_level = BTRFS_MAX_LEVEL - 1; | ||
5148 | |||
5149 | keys = ref_path->node_keys; | ||
5150 | nodes = ref_path->new_nodes; | ||
5151 | memset(&keys[shared_level + 1], 0, | ||
5152 | sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); | ||
5153 | memset(&nodes[shared_level + 1], 0, | ||
5154 | sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); | ||
5155 | |||
5156 | if (nodes[lowest_level] == 0) { | ||
5157 | path->lowest_level = lowest_level; | ||
5158 | ret = btrfs_search_slot(trans, reloc_root, first_key, path, | ||
5159 | 0, 1); | ||
5160 | BUG_ON(ret); | ||
5161 | for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { | ||
5162 | eb = path->nodes[level]; | ||
5163 | if (!eb || eb == reloc_root->node) | ||
5164 | break; | ||
5165 | nodes[level] = eb->start; | ||
5166 | if (level == 0) | ||
5167 | btrfs_item_key_to_cpu(eb, &keys[level], 0); | ||
5168 | else | ||
5169 | btrfs_node_key_to_cpu(eb, &keys[level], 0); | ||
5170 | } | ||
5171 | if (nodes[0] && | ||
5172 | ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { | ||
5173 | eb = path->nodes[0]; | ||
5174 | ret = replace_extents_in_leaf(trans, reloc_root, eb, | ||
5175 | group, reloc_inode); | ||
5176 | BUG_ON(ret); | ||
5177 | } | ||
5178 | btrfs_release_path(reloc_root, path); | ||
5179 | } else { | ||
5180 | ret = btrfs_merge_path(trans, reloc_root, keys, nodes, | ||
5181 | lowest_level); | ||
5182 | BUG_ON(ret); | ||
5183 | } | ||
5184 | |||
5185 | /* | ||
5186 | * replace tree blocks in the fs tree with tree blocks in | ||
5187 | * the reloc tree. | ||
5188 | */ | ||
5189 | ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); | ||
5190 | BUG_ON(ret < 0); | ||
5191 | |||
5192 | if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { | ||
5193 | ret = btrfs_search_slot(trans, reloc_root, first_key, path, | ||
5194 | 0, 0); | ||
5195 | BUG_ON(ret); | ||
5196 | extent_buffer_get(path->nodes[0]); | ||
5197 | eb = path->nodes[0]; | ||
5198 | btrfs_release_path(reloc_root, path); | ||
5199 | ret = invalidate_extent_cache(reloc_root, eb, group, root); | ||
5200 | BUG_ON(ret); | ||
5201 | free_extent_buffer(eb); | ||
5202 | } | ||
5203 | |||
5204 | mutex_unlock(&root->fs_info->tree_reloc_mutex); | ||
5205 | path->lowest_level = 0; | ||
5206 | return 0; | ||
5207 | } | ||
5208 | |||
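The comment above relocate_one_path states the key invariant: because every reloc tree uses the same root key objectid, a block COWed through one reloc tree can be reused as-is when another reloc tree needs to COW the same block. A toy model of that memoized COW, with a plain array as the old-to-new block mapping (everything here is illustrative, not btrfs code):

#include <stdio.h>

#define MAX_BLOCKS 8

/* old block number -> relocated block number; 0 means "not yet COWed" */
static unsigned long new_loc[MAX_BLOCKS];
static unsigned long next_free = 100;

/* COW a shared block at most once; later reloc trees reuse the result. */
static unsigned long cow_block(unsigned long old)
{
	if (new_loc[old] == 0) {
		new_loc[old] = next_free++;   /* first COW allocates */
		printf("block %lu relocated to %lu\n", old, new_loc[old]);
	} else {
		printf("block %lu reuses %lu\n", old, new_loc[old]);
	}
	return new_loc[old];
}

int main(void)
{
	cow_block(3);   /* reloc tree for subvol A COWs block 3 */
	cow_block(3);   /* reloc tree for subvol B shares the result */
	return 0;
}
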
5209 | static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, | ||
5210 | struct btrfs_root *root, | ||
5211 | struct btrfs_path *path, | ||
5212 | struct btrfs_key *first_key, | ||
5213 | struct btrfs_ref_path *ref_path) | ||
5214 | { | ||
5215 | int ret; | ||
5216 | |||
5217 | ret = relocate_one_path(trans, root, path, first_key, | ||
5218 | ref_path, NULL, NULL); | ||
5219 | BUG_ON(ret); | ||
5220 | |||
5221 | if (root == root->fs_info->extent_root) | ||
5222 | btrfs_extent_post_op(trans, root); | ||
5223 | |||
5224 | return 0; | ||
5225 | } | ||
5226 | |||
5227 | static noinline int del_extent_zero(struct btrfs_trans_handle *trans, | ||
5228 | struct btrfs_root *extent_root, | ||
5229 | struct btrfs_path *path, | ||
5230 | struct btrfs_key *extent_key) | ||
5231 | { | ||
5232 | int ret; | ||
5233 | |||
5234 | ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); | ||
5235 | if (ret) | ||
5236 | goto out; | ||
5237 | ret = btrfs_del_item(trans, extent_root, path); | ||
5238 | out: | ||
5239 | btrfs_release_path(extent_root, path); | ||
5240 | return ret; | ||
5241 | } | ||
5242 | |||
5243 | static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, | ||
5244 | struct btrfs_ref_path *ref_path) | ||
5245 | { | ||
5246 | struct btrfs_key root_key; | ||
5247 | |||
5248 | root_key.objectid = ref_path->root_objectid; | ||
5249 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
5250 | if (is_cowonly_root(ref_path->root_objectid)) | ||
5251 | root_key.offset = 0; | ||
5252 | else | ||
5253 | root_key.offset = (u64)-1; | ||
5254 | |||
5255 | return btrfs_read_fs_root_no_name(fs_info, &root_key); | ||
5256 | } | ||
5257 | |||
5258 | static noinline int relocate_one_extent(struct btrfs_root *extent_root, | ||
5259 | struct btrfs_path *path, | ||
5260 | struct btrfs_key *extent_key, | ||
5261 | struct btrfs_block_group_cache *group, | ||
5262 | struct inode *reloc_inode, int pass) | ||
5263 | { | ||
5264 | struct btrfs_trans_handle *trans; | ||
5265 | struct btrfs_root *found_root; | ||
5266 | struct btrfs_ref_path *ref_path = NULL; | ||
5267 | struct disk_extent *new_extents = NULL; | ||
5268 | int nr_extents = 0; | ||
5269 | int loops; | ||
5270 | int ret; | ||
5271 | int level; | ||
5272 | struct btrfs_key first_key; | ||
5273 | u64 prev_block = 0; | ||
5274 | |||
5275 | |||
5276 | trans = btrfs_start_transaction(extent_root, 1); | ||
5277 | BUG_ON(!trans); | ||
5278 | |||
5279 | if (extent_key->objectid == 0) { | ||
5280 | ret = del_extent_zero(trans, extent_root, path, extent_key); | ||
5281 | goto out; | ||
5282 | } | ||
5283 | |||
5284 | ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); | ||
5285 | if (!ref_path) { | ||
5286 | ret = -ENOMEM; | ||
5287 | goto out; | ||
5288 | } | ||
5289 | |||
5290 | for (loops = 0; ; loops++) { | ||
5291 | if (loops == 0) { | ||
5292 | ret = btrfs_first_ref_path(trans, extent_root, ref_path, | ||
5293 | extent_key->objectid); | ||
5294 | } else { | ||
5295 | ret = btrfs_next_ref_path(trans, extent_root, ref_path); | ||
5296 | } | ||
5297 | if (ret < 0) | ||
5298 | goto out; | ||
5299 | if (ret > 0) | ||
5300 | break; | ||
5301 | |||
5302 | if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || | ||
5303 | ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
5304 | continue; | ||
5305 | |||
5306 | found_root = read_ref_root(extent_root->fs_info, ref_path); | ||
5307 | BUG_ON(!found_root); | ||
5308 | /* | ||
5309 | * for reference counted trees, only process reference paths | ||
5310 | * rooted at the latest committed root. | ||
5311 | */ | ||
5312 | if (found_root->ref_cows && | ||
5313 | ref_path->root_generation != found_root->root_key.offset) | ||
5314 | continue; | ||
5315 | |||
5316 | if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { | ||
5317 | if (pass == 0) { | ||
5318 | /* | ||
5319 | * copy data extents to new locations | ||
5320 | */ | ||
5321 | u64 group_start = group->key.objectid; | ||
5322 | ret = relocate_data_extent(reloc_inode, | ||
5323 | extent_key, | ||
5324 | group_start); | ||
5325 | if (ret < 0) | ||
5326 | goto out; | ||
5327 | break; | ||
5328 | } | ||
5329 | level = 0; | ||
5330 | } else { | ||
5331 | level = ref_path->owner_objectid; | ||
5332 | } | ||
5333 | |||
5334 | if (prev_block != ref_path->nodes[level]) { | ||
5335 | struct extent_buffer *eb; | ||
5336 | u64 block_start = ref_path->nodes[level]; | ||
5337 | u64 block_size = btrfs_level_size(found_root, level); | ||
5338 | |||
5339 | eb = read_tree_block(found_root, block_start, | ||
5340 | block_size, 0); | ||
5341 | btrfs_tree_lock(eb); | ||
5342 | BUG_ON(level != btrfs_header_level(eb)); | ||
5343 | |||
5344 | if (level == 0) | ||
5345 | btrfs_item_key_to_cpu(eb, &first_key, 0); | ||
5346 | else | ||
5347 | btrfs_node_key_to_cpu(eb, &first_key, 0); | ||
5348 | |||
5349 | btrfs_tree_unlock(eb); | ||
5350 | free_extent_buffer(eb); | ||
5351 | prev_block = block_start; | ||
5352 | } | ||
5353 | |||
5354 | btrfs_record_root_in_trans(found_root); | ||
5355 | if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { | ||
5356 | /* | ||
5357 | * try to update data extent references while | ||
5358 | * keeping metadata shared between snapshots. | ||
5359 | */ | ||
5360 | if (pass == 1) { | ||
5361 | ret = relocate_one_path(trans, found_root, | ||
5362 | path, &first_key, ref_path, | ||
5363 | group, reloc_inode); | ||
5364 | if (ret < 0) | ||
5365 | goto out; | ||
5366 | continue; | ||
5367 | } | ||
5368 | /* | ||
5369 | * use the fallback method to process the remaining | ||
5370 | * references. | ||
5371 | */ | ||
5372 | if (!new_extents) { | ||
5373 | u64 group_start = group->key.objectid; | ||
5374 | new_extents = kmalloc(sizeof(*new_extents), | ||
5375 | GFP_NOFS); | ||
5376 | nr_extents = 1; | ||
5377 | ret = get_new_locations(reloc_inode, | ||
5378 | extent_key, | ||
5379 | group_start, 1, | ||
5380 | &new_extents, | ||
5381 | &nr_extents); | ||
5382 | if (ret) | ||
5383 | goto out; | ||
5384 | } | ||
5385 | ret = replace_one_extent(trans, found_root, | ||
5386 | path, extent_key, | ||
5387 | &first_key, ref_path, | ||
5388 | new_extents, nr_extents); | ||
5389 | } else { | ||
5390 | ret = relocate_tree_block(trans, found_root, path, | ||
5391 | &first_key, ref_path); | ||
5392 | } | ||
5393 | if (ret < 0) | ||
5394 | goto out; | ||
5395 | } | ||
5396 | ret = 0; | ||
5397 | out: | ||
5398 | btrfs_end_transaction(trans, extent_root); | ||
5399 | kfree(new_extents); | ||
5400 | kfree(ref_path); | ||
5401 | return ret; | ||
5402 | } | ||
5403 | |||
5404 | static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | ||
5405 | { | ||
5406 | u64 num_devices; | ||
5407 | u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | | ||
5408 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; | ||
5409 | |||
5410 | num_devices = root->fs_info->fs_devices->rw_devices; | ||
5411 | if (num_devices == 1) { | ||
5412 | stripped |= BTRFS_BLOCK_GROUP_DUP; | ||
5413 | stripped = flags & ~stripped; | ||
5414 | |||
5415 | /* turn raid0 into single device chunks */ | ||
5416 | if (flags & BTRFS_BLOCK_GROUP_RAID0) | ||
5417 | return stripped; | ||
5418 | |||
5419 | /* turn mirroring into duplication */ | ||
5420 | if (flags & (BTRFS_BLOCK_GROUP_RAID1 | | ||
5421 | BTRFS_BLOCK_GROUP_RAID10)) | ||
5422 | return stripped | BTRFS_BLOCK_GROUP_DUP; | ||
5423 | return flags; | ||
5424 | } else { | ||
5425 | /* they already had raid on here, just return */ | ||
5426 | if (flags & stripped) | ||
5427 | return flags; | ||
5428 | |||
5429 | stripped |= BTRFS_BLOCK_GROUP_DUP; | ||
5430 | stripped = flags & ~stripped; | ||
5431 | |||
5432 | /* switch duplicated blocks with raid1 */ | ||
5433 | if (flags & BTRFS_BLOCK_GROUP_DUP) | ||
5434 | return stripped | BTRFS_BLOCK_GROUP_RAID1; | ||
5435 | |||
5436 | /* turn single device chunks into raid0 */ | ||
5437 | return stripped | BTRFS_BLOCK_GROUP_RAID0; | ||
5438 | } | ||
5439 | return flags; | ||
5440 | } | ||
5441 | |||
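update_block_group_flags is a pure function of the flags and the device count: with one device it degrades RAID0 to single and RAID1/RAID10 to DUP, and with several devices it upgrades DUP to RAID1 and single to RAID0. A standalone sketch with the same bit logic (the flag values are made up for the demo; the structure follows the function above):

#include <stdio.h>

#define BG_RAID0  (1UL << 0)
#define BG_RAID1  (1UL << 1)
#define BG_DUP    (1UL << 2)
#define BG_RAID10 (1UL << 3)

static unsigned long convert_flags(unsigned long flags,
				   unsigned long num_devices)
{
	unsigned long stripped = BG_RAID0 | BG_RAID1 | BG_RAID10;

	if (num_devices == 1) {
		stripped |= BG_DUP;
		stripped = flags & ~stripped;
		if (flags & BG_RAID0)                 /* raid0 -> single */
			return stripped;
		if (flags & (BG_RAID1 | BG_RAID10))   /* mirror -> dup */
			return stripped | BG_DUP;
		return flags;
	}
	if (flags & stripped)                         /* already raid */
		return flags;
	stripped |= BG_DUP;
	stripped = flags & ~stripped;
	if (flags & BG_DUP)                           /* dup -> raid1 */
		return stripped | BG_RAID1;
	return stripped | BG_RAID0;                   /* single -> raid0 */
}

int main(void)
{
	printf("%#lx\n", convert_flags(BG_RAID1, 1));  /* -> DUP */
	printf("%#lx\n", convert_flags(BG_DUP, 2));    /* -> RAID1 */
	return 0;
}
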
5442 | static int __alloc_chunk_for_shrink(struct btrfs_root *root, | ||
5443 | struct btrfs_block_group_cache *shrink_block_group, | ||
5444 | int force) | ||
5445 | { | ||
5446 | struct btrfs_trans_handle *trans; | ||
5447 | u64 new_alloc_flags; | ||
5448 | u64 calc; | ||
5449 | |||
5450 | spin_lock(&shrink_block_group->lock); | ||
5451 | if (btrfs_block_group_used(&shrink_block_group->item) > 0) { | ||
5452 | spin_unlock(&shrink_block_group->lock); | ||
5453 | |||
5454 | trans = btrfs_start_transaction(root, 1); | ||
5455 | spin_lock(&shrink_block_group->lock); | ||
5456 | |||
5457 | new_alloc_flags = update_block_group_flags(root, | ||
5458 | shrink_block_group->flags); | ||
5459 | if (new_alloc_flags != shrink_block_group->flags) { | ||
5460 | calc = | ||
5461 | btrfs_block_group_used(&shrink_block_group->item); | ||
5462 | } else { | ||
5463 | calc = shrink_block_group->key.offset; | ||
5464 | } | ||
5465 | spin_unlock(&shrink_block_group->lock); | ||
5466 | |||
5467 | do_chunk_alloc(trans, root->fs_info->extent_root, | ||
5468 | calc + 2 * 1024 * 1024, new_alloc_flags, force); | ||
5469 | |||
5470 | btrfs_end_transaction(trans, root); | ||
5471 | } else | ||
5472 | spin_unlock(&shrink_block_group->lock); | ||
5473 | return 0; | ||
5474 | } | ||
5475 | |||
5476 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, | ||
5477 | struct btrfs_root *root, | ||
5478 | u64 objectid, u64 size) | ||
5479 | { | ||
5480 | struct btrfs_path *path; | ||
5481 | struct btrfs_inode_item *item; | ||
5482 | struct extent_buffer *leaf; | ||
5483 | int ret; | ||
5484 | |||
5485 | path = btrfs_alloc_path(); | ||
5486 | if (!path) | ||
5487 | return -ENOMEM; | ||
5488 | |||
5489 | ret = btrfs_insert_empty_inode(trans, root, path, objectid); | ||
5490 | if (ret) | ||
5491 | goto out; | ||
5492 | |||
5493 | leaf = path->nodes[0]; | ||
5494 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); | ||
5495 | memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); | ||
5496 | btrfs_set_inode_generation(leaf, item, 1); | ||
5497 | btrfs_set_inode_size(leaf, item, size); | ||
5498 | btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); | ||
5499 | btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); | ||
5500 | btrfs_mark_buffer_dirty(leaf); | ||
5501 | btrfs_release_path(root, path); | ||
5502 | out: | ||
5503 | btrfs_free_path(path); | ||
5504 | return ret; | ||
5505 | } | ||
5506 | |||
5507 | static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, | ||
5508 | struct btrfs_block_group_cache *group) | ||
5509 | { | ||
5510 | struct inode *inode = NULL; | ||
5511 | struct btrfs_trans_handle *trans; | ||
5512 | struct btrfs_root *root; | ||
5513 | struct btrfs_key root_key; | ||
5514 | u64 objectid = BTRFS_FIRST_FREE_OBJECTID; | ||
5515 | int err = 0; | ||
5516 | |||
5517 | root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; | ||
5518 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
5519 | root_key.offset = (u64)-1; | ||
5520 | root = btrfs_read_fs_root_no_name(fs_info, &root_key); | ||
5521 | if (IS_ERR(root)) | ||
5522 | return ERR_CAST(root); | ||
5523 | |||
5524 | trans = btrfs_start_transaction(root, 1); | ||
5525 | BUG_ON(!trans); | ||
5526 | |||
5527 | err = btrfs_find_free_objectid(trans, root, objectid, &objectid); | ||
5528 | if (err) | ||
5529 | goto out; | ||
5530 | |||
5531 | err = __insert_orphan_inode(trans, root, objectid, group->key.offset); | ||
5532 | BUG_ON(err); | ||
5533 | |||
5534 | err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, | ||
5535 | group->key.offset, 0, group->key.offset, | ||
5536 | 0, 0, 0); | ||
5537 | BUG_ON(err); | ||
5538 | |||
5539 | inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); | ||
5540 | if (inode->i_state & I_NEW) { | ||
5541 | BTRFS_I(inode)->root = root; | ||
5542 | BTRFS_I(inode)->location.objectid = objectid; | ||
5543 | BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; | ||
5544 | BTRFS_I(inode)->location.offset = 0; | ||
5545 | btrfs_read_locked_inode(inode); | ||
5546 | unlock_new_inode(inode); | ||
5547 | BUG_ON(is_bad_inode(inode)); | ||
5548 | } else { | ||
5549 | BUG_ON(1); | ||
5550 | } | ||
5551 | BTRFS_I(inode)->index_cnt = group->key.objectid; | ||
5552 | |||
5553 | err = btrfs_orphan_add(trans, inode); | ||
5554 | out: | ||
5555 | btrfs_end_transaction(trans, root); | ||
5556 | if (err) { | ||
5557 | if (inode) | ||
5558 | iput(inode); | ||
5559 | inode = ERR_PTR(err); | ||
5560 | } | ||
5561 | return inode; | ||
5562 | } | ||
5563 | |||
5564 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) | ||
5565 | { | ||
5566 | |||
5567 | struct btrfs_ordered_sum *sums; | ||
5568 | struct btrfs_sector_sum *sector_sum; | ||
5569 | struct btrfs_ordered_extent *ordered; | ||
5570 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
5571 | struct list_head list; | ||
5572 | size_t offset; | ||
5573 | int ret; | ||
5574 | u64 disk_bytenr; | ||
5575 | |||
5576 | INIT_LIST_HEAD(&list); | ||
5577 | |||
5578 | ordered = btrfs_lookup_ordered_extent(inode, file_pos); | ||
5579 | BUG_ON(ordered->file_offset != file_pos || ordered->len != len); | ||
5580 | |||
5581 | disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; | ||
5582 | ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, | ||
5583 | disk_bytenr + len - 1, &list); | ||
5584 | |||
5585 | while (!list_empty(&list)) { | ||
5586 | sums = list_entry(list.next, struct btrfs_ordered_sum, list); | ||
5587 | list_del_init(&sums->list); | ||
5588 | |||
5589 | sector_sum = sums->sums; | ||
5590 | sums->bytenr = ordered->start; | ||
5591 | |||
5592 | offset = 0; | ||
5593 | while (offset < sums->len) { | ||
5594 | sector_sum->bytenr += ordered->start - disk_bytenr; | ||
5595 | sector_sum++; | ||
5596 | offset += root->sectorsize; | ||
5597 | } | ||
5598 | |||
5599 | btrfs_add_ordered_sum(inode, ordered, sums); | ||
5600 | } | ||
5601 | btrfs_put_ordered_extent(ordered); | ||
5602 | return 0; | ||
5603 | } | ||
5604 | |||
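btrfs_reloc_clone_csums never recomputes checksums: the data at the new location is byte-identical, so it looks up the sums covering the old disk range and shifts each per-sector bytenr by the constant delta ordered->start - disk_bytenr. A sketch of that rebase over an array of sector sums (the struct layout is invented for the demo):

#include <stdio.h>

struct sector_sum {
	unsigned long long bytenr;  /* disk location the csum covers */
	unsigned int csum;
};

/* Shift checksum locations from an old disk range to a new one; the
 * checksums themselves are unchanged because the data is unchanged. */
static void rebase_sums(struct sector_sum *sums, int n,
			unsigned long long old_start,
			unsigned long long new_start)
{
	long long delta = (long long)(new_start - old_start);

	for (int i = 0; i < n; i++)
		sums[i].bytenr += delta;
}

int main(void)
{
	struct sector_sum sums[] = {
		{ 4096, 0xaaaa }, { 8192, 0xbbbb },
	};
	rebase_sums(sums, 2, 4096, 1048576);
	for (int i = 0; i < 2; i++)
		printf("csum %#x now at %llu\n", sums[i].csum,
		       sums[i].bytenr);
	return 0;
}
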
5605 | int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) | ||
5606 | { | ||
5607 | struct btrfs_trans_handle *trans; | ||
5608 | struct btrfs_path *path; | ||
5609 | struct btrfs_fs_info *info = root->fs_info; | ||
5610 | struct extent_buffer *leaf; | ||
5611 | struct inode *reloc_inode; | ||
5612 | struct btrfs_block_group_cache *block_group; | ||
5613 | struct btrfs_key key; | ||
5614 | u64 skipped; | ||
5615 | u64 cur_byte; | ||
5616 | u64 total_found; | ||
5617 | u32 nritems; | ||
5618 | int ret; | ||
5619 | int progress; | ||
5620 | int pass = 0; | ||
5621 | |||
5622 | root = root->fs_info->extent_root; | ||
5623 | |||
5624 | block_group = btrfs_lookup_block_group(info, group_start); | ||
5625 | BUG_ON(!block_group); | ||
5626 | |||
5627 | printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", | ||
5628 | (unsigned long long)block_group->key.objectid, | ||
5629 | (unsigned long long)block_group->flags); | ||
5630 | |||
5631 | path = btrfs_alloc_path(); | ||
5632 | BUG_ON(!path); | ||
5633 | |||
5634 | reloc_inode = create_reloc_inode(info, block_group); | ||
5635 | BUG_ON(IS_ERR(reloc_inode)); | ||
5636 | |||
5637 | __alloc_chunk_for_shrink(root, block_group, 1); | ||
5638 | set_block_group_readonly(block_group); | ||
5639 | |||
5640 | btrfs_start_delalloc_inodes(info->tree_root); | ||
5641 | btrfs_wait_ordered_extents(info->tree_root, 0); | ||
5642 | again: | ||
5643 | skipped = 0; | ||
5644 | total_found = 0; | ||
5645 | progress = 0; | ||
5646 | key.objectid = block_group->key.objectid; | ||
5647 | key.offset = 0; | ||
5648 | key.type = 0; | ||
5649 | cur_byte = key.objectid; | ||
5650 | |||
5651 | trans = btrfs_start_transaction(info->tree_root, 1); | ||
5652 | btrfs_commit_transaction(trans, info->tree_root); | ||
5653 | |||
5654 | mutex_lock(&root->fs_info->cleaner_mutex); | ||
5655 | btrfs_clean_old_snapshots(info->tree_root); | ||
5656 | btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); | ||
5657 | mutex_unlock(&root->fs_info->cleaner_mutex); | ||
5658 | |||
5659 | while (1) { | ||
5660 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
5661 | if (ret < 0) | ||
5662 | goto out; | ||
5663 | next: | ||
5664 | leaf = path->nodes[0]; | ||
5665 | nritems = btrfs_header_nritems(leaf); | ||
5666 | if (path->slots[0] >= nritems) { | ||
5667 | ret = btrfs_next_leaf(root, path); | ||
5668 | if (ret < 0) | ||
5669 | goto out; | ||
5670 | if (ret == 1) { | ||
5671 | ret = 0; | ||
5672 | break; | ||
5673 | } | ||
5674 | leaf = path->nodes[0]; | ||
5675 | nritems = btrfs_header_nritems(leaf); | ||
5676 | } | ||
5677 | |||
5678 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
5679 | |||
5680 | if (key.objectid >= block_group->key.objectid + | ||
5681 | block_group->key.offset) | ||
5682 | break; | ||
5683 | |||
5684 | if (progress && need_resched()) { | ||
5685 | btrfs_release_path(root, path); | ||
5686 | cond_resched(); | ||
5687 | progress = 0; | ||
5688 | continue; | ||
5689 | } | ||
5690 | progress = 1; | ||
5691 | |||
5692 | if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || | ||
5693 | key.objectid + key.offset <= cur_byte) { | ||
5694 | path->slots[0]++; | ||
5695 | goto next; | ||
5696 | } | ||
5697 | |||
5698 | total_found++; | ||
5699 | cur_byte = key.objectid + key.offset; | ||
5700 | btrfs_release_path(root, path); | ||
5701 | |||
5702 | __alloc_chunk_for_shrink(root, block_group, 0); | ||
5703 | ret = relocate_one_extent(root, path, &key, block_group, | ||
5704 | reloc_inode, pass); | ||
5705 | BUG_ON(ret < 0); | ||
5706 | if (ret > 0) | ||
5707 | skipped++; | ||
5708 | |||
5709 | key.objectid = cur_byte; | ||
5710 | key.type = 0; | ||
5711 | key.offset = 0; | ||
5712 | } | ||
5713 | |||
5714 | btrfs_release_path(root, path); | ||
5715 | |||
5716 | if (pass == 0) { | ||
5717 | btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); | ||
5718 | invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); | ||
5719 | } | ||
5720 | |||
5721 | if (total_found > 0) { | ||
5722 | printk(KERN_INFO "btrfs found %llu extents in pass %d\n", | ||
5723 | (unsigned long long)total_found, pass); | ||
5724 | pass++; | ||
5725 | if (total_found == skipped && pass > 2) { | ||
5726 | iput(reloc_inode); | ||
5727 | reloc_inode = create_reloc_inode(info, block_group); | ||
5728 | pass = 0; | ||
5729 | } | ||
5730 | goto again; | ||
5731 | } | ||
5732 | |||
5733 | /* delete reloc_inode */ | ||
5734 | iput(reloc_inode); | ||
5735 | |||
5736 | /* unpin extents in this range */ | ||
5737 | trans = btrfs_start_transaction(info->tree_root, 1); | ||
5738 | btrfs_commit_transaction(trans, info->tree_root); | ||
5739 | |||
5740 | spin_lock(&block_group->lock); | ||
5741 | WARN_ON(block_group->pinned > 0); | ||
5742 | WARN_ON(block_group->reserved > 0); | ||
5743 | WARN_ON(btrfs_block_group_used(&block_group->item) > 0); | ||
5744 | spin_unlock(&block_group->lock); | ||
5745 | put_block_group(block_group); | ||
5746 | ret = 0; | ||
5747 | out: | ||
5748 | btrfs_free_path(path); | ||
5749 | return ret; | ||
5750 | } | ||
5751 | |||
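btrfs_relocate_block_group loops over the block group in passes: pass 0 copies data into the reloc inode, pass 1 updates references while keeping metadata shared, later passes fall back to rewriting extents individually, and if a pass skips everything it found, the reloc inode is recreated and the pass counter resets. A skeletal sketch of that pass/retry control flow (find_extents and relocate are placeholders, not btrfs calls):

#include <stdio.h>

/* Placeholders: report how many extents were seen/skipped this pass. */
static int find_extents(int pass) { return pass < 2 ? 4 : 0; }
static int relocate(int pass)     { return pass == 0 ? 1 : 0; }

int main(void)
{
	int pass = 0;

	for (;;) {
		int found = find_extents(pass);
		int skipped = 0;

		for (int i = 0; i < found; i++)
			skipped += relocate(pass);   /* >0 means skipped */

		if (found == 0)
			break;                       /* block group empty */
		printf("found %d extents in pass %d\n", found, pass);
		pass++;
		if (found == skipped && pass > 2)
			pass = 0;                    /* reset, start over */
	}
	return 0;
}
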
5752 | static int find_first_block_group(struct btrfs_root *root, | ||
5753 | struct btrfs_path *path, struct btrfs_key *key) | ||
5754 | { | ||
5755 | int ret = 0; | ||
5756 | struct btrfs_key found_key; | ||
5757 | struct extent_buffer *leaf; | ||
5758 | int slot; | ||
5759 | |||
5760 | ret = btrfs_search_slot(NULL, root, key, path, 0, 0); | ||
5761 | if (ret < 0) | ||
5762 | goto out; | ||
5763 | |||
5764 | while (1) { | ||
5765 | slot = path->slots[0]; | ||
5766 | leaf = path->nodes[0]; | ||
5767 | if (slot >= btrfs_header_nritems(leaf)) { | ||
5768 | ret = btrfs_next_leaf(root, path); | ||
5769 | if (ret == 0) | ||
5770 | continue; | ||
5771 | if (ret < 0) | ||
5772 | goto out; | ||
5773 | break; | ||
5774 | } | ||
5775 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
5776 | |||
5777 | if (found_key.objectid >= key->objectid && | ||
5778 | found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { | ||
5779 | ret = 0; | ||
5780 | goto out; | ||
5781 | } | ||
5782 | path->slots[0]++; | ||
5783 | } | ||
5784 | ret = -ENOENT; | ||
5785 | out: | ||
5786 | return ret; | ||
5787 | } | ||
5788 | |||
5789 | int btrfs_free_block_groups(struct btrfs_fs_info *info) | ||
5790 | { | ||
5791 | struct btrfs_block_group_cache *block_group; | ||
5792 | struct rb_node *n; | ||
5793 | |||
5794 | spin_lock(&info->block_group_cache_lock); | ||
5795 | while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { | ||
5796 | block_group = rb_entry(n, struct btrfs_block_group_cache, | ||
5797 | cache_node); | ||
5798 | rb_erase(&block_group->cache_node, | ||
5799 | &info->block_group_cache_tree); | ||
5800 | spin_unlock(&info->block_group_cache_lock); | ||
5801 | |||
5802 | btrfs_remove_free_space_cache(block_group); | ||
5803 | down_write(&block_group->space_info->groups_sem); | ||
5804 | list_del(&block_group->list); | ||
5805 | up_write(&block_group->space_info->groups_sem); | ||
5806 | |||
5807 | WARN_ON(atomic_read(&block_group->count) != 1); | ||
5808 | kfree(block_group); | ||
5809 | |||
5810 | spin_lock(&info->block_group_cache_lock); | ||
5811 | } | ||
5812 | spin_unlock(&info->block_group_cache_lock); | ||
5813 | return 0; | ||
5814 | } | ||
5815 | |||
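btrfs_free_block_groups drains the cache rbtree with a classic pattern: detach one node under the lock, drop the lock for the heavy per-node teardown, then retake it for the next node, so the spinlock is never held across expensive work. The same shape over a linked list with a pthread mutex (illustrative only):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void drain(void)
{
	pthread_mutex_lock(&list_lock);
	while (head) {
		struct node *n = head;
		head = n->next;              /* detach under the lock */
		pthread_mutex_unlock(&list_lock);

		printf("tearing down %d\n", n->id);  /* heavy work, unlocked */
		free(n);

		pthread_mutex_lock(&list_lock);      /* retake for next */
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->next = head;
		head = n;
	}
	drain();
	return 0;
}
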
5816 | int btrfs_read_block_groups(struct btrfs_root *root) | ||
5817 | { | ||
5818 | struct btrfs_path *path; | ||
5819 | int ret; | ||
5820 | struct btrfs_block_group_cache *cache; | ||
5821 | struct btrfs_fs_info *info = root->fs_info; | ||
5822 | struct btrfs_space_info *space_info; | ||
5823 | struct btrfs_key key; | ||
5824 | struct btrfs_key found_key; | ||
5825 | struct extent_buffer *leaf; | ||
5826 | |||
5827 | root = info->extent_root; | ||
5828 | key.objectid = 0; | ||
5829 | key.offset = 0; | ||
5830 | btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); | ||
5831 | path = btrfs_alloc_path(); | ||
5832 | if (!path) | ||
5833 | return -ENOMEM; | ||
5834 | |||
5835 | while (1) { | ||
5836 | ret = find_first_block_group(root, path, &key); | ||
5837 | if (ret > 0) { | ||
5838 | ret = 0; | ||
5839 | goto error; | ||
5840 | } | ||
5841 | if (ret != 0) | ||
5842 | goto error; | ||
5843 | |||
5844 | leaf = path->nodes[0]; | ||
5845 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
5846 | cache = kzalloc(sizeof(*cache), GFP_NOFS); | ||
5847 | if (!cache) { | ||
5848 | ret = -ENOMEM; | ||
5849 | break; | ||
5850 | } | ||
5851 | |||
5852 | atomic_set(&cache->count, 1); | ||
5853 | spin_lock_init(&cache->lock); | ||
5854 | mutex_init(&cache->alloc_mutex); | ||
5855 | mutex_init(&cache->cache_mutex); | ||
5856 | INIT_LIST_HEAD(&cache->list); | ||
5857 | read_extent_buffer(leaf, &cache->item, | ||
5858 | btrfs_item_ptr_offset(leaf, path->slots[0]), | ||
5859 | sizeof(cache->item)); | ||
5860 | memcpy(&cache->key, &found_key, sizeof(found_key)); | ||
5861 | |||
5862 | key.objectid = found_key.objectid + found_key.offset; | ||
5863 | btrfs_release_path(root, path); | ||
5864 | cache->flags = btrfs_block_group_flags(&cache->item); | ||
5865 | |||
5866 | ret = update_space_info(info, cache->flags, found_key.offset, | ||
5867 | btrfs_block_group_used(&cache->item), | ||
5868 | &space_info); | ||
5869 | BUG_ON(ret); | ||
5870 | cache->space_info = space_info; | ||
5871 | down_write(&space_info->groups_sem); | ||
5872 | list_add_tail(&cache->list, &space_info->block_groups); | ||
5873 | up_write(&space_info->groups_sem); | ||
5874 | |||
5875 | ret = btrfs_add_block_group_cache(root->fs_info, cache); | ||
5876 | BUG_ON(ret); | ||
5877 | |||
5878 | set_avail_alloc_bits(root->fs_info, cache->flags); | ||
5879 | if (btrfs_chunk_readonly(root, cache->key.objectid)) | ||
5880 | set_block_group_readonly(cache); | ||
5881 | } | ||
5882 | ret = 0; | ||
5883 | error: | ||
5884 | btrfs_free_path(path); | ||
5885 | return ret; | ||
5886 | } | ||
5887 | |||
5888 | int btrfs_make_block_group(struct btrfs_trans_handle *trans, | ||
5889 | struct btrfs_root *root, u64 bytes_used, | ||
5890 | u64 type, u64 chunk_objectid, u64 chunk_offset, | ||
5891 | u64 size) | ||
5892 | { | ||
5893 | int ret; | ||
5894 | struct btrfs_root *extent_root; | ||
5895 | struct btrfs_block_group_cache *cache; | ||
5896 | |||
5897 | extent_root = root->fs_info->extent_root; | ||
5898 | |||
5899 | root->fs_info->last_trans_new_blockgroup = trans->transid; | ||
5900 | |||
5901 | cache = kzalloc(sizeof(*cache), GFP_NOFS); | ||
5902 | if (!cache) | ||
5903 | return -ENOMEM; | ||
5904 | |||
5905 | cache->key.objectid = chunk_offset; | ||
5906 | cache->key.offset = size; | ||
5907 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; | ||
5908 | atomic_set(&cache->count, 1); | ||
5909 | spin_lock_init(&cache->lock); | ||
5910 | mutex_init(&cache->alloc_mutex); | ||
5911 | mutex_init(&cache->cache_mutex); | ||
5912 | INIT_LIST_HEAD(&cache->list); | ||
5913 | |||
5914 | btrfs_set_block_group_used(&cache->item, bytes_used); | ||
5915 | btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); | ||
5916 | cache->flags = type; | ||
5917 | btrfs_set_block_group_flags(&cache->item, type); | ||
5918 | |||
5919 | ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, | ||
5920 | &cache->space_info); | ||
5921 | BUG_ON(ret); | ||
5922 | down_write(&cache->space_info->groups_sem); | ||
5923 | list_add_tail(&cache->list, &cache->space_info->block_groups); | ||
5924 | up_write(&cache->space_info->groups_sem); | ||
5925 | |||
5926 | ret = btrfs_add_block_group_cache(root->fs_info, cache); | ||
5927 | BUG_ON(ret); | ||
5928 | |||
5929 | ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, | ||
5930 | sizeof(cache->item)); | ||
5931 | BUG_ON(ret); | ||
5932 | |||
5933 | finish_current_insert(trans, extent_root, 0); | ||
5934 | ret = del_pending_extents(trans, extent_root, 0); | ||
5935 | BUG_ON(ret); | ||
5936 | set_avail_alloc_bits(extent_root->fs_info, type); | ||
5937 | |||
5938 | return 0; | ||
5939 | } | ||
5940 | |||
5941 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | ||
5942 | struct btrfs_root *root, u64 group_start) | ||
5943 | { | ||
5944 | struct btrfs_path *path; | ||
5945 | struct btrfs_block_group_cache *block_group; | ||
5946 | struct btrfs_key key; | ||
5947 | int ret; | ||
5948 | |||
5949 | root = root->fs_info->extent_root; | ||
5950 | |||
5951 | block_group = btrfs_lookup_block_group(root->fs_info, group_start); | ||
5952 | BUG_ON(!block_group); | ||
5953 | BUG_ON(!block_group->ro); | ||
5954 | |||
5955 | memcpy(&key, &block_group->key, sizeof(key)); | ||
5956 | |||
5957 | path = btrfs_alloc_path(); | ||
5958 | BUG_ON(!path); | ||
5959 | |||
5960 | btrfs_remove_free_space_cache(block_group); | ||
5961 | rb_erase(&block_group->cache_node, | ||
5962 | &root->fs_info->block_group_cache_tree); | ||
5963 | down_write(&block_group->space_info->groups_sem); | ||
5964 | list_del(&block_group->list); | ||
5965 | up_write(&block_group->space_info->groups_sem); | ||
5966 | |||
5967 | spin_lock(&block_group->space_info->lock); | ||
5968 | block_group->space_info->total_bytes -= block_group->key.offset; | ||
5969 | block_group->space_info->bytes_readonly -= block_group->key.offset; | ||
5970 | spin_unlock(&block_group->space_info->lock); | ||
5971 | block_group->space_info->full = 0; | ||
5972 | |||
5973 | put_block_group(block_group); | ||
5974 | put_block_group(block_group); | ||
5975 | |||
5976 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
5977 | if (ret > 0) | ||
5978 | ret = -EIO; | ||
5979 | if (ret < 0) | ||
5980 | goto out; | ||
5981 | |||
5982 | ret = btrfs_del_item(trans, root, path); | ||
5983 | out: | ||
5984 | btrfs_free_path(path); | ||
5985 | return ret; | ||
5986 | } | ||
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 000000000000..e086d407f1fa --- /dev/null +++ b/fs/btrfs/extent_io.c | |||
@@ -0,0 +1,3717 @@ | |||
1 | #include <linux/bitops.h> | ||
2 | #include <linux/slab.h> | ||
3 | #include <linux/bio.h> | ||
4 | #include <linux/mm.h> | ||
5 | #include <linux/gfp.h> | ||
6 | #include <linux/pagemap.h> | ||
7 | #include <linux/page-flags.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/spinlock.h> | ||
10 | #include <linux/blkdev.h> | ||
11 | #include <linux/swap.h> | ||
12 | #include <linux/version.h> | ||
13 | #include <linux/writeback.h> | ||
14 | #include <linux/pagevec.h> | ||
15 | #include "extent_io.h" | ||
16 | #include "extent_map.h" | ||
17 | #include "compat.h" | ||
18 | #include "ctree.h" | ||
19 | #include "btrfs_inode.h" | ||
20 | |||
21 | /* temporary define until extent_map moves out of btrfs */ | ||
22 | struct kmem_cache *btrfs_cache_create(const char *name, size_t size, | ||
23 | unsigned long extra_flags, | ||
24 | void (*ctor)(void *, struct kmem_cache *, | ||
25 | unsigned long)); | ||
26 | |||
27 | static struct kmem_cache *extent_state_cache; | ||
28 | static struct kmem_cache *extent_buffer_cache; | ||
29 | |||
30 | static LIST_HEAD(buffers); | ||
31 | static LIST_HEAD(states); | ||
32 | |||
33 | #define LEAK_DEBUG 0 | ||
34 | #if LEAK_DEBUG | ||
35 | static DEFINE_SPINLOCK(leak_lock); | ||
36 | #endif | ||
37 | |||
38 | #define BUFFER_LRU_MAX 64 | ||
39 | |||
40 | struct tree_entry { | ||
41 | u64 start; | ||
42 | u64 end; | ||
43 | struct rb_node rb_node; | ||
44 | }; | ||
45 | |||
46 | struct extent_page_data { | ||
47 | struct bio *bio; | ||
48 | struct extent_io_tree *tree; | ||
49 | get_extent_t *get_extent; | ||
50 | |||
51 | /* tells writepage not to lock the state bits for this range; | ||
52 | * it still does the unlocking | ||
53 | */ | ||
54 | int extent_locked; | ||
55 | }; | ||
56 | |||
57 | int __init extent_io_init(void) | ||
58 | { | ||
59 | extent_state_cache = btrfs_cache_create("extent_state", | ||
60 | sizeof(struct extent_state), 0, | ||
61 | NULL); | ||
62 | if (!extent_state_cache) | ||
63 | return -ENOMEM; | ||
64 | |||
65 | extent_buffer_cache = btrfs_cache_create("extent_buffers", | ||
66 | sizeof(struct extent_buffer), 0, | ||
67 | NULL); | ||
68 | if (!extent_buffer_cache) | ||
69 | goto free_state_cache; | ||
70 | return 0; | ||
71 | |||
72 | free_state_cache: | ||
73 | kmem_cache_destroy(extent_state_cache); | ||
74 | return -ENOMEM; | ||
75 | } | ||
76 | |||
77 | void extent_io_exit(void) | ||
78 | { | ||
79 | struct extent_state *state; | ||
80 | struct extent_buffer *eb; | ||
81 | |||
82 | while (!list_empty(&states)) { | ||
83 | state = list_entry(states.next, struct extent_state, leak_list); | ||
84 | printk(KERN_ERR "btrfs state leak: start %llu end %llu " | ||
85 | "state %lu in tree %p refs %d\n", | ||
86 | (unsigned long long)state->start, | ||
87 | (unsigned long long)state->end, | ||
88 | state->state, state->tree, atomic_read(&state->refs)); | ||
89 | list_del(&state->leak_list); | ||
90 | kmem_cache_free(extent_state_cache, state); | ||
91 | |||
92 | } | ||
93 | |||
94 | while (!list_empty(&buffers)) { | ||
95 | eb = list_entry(buffers.next, struct extent_buffer, leak_list); | ||
96 | printk(KERN_ERR "btrfs buffer leak start %llu len %lu " | ||
97 | "refs %d\n", (unsigned long long)eb->start, | ||
98 | eb->len, atomic_read(&eb->refs)); | ||
99 | list_del(&eb->leak_list); | ||
100 | kmem_cache_free(extent_buffer_cache, eb); | ||
101 | } | ||
102 | if (extent_state_cache) | ||
103 | kmem_cache_destroy(extent_state_cache); | ||
104 | if (extent_buffer_cache) | ||
105 | kmem_cache_destroy(extent_buffer_cache); | ||
106 | } | ||
107 | |||
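The LEAK_DEBUG machinery threads every live extent_state and extent_buffer onto a global list under a spinlock at allocation time, unthreads it on free, and extent_io_exit dumps whatever is left. A user-space analogue using a mutex and an intrusive list (all names invented):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct tracked {
	struct tracked *next, **pprev;   /* intrusive leak-list linkage */
	int id;
};

static pthread_mutex_t leak_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tracked *leak_head;

static struct tracked *alloc_tracked(int id)
{
	struct tracked *t = malloc(sizeof(*t));
	t->id = id;
	pthread_mutex_lock(&leak_lock);
	t->next = leak_head;             /* push onto the live list */
	t->pprev = &leak_head;
	if (leak_head)
		leak_head->pprev = &t->next;
	leak_head = t;
	pthread_mutex_unlock(&leak_lock);
	return t;
}

static void free_tracked(struct tracked *t)
{
	pthread_mutex_lock(&leak_lock);
	*t->pprev = t->next;             /* unlink before freeing */
	if (t->next)
		t->next->pprev = t->pprev;
	pthread_mutex_unlock(&leak_lock);
	free(t);
}

static void report_leaks(void)
{
	for (struct tracked *t = leak_head; t; t = t->next)
		fprintf(stderr, "leak: object %d still live\n", t->id);
}

int main(void)
{
	struct tracked *a = alloc_tracked(1);
	alloc_tracked(2);          /* deliberately leaked */
	free_tracked(a);
	report_leaks();            /* prints object 2 */
	return 0;
}
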
108 | void extent_io_tree_init(struct extent_io_tree *tree, | ||
109 | struct address_space *mapping, gfp_t mask) | ||
110 | { | ||
111 | tree->state.rb_node = NULL; | ||
112 | tree->buffer.rb_node = NULL; | ||
113 | tree->ops = NULL; | ||
114 | tree->dirty_bytes = 0; | ||
115 | spin_lock_init(&tree->lock); | ||
116 | spin_lock_init(&tree->buffer_lock); | ||
117 | tree->mapping = mapping; | ||
118 | } | ||
119 | |||
120 | static struct extent_state *alloc_extent_state(gfp_t mask) | ||
121 | { | ||
122 | struct extent_state *state; | ||
123 | #if LEAK_DEBUG | ||
124 | unsigned long flags; | ||
125 | #endif | ||
126 | |||
127 | state = kmem_cache_alloc(extent_state_cache, mask); | ||
128 | if (!state) | ||
129 | return state; | ||
130 | state->state = 0; | ||
131 | state->private = 0; | ||
132 | state->tree = NULL; | ||
133 | #if LEAK_DEBUG | ||
134 | spin_lock_irqsave(&leak_lock, flags); | ||
135 | list_add(&state->leak_list, &states); | ||
136 | spin_unlock_irqrestore(&leak_lock, flags); | ||
137 | #endif | ||
138 | atomic_set(&state->refs, 1); | ||
139 | init_waitqueue_head(&state->wq); | ||
140 | return state; | ||
141 | } | ||
142 | |||
143 | static void free_extent_state(struct extent_state *state) | ||
144 | { | ||
145 | if (!state) | ||
146 | return; | ||
147 | if (atomic_dec_and_test(&state->refs)) { | ||
148 | #if LEAK_DEBUG | ||
149 | unsigned long flags; | ||
150 | #endif | ||
151 | WARN_ON(state->tree); | ||
152 | #if LEAK_DEBUG | ||
153 | spin_lock_irqsave(&leak_lock, flags); | ||
154 | list_del(&state->leak_list); | ||
155 | spin_unlock_irqrestore(&leak_lock, flags); | ||
156 | #endif | ||
157 | kmem_cache_free(extent_state_cache, state); | ||
158 | } | ||
159 | } | ||
160 | |||
161 | static struct rb_node *tree_insert(struct rb_root *root, u64 offset, | ||
162 | struct rb_node *node) | ||
163 | { | ||
164 | struct rb_node **p = &root->rb_node; | ||
165 | struct rb_node *parent = NULL; | ||
166 | struct tree_entry *entry; | ||
167 | |||
168 | while (*p) { | ||
169 | parent = *p; | ||
170 | entry = rb_entry(parent, struct tree_entry, rb_node); | ||
171 | |||
172 | if (offset < entry->start) | ||
173 | p = &(*p)->rb_left; | ||
174 | else if (offset > entry->end) | ||
175 | p = &(*p)->rb_right; | ||
176 | else | ||
177 | return parent; | ||
178 | } | ||
179 | |||
180 | entry = rb_entry(node, struct tree_entry, rb_node); | ||
181 | rb_link_node(node, parent, p); | ||
182 | rb_insert_color(node, root); | ||
183 | return NULL; | ||
184 | } | ||
185 | |||
186 | static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, | ||
187 | struct rb_node **prev_ret, | ||
188 | struct rb_node **next_ret) | ||
189 | { | ||
190 | struct rb_root *root = &tree->state; | ||
191 | struct rb_node *n = root->rb_node; | ||
192 | struct rb_node *prev = NULL; | ||
193 | struct rb_node *orig_prev = NULL; | ||
194 | struct tree_entry *entry; | ||
195 | struct tree_entry *prev_entry = NULL; | ||
196 | |||
197 | while (n) { | ||
198 | entry = rb_entry(n, struct tree_entry, rb_node); | ||
199 | prev = n; | ||
200 | prev_entry = entry; | ||
201 | |||
202 | if (offset < entry->start) | ||
203 | n = n->rb_left; | ||
204 | else if (offset > entry->end) | ||
205 | n = n->rb_right; | ||
206 | else | ||
207 | return n; | ||
208 | } | ||
209 | |||
210 | if (prev_ret) { | ||
211 | orig_prev = prev; | ||
212 | while (prev && offset > prev_entry->end) { | ||
213 | prev = rb_next(prev); | ||
214 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); | ||
215 | } | ||
216 | *prev_ret = prev; | ||
217 | prev = orig_prev; | ||
218 | } | ||
219 | |||
220 | if (next_ret) { | ||
221 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); | ||
222 | while (prev && offset < prev_entry->start) { | ||
223 | prev = rb_prev(prev); | ||
224 | prev_entry = rb_entry(prev, struct tree_entry, rb_node); | ||
225 | } | ||
226 | *next_ret = prev; | ||
227 | } | ||
228 | return NULL; | ||
229 | } | ||
230 | |||
231 | static inline struct rb_node *tree_search(struct extent_io_tree *tree, | ||
232 | u64 offset) | ||
233 | { | ||
234 | struct rb_node *prev = NULL; | ||
235 | struct rb_node *ret; | ||
236 | |||
237 | ret = __etree_search(tree, offset, &prev, NULL); | ||
238 | if (!ret) | ||
239 | return prev; | ||
240 | return ret; | ||
241 | } | ||
242 | |||
243 | static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, | ||
244 | u64 offset, struct rb_node *node) | ||
245 | { | ||
246 | struct rb_root *root = &tree->buffer; | ||
247 | struct rb_node **p = &root->rb_node; | ||
248 | struct rb_node *parent = NULL; | ||
249 | struct extent_buffer *eb; | ||
250 | |||
251 | while (*p) { | ||
252 | parent = *p; | ||
253 | eb = rb_entry(parent, struct extent_buffer, rb_node); | ||
254 | |||
255 | if (offset < eb->start) | ||
256 | p = &(*p)->rb_left; | ||
257 | else if (offset > eb->start) | ||
258 | p = &(*p)->rb_right; | ||
259 | else | ||
260 | return eb; | ||
261 | } | ||
262 | |||
263 | rb_link_node(node, parent, p); | ||
264 | rb_insert_color(node, root); | ||
265 | return NULL; | ||
266 | } | ||
267 | |||
268 | static struct extent_buffer *buffer_search(struct extent_io_tree *tree, | ||
269 | u64 offset) | ||
270 | { | ||
271 | struct rb_root *root = &tree->buffer; | ||
272 | struct rb_node *n = root->rb_node; | ||
273 | struct extent_buffer *eb; | ||
274 | |||
275 | while (n) { | ||
276 | eb = rb_entry(n, struct extent_buffer, rb_node); | ||
277 | if (offset < eb->start) | ||
278 | n = n->rb_left; | ||
279 | else if (offset > eb->start) | ||
280 | n = n->rb_right; | ||
281 | else | ||
282 | return eb; | ||
283 | } | ||
284 | return NULL; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * utility function to look for merge candidates inside a given range. | ||
289 | * Any extents with matching state are merged together into a single | ||
290 | * extent in the tree. Extents with EXTENT_IOBITS in their state field | ||
291 | * are not merged because the end_io handlers need to be able to do | ||
292 | * operations on them without sleeping (or doing allocations/splits). | ||
293 | * | ||
294 | * This should be called with the tree lock held. | ||
295 | */ | ||
296 | static int merge_state(struct extent_io_tree *tree, | ||
297 | struct extent_state *state) | ||
298 | { | ||
299 | struct extent_state *other; | ||
300 | struct rb_node *other_node; | ||
301 | |||
302 | if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) | ||
303 | return 0; | ||
304 | |||
305 | other_node = rb_prev(&state->rb_node); | ||
306 | if (other_node) { | ||
307 | other = rb_entry(other_node, struct extent_state, rb_node); | ||
308 | if (other->end == state->start - 1 && | ||
309 | other->state == state->state) { | ||
310 | state->start = other->start; | ||
311 | other->tree = NULL; | ||
312 | rb_erase(&other->rb_node, &tree->state); | ||
313 | free_extent_state(other); | ||
314 | } | ||
315 | } | ||
316 | other_node = rb_next(&state->rb_node); | ||
317 | if (other_node) { | ||
318 | other = rb_entry(other_node, struct extent_state, rb_node); | ||
319 | if (other->start == state->end + 1 && | ||
320 | other->state == state->state) { | ||
321 | other->start = state->start; | ||
322 | state->tree = NULL; | ||
323 | rb_erase(&state->rb_node, &tree->state); | ||
324 | free_extent_state(state); | ||
325 | } | ||
326 | } | ||
327 | return 0; | ||
328 | } | ||
329 | |||
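A concrete instance of the merge rule above, with illustrative byte ranges only:

        /*
         * merge_state() example (ranges are inclusive):
         *
         *   before:  [0, 4095] DIRTY    [4096, 8191] DIRTY
         *   after:   [0, 8191] DIRTY
         *
         * the neighbors coalesce only because other->end == state->start - 1
         * and the state fields match exactly; a neighbor holding any of
         * EXTENT_IOBITS or EXTENT_BOUNDARY is never merged.
         */
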
330 | static void set_state_cb(struct extent_io_tree *tree, | ||
331 | struct extent_state *state, | ||
332 | unsigned long bits) | ||
333 | { | ||
334 | if (tree->ops && tree->ops->set_bit_hook) { | ||
335 | tree->ops->set_bit_hook(tree->mapping->host, state->start, | ||
336 | state->end, state->state, bits); | ||
337 | } | ||
338 | } | ||
339 | |||
340 | static void clear_state_cb(struct extent_io_tree *tree, | ||
341 | struct extent_state *state, | ||
342 | unsigned long bits) | ||
343 | { | ||
344 | if (tree->ops && tree->ops->clear_bit_hook) { | ||
345 | tree->ops->clear_bit_hook(tree->mapping->host, state->start, | ||
346 | state->end, state->state, bits); | ||
347 | } | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * insert an extent_state struct into the tree. 'bits' are set on the | ||
352 | * struct before it is inserted. | ||
353 | * | ||
354 | * This may return -EEXIST if the extent is already there, in which case the | ||
355 | * state struct is freed. | ||
356 | * | ||
357 | * The tree lock is not taken internally. This is a utility function and | ||
358 | * probably isn't what you want to call (see set/clear_extent_bit). | ||
359 | */ | ||
360 | static int insert_state(struct extent_io_tree *tree, | ||
361 | struct extent_state *state, u64 start, u64 end, | ||
362 | int bits) | ||
363 | { | ||
364 | struct rb_node *node; | ||
365 | |||
366 | if (end < start) { | ||
367 | printk(KERN_ERR "btrfs end < start %llu %llu\n", | ||
368 | (unsigned long long)end, | ||
369 | (unsigned long long)start); | ||
370 | WARN_ON(1); | ||
371 | } | ||
372 | if (bits & EXTENT_DIRTY) | ||
373 | tree->dirty_bytes += end - start + 1; | ||
374 | set_state_cb(tree, state, bits); | ||
375 | state->state |= bits; | ||
376 | state->start = start; | ||
377 | state->end = end; | ||
378 | node = tree_insert(&tree->state, end, &state->rb_node); | ||
379 | if (node) { | ||
380 | struct extent_state *found; | ||
381 | found = rb_entry(node, struct extent_state, rb_node); | ||
382 | printk(KERN_ERR "btrfs found node %llu %llu on insert of " | ||
383 | "%llu %llu\n", (unsigned long long)found->start, | ||
384 | (unsigned long long)found->end, | ||
385 | (unsigned long long)start, (unsigned long long)end); | ||
386 | free_extent_state(state); | ||
387 | return -EEXIST; | ||
388 | } | ||
389 | state->tree = tree; | ||
390 | merge_state(tree, state); | ||
391 | return 0; | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * split a given extent state struct in two, inserting the preallocated | ||
396 | * struct 'prealloc' as the newly created second half. 'split' indicates an | ||
397 | * offset inside 'orig' where it should be split. | ||
398 | * | ||
399 | * Before calling, | ||
400 | * the tree has 'orig' at [orig->start, orig->end]. After calling, there | ||
401 | * are two extent state structs in the tree: | ||
402 | * prealloc: [orig->start, split - 1] | ||
403 | * orig: [ split, orig->end ] | ||
404 | * | ||
405 | * The tree locks are not taken by this function. They need to be held | ||
406 | * by the caller. | ||
407 | */ | ||
408 | static int split_state(struct extent_io_tree *tree, struct extent_state *orig, | ||
409 | struct extent_state *prealloc, u64 split) | ||
410 | { | ||
411 | struct rb_node *node; | ||
412 | prealloc->start = orig->start; | ||
413 | prealloc->end = split - 1; | ||
414 | prealloc->state = orig->state; | ||
415 | orig->start = split; | ||
416 | |||
417 | node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); | ||
418 | if (node) { | ||
419 | struct extent_state *found; | ||
420 | found = rb_entry(node, struct extent_state, rb_node); | ||
421 | free_extent_state(prealloc); | ||
422 | return -EEXIST; | ||
423 | } | ||
424 | prealloc->tree = tree; | ||
425 | return 0; | ||
426 | } | ||
427 | |||
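With illustrative numbers, the before/after picture described in the comment above is:

        /*
         * split_state() example: splitting orig == [0, 8191] at split == 4096
         *
         *   before:  orig     = [0, 8191]
         *   after:   prealloc = [0, 4095]     (inherits orig's state bits)
         *            orig     = [4096, 8191]
         */
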
428 | /* | ||
429 | * utility function to clear some bits in an extent state struct. | ||
430 | * it will optionally wake up anyone waiting on this state (wake == 1), or | ||
431 | * forcibly remove the state from the tree (delete == 1). | ||
432 | * | ||
433 | * If no bits are set on the state struct after clearing things, the | ||
434 | * struct is freed and removed from the tree | ||
435 | */ | ||
436 | static int clear_state_bit(struct extent_io_tree *tree, | ||
437 | struct extent_state *state, int bits, int wake, | ||
438 | int delete) | ||
439 | { | ||
440 | int ret = state->state & bits; | ||
441 | |||
442 | if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { | ||
443 | u64 range = state->end - state->start + 1; | ||
444 | WARN_ON(range > tree->dirty_bytes); | ||
445 | tree->dirty_bytes -= range; | ||
446 | } | ||
447 | clear_state_cb(tree, state, bits); | ||
448 | state->state &= ~bits; | ||
449 | if (wake) | ||
450 | wake_up(&state->wq); | ||
451 | if (delete || state->state == 0) { | ||
452 | if (state->tree) { | ||
453 | clear_state_cb(tree, state, state->state); | ||
454 | rb_erase(&state->rb_node, &tree->state); | ||
455 | state->tree = NULL; | ||
456 | free_extent_state(state); | ||
457 | } else { | ||
458 | WARN_ON(1); | ||
459 | } | ||
460 | } else { | ||
461 | merge_state(tree, state); | ||
462 | } | ||
463 | return ret; | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * clear some bits on a range in the tree. This may require splitting | ||
468 | * or inserting elements in the tree, so the gfp mask is used to | ||
469 | * indicate which allocations or sleeping are allowed. | ||
470 | * | ||
471 | * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove | ||
472 | * the given range from the tree regardless of state (ie for truncate). | ||
473 | * | ||
474 | * the range [start, end] is inclusive. | ||
475 | * | ||
476 | * This takes the tree lock, and returns < 0 on error, > 0 if any of the | ||
477 | * bits were already set, or zero if none of the bits were already set. | ||
478 | */ | ||
479 | int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | ||
480 | int bits, int wake, int delete, gfp_t mask) | ||
481 | { | ||
482 | struct extent_state *state; | ||
483 | struct extent_state *prealloc = NULL; | ||
484 | struct rb_node *node; | ||
485 | int err; | ||
486 | int set = 0; | ||
487 | |||
488 | again: | ||
489 | if (!prealloc && (mask & __GFP_WAIT)) { | ||
490 | prealloc = alloc_extent_state(mask); | ||
491 | if (!prealloc) | ||
492 | return -ENOMEM; | ||
493 | } | ||
494 | |||
495 | spin_lock(&tree->lock); | ||
496 | /* | ||
497 | * this search will find the extents that end after | ||
498 | * our range starts | ||
499 | */ | ||
500 | node = tree_search(tree, start); | ||
501 | if (!node) | ||
502 | goto out; | ||
503 | state = rb_entry(node, struct extent_state, rb_node); | ||
504 | if (state->start > end) | ||
505 | goto out; | ||
506 | WARN_ON(state->end < start); | ||
507 | |||
508 | /* | ||
509 | * | ---- desired range ---- | | ||
510 | * | state | or | ||
511 | * | ------------- state -------------- | | ||
512 | * | ||
513 | * We need to split the extent we found, and may flip | ||
514 | * bits on the second half. | ||
515 | * | ||
516 | * If the extent we found extends past our range, we | ||
517 | * just split and search again. It'll get split again | ||
518 | * the next time though. | ||
519 | * | ||
520 | * If the extent we found is inside our range, we clear | ||
521 | * the desired bit on it. | ||
522 | */ | ||
523 | |||
524 | if (state->start < start) { | ||
525 | if (!prealloc) | ||
526 | prealloc = alloc_extent_state(GFP_ATOMIC); | ||
527 | err = split_state(tree, state, prealloc, start); | ||
528 | BUG_ON(err == -EEXIST); | ||
529 | prealloc = NULL; | ||
530 | if (err) | ||
531 | goto out; | ||
532 | if (state->end <= end) { | ||
533 | start = state->end + 1; | ||
534 | set |= clear_state_bit(tree, state, bits, | ||
535 | wake, delete); | ||
536 | } else { | ||
537 | start = state->start; | ||
538 | } | ||
539 | goto search_again; | ||
540 | } | ||
541 | /* | ||
542 | * | ---- desired range ---- | | ||
543 | * | state | | ||
544 | * We need to split the extent, and clear the bit | ||
545 | * on the first half | ||
546 | */ | ||
547 | if (state->start <= end && state->end > end) { | ||
548 | if (!prealloc) | ||
549 | prealloc = alloc_extent_state(GFP_ATOMIC); | ||
550 | err = split_state(tree, state, prealloc, end + 1); | ||
551 | BUG_ON(err == -EEXIST); | ||
552 | |||
553 | if (wake) | ||
554 | wake_up(&state->wq); | ||
555 | set |= clear_state_bit(tree, prealloc, bits, | ||
556 | wake, delete); | ||
557 | prealloc = NULL; | ||
558 | goto out; | ||
559 | } | ||
560 | |||
561 | start = state->end + 1; | ||
562 | set |= clear_state_bit(tree, state, bits, wake, delete); | ||
563 | goto search_again; | ||
564 | |||
565 | out: | ||
566 | spin_unlock(&tree->lock); | ||
567 | if (prealloc) | ||
568 | free_extent_state(prealloc); | ||
569 | |||
570 | return set; | ||
571 | |||
572 | search_again: | ||
573 | if (start > end) | ||
574 | goto out; | ||
575 | spin_unlock(&tree->lock); | ||
576 | if (mask & __GFP_WAIT) | ||
577 | cond_resched(); | ||
578 | goto again; | ||
579 | } | ||
580 | |||
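For reference, a minimal hypothetical caller of clear_extent_bit() looks like the sketch below; the wrappers further down package exactly this pattern:

        /* sketch: drop DIRTY and DELALLOC on one page worth of a range,
         * waking any waiters but leaving the state records in the tree */
        static int demo_clear_page_range(struct extent_io_tree *tree, u64 start)
        {
                u64 end = start + PAGE_CACHE_SIZE - 1;

                return clear_extent_bit(tree, start, end,
                                        EXTENT_DIRTY | EXTENT_DELALLOC,
                                        1 /* wake */, 0 /* delete */,
                                        GFP_NOFS);
        }
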
581 | static int wait_on_state(struct extent_io_tree *tree, | ||
582 | struct extent_state *state) | ||
583 | __releases(tree->lock) | ||
584 | __acquires(tree->lock) | ||
585 | { | ||
586 | DEFINE_WAIT(wait); | ||
587 | prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); | ||
588 | spin_unlock(&tree->lock); | ||
589 | schedule(); | ||
590 | spin_lock(&tree->lock); | ||
591 | finish_wait(&state->wq, &wait); | ||
592 | return 0; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * waits for one or more bits to clear on a range in the state tree. | ||
597 | * The range [start, end] is inclusive. | ||
598 | * The tree lock is taken by this function. | ||
599 | */ | ||
600 | int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) | ||
601 | { | ||
602 | struct extent_state *state; | ||
603 | struct rb_node *node; | ||
604 | |||
605 | spin_lock(&tree->lock); | ||
606 | again: | ||
607 | while (1) { | ||
608 | /* | ||
609 | * this search will find all the extents that end after | ||
610 | * our range starts | ||
611 | */ | ||
612 | node = tree_search(tree, start); | ||
613 | if (!node) | ||
614 | break; | ||
615 | |||
616 | state = rb_entry(node, struct extent_state, rb_node); | ||
617 | |||
618 | if (state->start > end) | ||
619 | goto out; | ||
620 | |||
621 | if (state->state & bits) { | ||
622 | start = state->start; | ||
623 | atomic_inc(&state->refs); | ||
624 | wait_on_state(tree, state); | ||
625 | free_extent_state(state); | ||
626 | goto again; | ||
627 | } | ||
628 | start = state->end + 1; | ||
629 | |||
630 | if (start > end) | ||
631 | break; | ||
632 | |||
633 | if (need_resched()) { | ||
634 | spin_unlock(&tree->lock); | ||
635 | cond_resched(); | ||
636 | spin_lock(&tree->lock); | ||
637 | } | ||
638 | } | ||
639 | out: | ||
640 | spin_unlock(&tree->lock); | ||
641 | return 0; | ||
642 | } | ||
643 | |||
644 | static void set_state_bits(struct extent_io_tree *tree, | ||
645 | struct extent_state *state, | ||
646 | int bits) | ||
647 | { | ||
648 | if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { | ||
649 | u64 range = state->end - state->start + 1; | ||
650 | tree->dirty_bytes += range; | ||
651 | } | ||
652 | set_state_cb(tree, state, bits); | ||
653 | state->state |= bits; | ||
654 | } | ||
655 | |||
656 | /* | ||
657 | * set some bits on a range in the tree. This may require allocations | ||
658 | * or sleeping, so the gfp mask is used to indicate what is allowed. | ||
659 | * | ||
660 | * If 'exclusive' == 1, this will fail with -EEXIST if some part of the | ||
661 | * range already has the desired bits set. The start of the existing | ||
662 | * range is returned in failed_start in this case. | ||
663 | * | ||
664 | * [start, end] is inclusive | ||
665 | * This takes the tree lock. | ||
666 | */ | ||
667 | static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | ||
668 | int bits, int exclusive, u64 *failed_start, | ||
669 | gfp_t mask) | ||
670 | { | ||
671 | struct extent_state *state; | ||
672 | struct extent_state *prealloc = NULL; | ||
673 | struct rb_node *node; | ||
674 | int err = 0; | ||
675 | int set; | ||
676 | u64 last_start; | ||
677 | u64 last_end; | ||
678 | again: | ||
679 | if (!prealloc && (mask & __GFP_WAIT)) { | ||
680 | prealloc = alloc_extent_state(mask); | ||
681 | if (!prealloc) | ||
682 | return -ENOMEM; | ||
683 | } | ||
684 | |||
685 | spin_lock(&tree->lock); | ||
686 | /* | ||
687 | * this search will find all the extents that end after | ||
688 | * our range starts. | ||
689 | */ | ||
690 | node = tree_search(tree, start); | ||
691 | if (!node) { | ||
692 | err = insert_state(tree, prealloc, start, end, bits); | ||
693 | prealloc = NULL; | ||
694 | BUG_ON(err == -EEXIST); | ||
695 | goto out; | ||
696 | } | ||
697 | |||
698 | state = rb_entry(node, struct extent_state, rb_node); | ||
699 | last_start = state->start; | ||
700 | last_end = state->end; | ||
701 | |||
702 | /* | ||
703 | * | ---- desired range ---- | | ||
704 | * | state | | ||
705 | * | ||
706 | * Just lock what we found and keep going | ||
707 | */ | ||
708 | if (state->start == start && state->end <= end) { | ||
709 | set = state->state & bits; | ||
710 | if (set && exclusive) { | ||
711 | *failed_start = state->start; | ||
712 | err = -EEXIST; | ||
713 | goto out; | ||
714 | } | ||
715 | set_state_bits(tree, state, bits); | ||
716 | start = state->end + 1; | ||
717 | merge_state(tree, state); | ||
718 | goto search_again; | ||
719 | } | ||
720 | |||
721 | /* | ||
722 | * | ---- desired range ---- | | ||
723 | * | state | | ||
724 | * or | ||
725 | * | ------------- state -------------- | | ||
726 | * | ||
727 | * We need to split the extent we found, and may flip bits on | ||
728 | * the second half. | ||
729 | * | ||
730 | * If the extent we found extends past our | ||
731 | * range, we just split and search again. It'll get split | ||
732 | * again the next time though. | ||
733 | * | ||
734 | * If the extent we found is inside our range, we set the | ||
735 | * desired bit on it. | ||
736 | */ | ||
737 | if (state->start < start) { | ||
738 | set = state->state & bits; | ||
739 | if (exclusive && set) { | ||
740 | *failed_start = start; | ||
741 | err = -EEXIST; | ||
742 | goto out; | ||
743 | } | ||
744 | err = split_state(tree, state, prealloc, start); | ||
745 | BUG_ON(err == -EEXIST); | ||
746 | prealloc = NULL; | ||
747 | if (err) | ||
748 | goto out; | ||
749 | if (state->end <= end) { | ||
750 | set_state_bits(tree, state, bits); | ||
751 | start = state->end + 1; | ||
752 | merge_state(tree, state); | ||
753 | } else { | ||
754 | start = state->start; | ||
755 | } | ||
756 | goto search_again; | ||
757 | } | ||
758 | /* | ||
759 | * | ---- desired range ---- | | ||
760 | * | state | or | state | | ||
761 | * | ||
762 | * There's a hole, we need to insert something in it and | ||
763 | * ignore the extent we found. | ||
764 | */ | ||
765 | if (state->start > start) { | ||
766 | u64 this_end; | ||
767 | if (end < last_start) | ||
768 | this_end = end; | ||
769 | else | ||
770 | this_end = last_start - 1; | ||
771 | err = insert_state(tree, prealloc, start, this_end, | ||
772 | bits); | ||
773 | prealloc = NULL; | ||
774 | BUG_ON(err == -EEXIST); | ||
775 | if (err) | ||
776 | goto out; | ||
777 | start = this_end + 1; | ||
778 | goto search_again; | ||
779 | } | ||
780 | /* | ||
781 | * | ---- desired range ---- | | ||
782 | * | state | | ||
783 | * We need to split the extent, and set the bit | ||
784 | * on the first half | ||
785 | */ | ||
786 | if (state->start <= end && state->end > end) { | ||
787 | set = state->state & bits; | ||
788 | if (exclusive && set) { | ||
789 | *failed_start = start; | ||
790 | err = -EEXIST; | ||
791 | goto out; | ||
792 | } | ||
793 | err = split_state(tree, state, prealloc, end + 1); | ||
794 | BUG_ON(err == -EEXIST); | ||
795 | |||
796 | set_state_bits(tree, prealloc, bits); | ||
797 | merge_state(tree, prealloc); | ||
798 | prealloc = NULL; | ||
799 | goto out; | ||
800 | } | ||
801 | |||
802 | goto search_again; | ||
803 | |||
804 | out: | ||
805 | spin_unlock(&tree->lock); | ||
806 | if (prealloc) | ||
807 | free_extent_state(prealloc); | ||
808 | |||
809 | return err; | ||
810 | |||
811 | search_again: | ||
812 | if (start > end) | ||
813 | goto out; | ||
814 | spin_unlock(&tree->lock); | ||
815 | if (mask & __GFP_WAIT) | ||
816 | cond_resched(); | ||
817 | goto again; | ||
818 | } | ||
819 | |||
820 | /* wrappers around set/clear extent bit */ | ||
821 | int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | ||
822 | gfp_t mask) | ||
823 | { | ||
824 | return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, | ||
825 | mask); | ||
826 | } | ||
827 | |||
828 | int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, | ||
829 | gfp_t mask) | ||
830 | { | ||
831 | return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); | ||
832 | } | ||
833 | |||
834 | int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, | ||
835 | int bits, gfp_t mask) | ||
836 | { | ||
837 | return set_extent_bit(tree, start, end, bits, 0, NULL, | ||
838 | mask); | ||
839 | } | ||
840 | |||
841 | int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, | ||
842 | int bits, gfp_t mask) | ||
843 | { | ||
844 | return clear_extent_bit(tree, start, end, bits, 0, 0, mask); | ||
845 | } | ||
846 | |||
847 | int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, | ||
848 | gfp_t mask) | ||
849 | { | ||
850 | return set_extent_bit(tree, start, end, | ||
851 | EXTENT_DELALLOC | EXTENT_DIRTY, | ||
852 | 0, NULL, mask); | ||
853 | } | ||
854 | |||
855 | int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | ||
856 | gfp_t mask) | ||
857 | { | ||
858 | return clear_extent_bit(tree, start, end, | ||
859 | EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); | ||
860 | } | ||
861 | |||
862 | int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, | ||
863 | gfp_t mask) | ||
864 | { | ||
865 | return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); | ||
866 | } | ||
867 | |||
868 | int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, | ||
869 | gfp_t mask) | ||
870 | { | ||
871 | return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, | ||
872 | mask); | ||
873 | } | ||
874 | |||
875 | static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, | ||
876 | gfp_t mask) | ||
877 | { | ||
878 | return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); | ||
879 | } | ||
880 | |||
881 | int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, | ||
882 | gfp_t mask) | ||
883 | { | ||
884 | return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, | ||
885 | mask); | ||
886 | } | ||
887 | |||
888 | static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, | ||
889 | u64 end, gfp_t mask) | ||
890 | { | ||
891 | return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); | ||
892 | } | ||
893 | |||
894 | static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, | ||
895 | gfp_t mask) | ||
896 | { | ||
897 | return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, | ||
898 | 0, NULL, mask); | ||
899 | } | ||
900 | |||
901 | static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, | ||
902 | u64 end, gfp_t mask) | ||
903 | { | ||
904 | return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); | ||
905 | } | ||
906 | |||
907 | int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) | ||
908 | { | ||
909 | return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * either insert or lock a state struct between start and end. Use mask to | ||
914 | * tell us whether waiting is desired. | ||
915 | */ | ||
916 | int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) | ||
917 | { | ||
918 | int err; | ||
919 | u64 failed_start; | ||
920 | while (1) { | ||
921 | err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, | ||
922 | &failed_start, mask); | ||
923 | if (err == -EEXIST && (mask & __GFP_WAIT)) { | ||
924 | wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); | ||
925 | start = failed_start; | ||
926 | } else { | ||
927 | break; | ||
928 | } | ||
929 | WARN_ON(start > end); | ||
930 | } | ||
931 | return err; | ||
932 | } | ||
933 | |||
934 | int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, | ||
935 | gfp_t mask) | ||
936 | { | ||
937 | int err; | ||
938 | u64 failed_start; | ||
939 | |||
940 | err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, | ||
941 | &failed_start, mask); | ||
942 | if (err == -EEXIST) { | ||
943 | if (failed_start > start) | ||
944 | clear_extent_bit(tree, start, failed_start - 1, | ||
945 | EXTENT_LOCKED, 1, 0, mask); | ||
946 | return 0; | ||
947 | } | ||
948 | return 1; | ||
949 | } | ||
950 | |||
951 | int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, | ||
952 | gfp_t mask) | ||
953 | { | ||
954 | return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); | ||
955 | } | ||
956 | |||
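Putting the locking primitives together, the expected call pattern is the usual lock/operate/unlock sequence. This is a sketch rather than code from this file; try_lock_extent() is the non-blocking variant, returning 1 on success and 0 (after undoing any partial lock) if part of the range was already locked:

        /* sketch: lock a byte range, operate on it, then unlock it */
        static void demo_locked_region(struct extent_io_tree *tree,
                                       u64 start, u64 end)
        {
                lock_extent(tree, start, end, GFP_NOFS);   /* may sleep */
                /* ... [start, end] is now EXTENT_LOCKED against others ... */
                unlock_extent(tree, start, end, GFP_NOFS);
        }
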
957 | /* | ||
958 | * helper function to set pages and extents in the tree dirty | ||
959 | */ | ||
960 | int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) | ||
961 | { | ||
962 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
963 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
964 | struct page *page; | ||
965 | |||
966 | while (index <= end_index) { | ||
967 | page = find_get_page(tree->mapping, index); | ||
968 | BUG_ON(!page); | ||
969 | __set_page_dirty_nobuffers(page); | ||
970 | page_cache_release(page); | ||
971 | index++; | ||
972 | } | ||
973 | set_extent_dirty(tree, start, end, GFP_NOFS); | ||
974 | return 0; | ||
975 | } | ||
976 | |||
977 | /* | ||
978 | * helper function to set both pages and extents in the tree writeback | ||
979 | */ | ||
980 | static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) | ||
981 | { | ||
982 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
983 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
984 | struct page *page; | ||
985 | |||
986 | while (index <= end_index) { | ||
987 | page = find_get_page(tree->mapping, index); | ||
988 | BUG_ON(!page); | ||
989 | set_page_writeback(page); | ||
990 | page_cache_release(page); | ||
991 | index++; | ||
992 | } | ||
993 | set_extent_writeback(tree, start, end, GFP_NOFS); | ||
994 | return 0; | ||
995 | } | ||
996 | |||
997 | /* | ||
998 | * find the first offset in the io tree with 'bits' set. zero is | ||
999 | * returned if we find something, and *start_ret and *end_ret are | ||
1000 | * set to reflect the state struct that was found. | ||
1001 | * | ||
1002 | * If nothing was found, 1 is returned; < 0 on error. | ||
1003 | */ | ||
1004 | int find_first_extent_bit(struct extent_io_tree *tree, u64 start, | ||
1005 | u64 *start_ret, u64 *end_ret, int bits) | ||
1006 | { | ||
1007 | struct rb_node *node; | ||
1008 | struct extent_state *state; | ||
1009 | int ret = 1; | ||
1010 | |||
1011 | spin_lock(&tree->lock); | ||
1012 | /* | ||
1013 | * this search will find all the extents that end after | ||
1014 | * our range starts. | ||
1015 | */ | ||
1016 | node = tree_search(tree, start); | ||
1017 | if (!node) | ||
1018 | goto out; | ||
1019 | |||
1020 | while (1) { | ||
1021 | state = rb_entry(node, struct extent_state, rb_node); | ||
1022 | if (state->end >= start && (state->state & bits)) { | ||
1023 | *start_ret = state->start; | ||
1024 | *end_ret = state->end; | ||
1025 | ret = 0; | ||
1026 | break; | ||
1027 | } | ||
1028 | node = rb_next(node); | ||
1029 | if (!node) | ||
1030 | break; | ||
1031 | } | ||
1032 | out: | ||
1033 | spin_unlock(&tree->lock); | ||
1034 | return ret; | ||
1035 | } | ||
1036 | |||
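Because find_first_extent_bit() returns zero on success, callers typically iterate as in this hypothetical sketch:

        /* sketch: visit every range at or after 'start' with DIRTY set */
        static void demo_walk_dirty(struct extent_io_tree *tree, u64 start)
        {
                u64 range_start;
                u64 range_end;

                while (!find_first_extent_bit(tree, start, &range_start,
                                              &range_end, EXTENT_DIRTY)) {
                        /* [range_start, range_end] has EXTENT_DIRTY set */
                        if (range_end == (u64)-1)
                                break;  /* don't wrap past the end */
                        start = range_end + 1;
                }
        }
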
1037 | /* find the first state struct with 'bits' set after 'start', and | ||
1038 | * return it. tree->lock must be held. NULL will be returned if | ||
1039 | * nothing was found after 'start' | ||
1040 | */ | ||
1041 | struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, | ||
1042 | u64 start, int bits) | ||
1043 | { | ||
1044 | struct rb_node *node; | ||
1045 | struct extent_state *state; | ||
1046 | |||
1047 | /* | ||
1048 | * this search will find all the extents that end after | ||
1049 | * our range starts. | ||
1050 | */ | ||
1051 | node = tree_search(tree, start); | ||
1052 | if (!node) | ||
1053 | goto out; | ||
1054 | |||
1055 | while (1) { | ||
1056 | state = rb_entry(node, struct extent_state, rb_node); | ||
1057 | if (state->end >= start && (state->state & bits)) | ||
1058 | return state; | ||
1059 | |||
1060 | node = rb_next(node); | ||
1061 | if (!node) | ||
1062 | break; | ||
1063 | } | ||
1064 | out: | ||
1065 | return NULL; | ||
1066 | } | ||
1067 | |||
1068 | /* | ||
1069 | * find a contiguous range of bytes in the file marked as delalloc, not | ||
1070 | * more than 'max_bytes'. start and end are used to return the range. | ||
1071 | * | ||
1072 | * 1 is returned if we find something, 0 if nothing was in the tree | ||
1073 | */ | ||
1074 | static noinline u64 find_delalloc_range(struct extent_io_tree *tree, | ||
1075 | u64 *start, u64 *end, u64 max_bytes) | ||
1076 | { | ||
1077 | struct rb_node *node; | ||
1078 | struct extent_state *state; | ||
1079 | u64 cur_start = *start; | ||
1080 | u64 found = 0; | ||
1081 | u64 total_bytes = 0; | ||
1082 | |||
1083 | spin_lock(&tree->lock); | ||
1084 | |||
1085 | /* | ||
1086 | * this search will find all the extents that end after | ||
1087 | * our range starts. | ||
1088 | */ | ||
1089 | node = tree_search(tree, cur_start); | ||
1090 | if (!node) { | ||
1091 | if (!found) | ||
1092 | *end = (u64)-1; | ||
1093 | goto out; | ||
1094 | } | ||
1095 | |||
1096 | while (1) { | ||
1097 | state = rb_entry(node, struct extent_state, rb_node); | ||
1098 | if (found && (state->start != cur_start || | ||
1099 | (state->state & EXTENT_BOUNDARY))) { | ||
1100 | goto out; | ||
1101 | } | ||
1102 | if (!(state->state & EXTENT_DELALLOC)) { | ||
1103 | if (!found) | ||
1104 | *end = state->end; | ||
1105 | goto out; | ||
1106 | } | ||
1107 | if (!found) | ||
1108 | *start = state->start; | ||
1109 | found++; | ||
1110 | *end = state->end; | ||
1111 | cur_start = state->end + 1; | ||
1112 | node = rb_next(node); | ||
1113 | if (!node) | ||
1114 | break; | ||
1115 | total_bytes += state->end - state->start + 1; | ||
1116 | if (total_bytes >= max_bytes) | ||
1117 | break; | ||
1118 | } | ||
1119 | out: | ||
1120 | spin_unlock(&tree->lock); | ||
1121 | return found; | ||
1122 | } | ||
1123 | |||
1124 | static noinline int __unlock_for_delalloc(struct inode *inode, | ||
1125 | struct page *locked_page, | ||
1126 | u64 start, u64 end) | ||
1127 | { | ||
1128 | int ret; | ||
1129 | struct page *pages[16]; | ||
1130 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
1131 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
1132 | unsigned long nr_pages = end_index - index + 1; | ||
1133 | int i; | ||
1134 | |||
1135 | if (index == locked_page->index && end_index == index) | ||
1136 | return 0; | ||
1137 | |||
1138 | while (nr_pages > 0) { | ||
1139 | ret = find_get_pages_contig(inode->i_mapping, index, | ||
1140 | min_t(unsigned long, nr_pages, | ||
1141 | ARRAY_SIZE(pages)), pages); | ||
1142 | for (i = 0; i < ret; i++) { | ||
1143 | if (pages[i] != locked_page) | ||
1144 | unlock_page(pages[i]); | ||
1145 | page_cache_release(pages[i]); | ||
1146 | } | ||
1147 | nr_pages -= ret; | ||
1148 | index += ret; | ||
1149 | cond_resched(); | ||
1150 | } | ||
1151 | return 0; | ||
1152 | } | ||
1153 | |||
1154 | static noinline int lock_delalloc_pages(struct inode *inode, | ||
1155 | struct page *locked_page, | ||
1156 | u64 delalloc_start, | ||
1157 | u64 delalloc_end) | ||
1158 | { | ||
1159 | unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; | ||
1160 | unsigned long start_index = index; | ||
1161 | unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; | ||
1162 | unsigned long pages_locked = 0; | ||
1163 | struct page *pages[16]; | ||
1164 | unsigned long nrpages; | ||
1165 | int ret; | ||
1166 | int i; | ||
1167 | |||
1168 | /* the caller is responsible for locking the start index */ | ||
1169 | if (index == locked_page->index && index == end_index) | ||
1170 | return 0; | ||
1171 | |||
1172 | /* skip the page at the start index */ | ||
1173 | nrpages = end_index - index + 1; | ||
1174 | while (nrpages > 0) { | ||
1175 | ret = find_get_pages_contig(inode->i_mapping, index, | ||
1176 | min_t(unsigned long, | ||
1177 | nrpages, ARRAY_SIZE(pages)), pages); | ||
1178 | if (ret == 0) { | ||
1179 | ret = -EAGAIN; | ||
1180 | goto done; | ||
1181 | } | ||
1182 | /* now we have an array of pages, lock them all */ | ||
1183 | for (i = 0; i < ret; i++) { | ||
1184 | /* | ||
1185 | * the caller is taking responsibility for | ||
1186 | * locked_page | ||
1187 | */ | ||
1188 | if (pages[i] != locked_page) { | ||
1189 | lock_page(pages[i]); | ||
1190 | if (!PageDirty(pages[i]) || | ||
1191 | pages[i]->mapping != inode->i_mapping) { | ||
1192 | ret = -EAGAIN; | ||
1193 | unlock_page(pages[i]); | ||
1194 | page_cache_release(pages[i]); | ||
1195 | goto done; | ||
1196 | } | ||
1197 | } | ||
1198 | page_cache_release(pages[i]); | ||
1199 | pages_locked++; | ||
1200 | } | ||
1201 | nrpages -= ret; | ||
1202 | index += ret; | ||
1203 | cond_resched(); | ||
1204 | } | ||
1205 | ret = 0; | ||
1206 | done: | ||
1207 | if (ret && pages_locked) { | ||
1208 | __unlock_for_delalloc(inode, locked_page, | ||
1209 | delalloc_start, | ||
1210 | ((u64)(start_index + pages_locked - 1)) << | ||
1211 | PAGE_CACHE_SHIFT); | ||
1212 | } | ||
1213 | return ret; | ||
1214 | } | ||
1215 | |||
1216 | /* | ||
1217 | * find a contiguous range of bytes in the file marked as delalloc, not | ||
1218 | * more than 'max_bytes'. start and end are used to return the range. | ||
1219 | * | ||
1220 | * 1 is returned if we find something, 0 if nothing was in the tree | ||
1221 | */ | ||
1222 | static noinline u64 find_lock_delalloc_range(struct inode *inode, | ||
1223 | struct extent_io_tree *tree, | ||
1224 | struct page *locked_page, | ||
1225 | u64 *start, u64 *end, | ||
1226 | u64 max_bytes) | ||
1227 | { | ||
1228 | u64 delalloc_start; | ||
1229 | u64 delalloc_end; | ||
1230 | u64 found; | ||
1231 | int ret; | ||
1232 | int loops = 0; | ||
1233 | |||
1234 | again: | ||
1235 | /* step one, find a bunch of delalloc bytes starting at start */ | ||
1236 | delalloc_start = *start; | ||
1237 | delalloc_end = 0; | ||
1238 | found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, | ||
1239 | max_bytes); | ||
1240 | if (!found || delalloc_end <= *start) { | ||
1241 | *start = delalloc_start; | ||
1242 | *end = delalloc_end; | ||
1243 | return found; | ||
1244 | } | ||
1245 | |||
1246 | /* | ||
1247 | * start comes from the offset of locked_page. We have to lock | ||
1248 | * pages in order, so we can't process delalloc bytes before | ||
1249 | * locked_page | ||
1250 | */ | ||
1251 | if (delalloc_start < *start) | ||
1252 | delalloc_start = *start; | ||
1253 | |||
1254 | /* | ||
1255 | * make sure to limit the number of pages we try to lock down | ||
1256 | * if we're looping. | ||
1257 | */ | ||
1258 | if (delalloc_end + 1 - delalloc_start > max_bytes && loops) | ||
1259 | delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; | ||
1260 | |||
1261 | /* step two, lock all the pages after the page that has start */ | ||
1262 | ret = lock_delalloc_pages(inode, locked_page, | ||
1263 | delalloc_start, delalloc_end); | ||
1264 | if (ret == -EAGAIN) { | ||
1265 | /* some of the pages are gone, let's avoid looping by | ||
1266 | * shortening the size of the delalloc range we're searching | ||
1267 | */ | ||
1268 | if (!loops) { | ||
1269 | unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); | ||
1270 | max_bytes = PAGE_CACHE_SIZE - offset; | ||
1271 | loops = 1; | ||
1272 | goto again; | ||
1273 | } else { | ||
1274 | found = 0; | ||
1275 | goto out_failed; | ||
1276 | } | ||
1277 | } | ||
1278 | BUG_ON(ret); | ||
1279 | |||
1280 | /* step three, lock the state bits for the whole range */ | ||
1281 | lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); | ||
1282 | |||
1283 | /* then test to make sure it is all still delalloc */ | ||
1284 | ret = test_range_bit(tree, delalloc_start, delalloc_end, | ||
1285 | EXTENT_DELALLOC, 1); | ||
1286 | if (!ret) { | ||
1287 | unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); | ||
1288 | __unlock_for_delalloc(inode, locked_page, | ||
1289 | delalloc_start, delalloc_end); | ||
1290 | cond_resched(); | ||
1291 | goto again; | ||
1292 | } | ||
1293 | *start = delalloc_start; | ||
1294 | *end = delalloc_end; | ||
1295 | out_failed: | ||
1296 | return found; | ||
1297 | } | ||
1298 | |||
1299 | int extent_clear_unlock_delalloc(struct inode *inode, | ||
1300 | struct extent_io_tree *tree, | ||
1301 | u64 start, u64 end, struct page *locked_page, | ||
1302 | int unlock_pages, | ||
1303 | int clear_unlock, | ||
1304 | int clear_delalloc, int clear_dirty, | ||
1305 | int set_writeback, | ||
1306 | int end_writeback) | ||
1307 | { | ||
1308 | int ret; | ||
1309 | struct page *pages[16]; | ||
1310 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
1311 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
1312 | unsigned long nr_pages = end_index - index + 1; | ||
1313 | int i; | ||
1314 | int clear_bits = 0; | ||
1315 | |||
1316 | if (clear_unlock) | ||
1317 | clear_bits |= EXTENT_LOCKED; | ||
1318 | if (clear_dirty) | ||
1319 | clear_bits |= EXTENT_DIRTY; | ||
1320 | |||
1321 | if (clear_delalloc) | ||
1322 | clear_bits |= EXTENT_DELALLOC; | ||
1323 | |||
1324 | clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); | ||
1325 | if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) | ||
1326 | return 0; | ||
1327 | |||
1328 | while (nr_pages > 0) { | ||
1329 | ret = find_get_pages_contig(inode->i_mapping, index, | ||
1330 | min_t(unsigned long, | ||
1331 | nr_pages, ARRAY_SIZE(pages)), pages); | ||
1332 | for (i = 0; i < ret; i++) { | ||
1333 | if (pages[i] == locked_page) { | ||
1334 | page_cache_release(pages[i]); | ||
1335 | continue; | ||
1336 | } | ||
1337 | if (clear_dirty) | ||
1338 | clear_page_dirty_for_io(pages[i]); | ||
1339 | if (set_writeback) | ||
1340 | set_page_writeback(pages[i]); | ||
1341 | if (end_writeback) | ||
1342 | end_page_writeback(pages[i]); | ||
1343 | if (unlock_pages) | ||
1344 | unlock_page(pages[i]); | ||
1345 | page_cache_release(pages[i]); | ||
1346 | } | ||
1347 | nr_pages -= ret; | ||
1348 | index += ret; | ||
1349 | cond_resched(); | ||
1350 | } | ||
1351 | return 0; | ||
1352 | } | ||
1353 | |||
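The long flag list makes call sites hard to read, so here is one hypothetical invocation with every flag annotated; the values chosen mirror the "finish a delalloc range" case:

        /* sketch: unlock extent state and pages, clear delalloc/dirty,
         * and flip the pages into writeback */
        static void demo_finish_delalloc(struct inode *inode,
                                         struct extent_io_tree *tree,
                                         u64 start, u64 end,
                                         struct page *locked_page)
        {
                extent_clear_unlock_delalloc(inode, tree, start, end,
                                             locked_page,
                                             1,   /* unlock_pages */
                                             1,   /* clear_unlock */
                                             1,   /* clear_delalloc */
                                             1,   /* clear_dirty */
                                             1,   /* set_writeback */
                                             0);  /* end_writeback */
        }
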
1354 | /* | ||
1355 | * count the number of bytes in the tree that have a given bit(s) | ||
1356 | * set. This can be fairly slow, except for EXTENT_DIRTY which is | ||
1357 | * cached. The total number found is returned. | ||
1358 | */ | ||
1359 | u64 count_range_bits(struct extent_io_tree *tree, | ||
1360 | u64 *start, u64 search_end, u64 max_bytes, | ||
1361 | unsigned long bits) | ||
1362 | { | ||
1363 | struct rb_node *node; | ||
1364 | struct extent_state *state; | ||
1365 | u64 cur_start = *start; | ||
1366 | u64 total_bytes = 0; | ||
1367 | int found = 0; | ||
1368 | |||
1369 | if (search_end <= cur_start) { | ||
1370 | WARN_ON(1); | ||
1371 | return 0; | ||
1372 | } | ||
1373 | |||
1374 | spin_lock(&tree->lock); | ||
1375 | if (cur_start == 0 && bits == EXTENT_DIRTY) { | ||
1376 | total_bytes = tree->dirty_bytes; | ||
1377 | goto out; | ||
1378 | } | ||
1379 | /* | ||
1380 | * this search will find all the extents that end after | ||
1381 | * our range starts. | ||
1382 | */ | ||
1383 | node = tree_search(tree, cur_start); | ||
1384 | if (!node) | ||
1385 | goto out; | ||
1386 | |||
1387 | while (1) { | ||
1388 | state = rb_entry(node, struct extent_state, rb_node); | ||
1389 | if (state->start > search_end) | ||
1390 | break; | ||
1391 | if (state->end >= cur_start && (state->state & bits)) { | ||
1392 | total_bytes += min(search_end, state->end) + 1 - | ||
1393 | max(cur_start, state->start); | ||
1394 | if (total_bytes >= max_bytes) | ||
1395 | break; | ||
1396 | if (!found) { | ||
1397 | *start = state->start; | ||
1398 | found = 1; | ||
1399 | } | ||
1400 | } | ||
1401 | node = rb_next(node); | ||
1402 | if (!node) | ||
1403 | break; | ||
1404 | } | ||
1405 | out: | ||
1406 | spin_unlock(&tree->lock); | ||
1407 | return total_bytes; | ||
1408 | } | ||
1409 | |||
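One common query, wrapped here in a hypothetical helper, hits the fast path noted above: asking for EXTENT_DIRTY from offset zero is answered straight from tree->dirty_bytes without walking any state records:

        /* sketch: total dirty bytes tracked by the tree */
        static u64 demo_total_dirty(struct extent_io_tree *tree)
        {
                u64 start = 0;

                return count_range_bits(tree, &start, (u64)-1, (u64)-1,
                                        EXTENT_DIRTY);
        }
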
1410 | #if 0 | ||
1411 | /* | ||
1412 | * helper function to lock both pages and extents in the tree. | ||
1413 | * pages must be locked first. | ||
1414 | */ | ||
1415 | static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) | ||
1416 | { | ||
1417 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
1418 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
1419 | struct page *page; | ||
1420 | int err; | ||
1421 | |||
1422 | while (index <= end_index) { | ||
1423 | page = grab_cache_page(tree->mapping, index); | ||
1424 | if (!page) { | ||
1425 | err = -ENOMEM; | ||
1426 | goto failed; | ||
1427 | } | ||
1428 | if (IS_ERR(page)) { | ||
1429 | err = PTR_ERR(page); | ||
1430 | goto failed; | ||
1431 | } | ||
1432 | index++; | ||
1433 | } | ||
1434 | lock_extent(tree, start, end, GFP_NOFS); | ||
1435 | return 0; | ||
1436 | |||
1437 | failed: | ||
1438 | /* | ||
1439 | * we failed above in getting the page at 'index', so we undo here | ||
1440 | * up to but not including the page at 'index' | ||
1441 | */ | ||
1442 | end_index = index; | ||
1443 | index = start >> PAGE_CACHE_SHIFT; | ||
1444 | while (index < end_index) { | ||
1445 | page = find_get_page(tree->mapping, index); | ||
1446 | unlock_page(page); | ||
1447 | page_cache_release(page); | ||
1448 | index++; | ||
1449 | } | ||
1450 | return err; | ||
1451 | } | ||
1452 | |||
1453 | /* | ||
1454 | * helper function to unlock both pages and extents in the tree. | ||
1455 | */ | ||
1456 | static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) | ||
1457 | { | ||
1458 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
1459 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
1460 | struct page *page; | ||
1461 | |||
1462 | while (index <= end_index) { | ||
1463 | page = find_get_page(tree->mapping, index); | ||
1464 | unlock_page(page); | ||
1465 | page_cache_release(page); | ||
1466 | index++; | ||
1467 | } | ||
1468 | unlock_extent(tree, start, end, GFP_NOFS); | ||
1469 | return 0; | ||
1470 | } | ||
1471 | #endif | ||
1472 | |||
1473 | /* | ||
1474 | * set the private field for a given byte offset in the tree. If there isn't | ||
1475 | * an extent_state starting at exactly that offset, -ENOENT is returned. | ||
1476 | */ | ||
1477 | int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) | ||
1478 | { | ||
1479 | struct rb_node *node; | ||
1480 | struct extent_state *state; | ||
1481 | int ret = 0; | ||
1482 | |||
1483 | spin_lock(&tree->lock); | ||
1484 | /* | ||
1485 | * this search will find all the extents that end after | ||
1486 | * our range starts. | ||
1487 | */ | ||
1488 | node = tree_search(tree, start); | ||
1489 | if (!node) { | ||
1490 | ret = -ENOENT; | ||
1491 | goto out; | ||
1492 | } | ||
1493 | state = rb_entry(node, struct extent_state, rb_node); | ||
1494 | if (state->start != start) { | ||
1495 | ret = -ENOENT; | ||
1496 | goto out; | ||
1497 | } | ||
1498 | state->private = private; | ||
1499 | out: | ||
1500 | spin_unlock(&tree->lock); | ||
1501 | return ret; | ||
1502 | } | ||
1503 | |||
1504 | int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) | ||
1505 | { | ||
1506 | struct rb_node *node; | ||
1507 | struct extent_state *state; | ||
1508 | int ret = 0; | ||
1509 | |||
1510 | spin_lock(&tree->lock); | ||
1511 | /* | ||
1512 | * this search will find all the extents that end after | ||
1513 | * our range starts. | ||
1514 | */ | ||
1515 | node = tree_search(tree, start); | ||
1516 | if (!node) { | ||
1517 | ret = -ENOENT; | ||
1518 | goto out; | ||
1519 | } | ||
1520 | state = rb_entry(node, struct extent_state, rb_node); | ||
1521 | if (state->start != start) { | ||
1522 | ret = -ENOENT; | ||
1523 | goto out; | ||
1524 | } | ||
1525 | *private = state->private; | ||
1526 | out: | ||
1527 | spin_unlock(&tree->lock); | ||
1528 | return ret; | ||
1529 | } | ||
1530 | |||
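set_state_private()/get_state_private() form a small key/value facility keyed on an extent's exact start offset (elsewhere in btrfs the value typically carries a per-extent checksum). A hedged sketch with an arbitrary cookie value:

        /* sketch: stash a cookie on the state starting at 'start' and
         * read it back; both calls fail with -ENOENT unless an
         * extent_state begins at exactly that offset */
        static int demo_state_private(struct extent_io_tree *tree, u64 start)
        {
                u64 cookie;
                int ret;

                ret = set_state_private(tree, start, 0xdeadbeefULL);
                if (ret)
                        return ret;
                return get_state_private(tree, start, &cookie);
        }
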
1531 | /* | ||
1532 | * searches a range in the state tree for a given mask. | ||
1533 | * If 'filled' == 1, this returns 1 only if every extent in the tree | ||
1534 | * has the bits set. Otherwise, 1 is returned if any bit in the | ||
1535 | * range is found set. | ||
1536 | */ | ||
1537 | int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, | ||
1538 | int bits, int filled) | ||
1539 | { | ||
1540 | struct extent_state *state = NULL; | ||
1541 | struct rb_node *node; | ||
1542 | int bitset = 0; | ||
1543 | |||
1544 | spin_lock(&tree->lock); | ||
1545 | node = tree_search(tree, start); | ||
1546 | while (node && start <= end) { | ||
1547 | state = rb_entry(node, struct extent_state, rb_node); | ||
1548 | |||
1549 | if (filled && state->start > start) { | ||
1550 | bitset = 0; | ||
1551 | break; | ||
1552 | } | ||
1553 | |||
1554 | if (state->start > end) | ||
1555 | break; | ||
1556 | |||
1557 | if (state->state & bits) { | ||
1558 | bitset = 1; | ||
1559 | if (!filled) | ||
1560 | break; | ||
1561 | } else if (filled) { | ||
1562 | bitset = 0; | ||
1563 | break; | ||
1564 | } | ||
1565 | start = state->end + 1; | ||
1566 | if (start > end) | ||
1567 | break; | ||
1568 | node = rb_next(node); | ||
1569 | if (!node) { | ||
1570 | if (filled) | ||
1571 | bitset = 0; | ||
1572 | break; | ||
1573 | } | ||
1574 | } | ||
1575 | spin_unlock(&tree->lock); | ||
1576 | return bitset; | ||
1577 | } | ||
1578 | |||
1579 | /* | ||
1580 | * helper function to set a given page up to date if all the | ||
1581 | * extents in the tree for that page are up to date | ||
1582 | */ | ||
1583 | static int check_page_uptodate(struct extent_io_tree *tree, | ||
1584 | struct page *page) | ||
1585 | { | ||
1586 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
1587 | u64 end = start + PAGE_CACHE_SIZE - 1; | ||
1588 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) | ||
1589 | SetPageUptodate(page); | ||
1590 | return 0; | ||
1591 | } | ||
1592 | |||
1593 | /* | ||
1594 | * helper function to unlock a page if all the extents in the tree | ||
1595 | * for that page are unlocked | ||
1596 | */ | ||
1597 | static int check_page_locked(struct extent_io_tree *tree, | ||
1598 | struct page *page) | ||
1599 | { | ||
1600 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
1601 | u64 end = start + PAGE_CACHE_SIZE - 1; | ||
1602 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) | ||
1603 | unlock_page(page); | ||
1604 | return 0; | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * helper function to end page writeback if all the extents | ||
1609 | * in the tree for that page are done with writeback | ||
1610 | */ | ||
1611 | static int check_page_writeback(struct extent_io_tree *tree, | ||
1612 | struct page *page) | ||
1613 | { | ||
1614 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
1615 | u64 end = start + PAGE_CACHE_SIZE - 1; | ||
1616 | if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) | ||
1617 | end_page_writeback(page); | ||
1618 | return 0; | ||
1619 | } | ||
1620 | |||
1621 | /* lots and lots of room for performance fixes in the end_bio funcs */ | ||
1622 | |||
1623 | /* | ||
1624 | * after a writepage IO is done, we need to: | ||
1625 | * clear the uptodate bits on error | ||
1626 | * clear the writeback bits in the extent tree for this IO | ||
1627 | * end_page_writeback if the page has no more pending IO | ||
1628 | * | ||
1629 | * Scheduling is not allowed, so the extent state tree is expected | ||
1630 | * to have one and only one object corresponding to this IO. | ||
1631 | */ | ||
1632 | static void end_bio_extent_writepage(struct bio *bio, int err) | ||
1633 | { | ||
1634 | int uptodate = err == 0; | ||
1635 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
1636 | struct extent_io_tree *tree; | ||
1637 | u64 start; | ||
1638 | u64 end; | ||
1639 | int whole_page; | ||
1640 | int ret; | ||
1641 | |||
1642 | do { | ||
1643 | struct page *page = bvec->bv_page; | ||
1644 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
1645 | |||
1646 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | ||
1647 | bvec->bv_offset; | ||
1648 | end = start + bvec->bv_len - 1; | ||
1649 | |||
1650 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | ||
1651 | whole_page = 1; | ||
1652 | else | ||
1653 | whole_page = 0; | ||
1654 | |||
1655 | if (--bvec >= bio->bi_io_vec) | ||
1656 | prefetchw(&bvec->bv_page->flags); | ||
1657 | if (tree->ops && tree->ops->writepage_end_io_hook) { | ||
1658 | ret = tree->ops->writepage_end_io_hook(page, start, | ||
1659 | end, NULL, uptodate); | ||
1660 | if (ret) | ||
1661 | uptodate = 0; | ||
1662 | } | ||
1663 | |||
1664 | if (!uptodate && tree->ops && | ||
1665 | tree->ops->writepage_io_failed_hook) { | ||
1666 | ret = tree->ops->writepage_io_failed_hook(bio, page, | ||
1667 | start, end, NULL); | ||
1668 | if (ret == 0) { | ||
1669 | uptodate = (err == 0); | ||
1670 | continue; | ||
1671 | } | ||
1672 | } | ||
1673 | |||
1674 | if (!uptodate) { | ||
1675 | clear_extent_uptodate(tree, start, end, GFP_ATOMIC); | ||
1676 | ClearPageUptodate(page); | ||
1677 | SetPageError(page); | ||
1678 | } | ||
1679 | |||
1680 | clear_extent_writeback(tree, start, end, GFP_ATOMIC); | ||
1681 | |||
1682 | if (whole_page) | ||
1683 | end_page_writeback(page); | ||
1684 | else | ||
1685 | check_page_writeback(tree, page); | ||
1686 | } while (bvec >= bio->bi_io_vec); | ||
1687 | |||
1688 | bio_put(bio); | ||
1689 | } | ||
1690 | |||
1691 | /* | ||
1692 | * after a readpage IO is done, we need to: | ||
1693 | * clear the uptodate bits on error | ||
1694 | * set the uptodate bits if things worked | ||
1695 | * set the page up to date if all extents in the tree are uptodate | ||
1696 | * clear the lock bit in the extent tree | ||
1697 | * unlock the page if there are no other extents locked for it | ||
1698 | * | ||
1699 | * Scheduling is not allowed, so the extent state tree is expected | ||
1700 | * to have one and only one object corresponding to this IO. | ||
1701 | */ | ||
1702 | static void end_bio_extent_readpage(struct bio *bio, int err) | ||
1703 | { | ||
1704 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1705 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
1706 | struct extent_io_tree *tree; | ||
1707 | u64 start; | ||
1708 | u64 end; | ||
1709 | int whole_page; | ||
1710 | int ret; | ||
1711 | |||
1712 | if (err) | ||
1713 | uptodate = 0; | ||
1714 | |||
1715 | do { | ||
1716 | struct page *page = bvec->bv_page; | ||
1717 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
1718 | |||
1719 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | ||
1720 | bvec->bv_offset; | ||
1721 | end = start + bvec->bv_len - 1; | ||
1722 | |||
1723 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | ||
1724 | whole_page = 1; | ||
1725 | else | ||
1726 | whole_page = 0; | ||
1727 | |||
1728 | if (--bvec >= bio->bi_io_vec) | ||
1729 | prefetchw(&bvec->bv_page->flags); | ||
1730 | |||
1731 | if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { | ||
1732 | ret = tree->ops->readpage_end_io_hook(page, start, end, | ||
1733 | NULL); | ||
1734 | if (ret) | ||
1735 | uptodate = 0; | ||
1736 | } | ||
1737 | if (!uptodate && tree->ops && | ||
1738 | tree->ops->readpage_io_failed_hook) { | ||
1739 | ret = tree->ops->readpage_io_failed_hook(bio, page, | ||
1740 | start, end, NULL); | ||
1741 | if (ret == 0) { | ||
1742 | uptodate = | ||
1743 | test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1744 | if (err) | ||
1745 | uptodate = 0; | ||
1746 | continue; | ||
1747 | } | ||
1748 | } | ||
1749 | |||
1750 | if (uptodate) { | ||
1751 | set_extent_uptodate(tree, start, end, | ||
1752 | GFP_ATOMIC); | ||
1753 | } | ||
1754 | unlock_extent(tree, start, end, GFP_ATOMIC); | ||
1755 | |||
1756 | if (whole_page) { | ||
1757 | if (uptodate) { | ||
1758 | SetPageUptodate(page); | ||
1759 | } else { | ||
1760 | ClearPageUptodate(page); | ||
1761 | SetPageError(page); | ||
1762 | } | ||
1763 | unlock_page(page); | ||
1764 | } else { | ||
1765 | if (uptodate) { | ||
1766 | check_page_uptodate(tree, page); | ||
1767 | } else { | ||
1768 | ClearPageUptodate(page); | ||
1769 | SetPageError(page); | ||
1770 | } | ||
1771 | check_page_locked(tree, page); | ||
1772 | } | ||
1773 | } while (bvec >= bio->bi_io_vec); | ||
1774 | |||
1775 | bio_put(bio); | ||
1776 | } | ||
1777 | |||
1778 | /* | ||
1779 | * IO done from prepare_write is pretty simple: we just unlock | ||
1780 | * the structs in the extent tree when done, and set the uptodate bits | ||
1781 | * as appropriate. | ||
1782 | */ | ||
1783 | static void end_bio_extent_preparewrite(struct bio *bio, int err) | ||
1784 | { | ||
1785 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1786 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
1787 | struct extent_io_tree *tree; | ||
1788 | u64 start; | ||
1789 | u64 end; | ||
1790 | |||
1791 | do { | ||
1792 | struct page *page = bvec->bv_page; | ||
1793 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
1794 | |||
1795 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | ||
1796 | bvec->bv_offset; | ||
1797 | end = start + bvec->bv_len - 1; | ||
1798 | |||
1799 | if (--bvec >= bio->bi_io_vec) | ||
1800 | prefetchw(&bvec->bv_page->flags); | ||
1801 | |||
1802 | if (uptodate) { | ||
1803 | set_extent_uptodate(tree, start, end, GFP_ATOMIC); | ||
1804 | } else { | ||
1805 | ClearPageUptodate(page); | ||
1806 | SetPageError(page); | ||
1807 | } | ||
1808 | |||
1809 | unlock_extent(tree, start, end, GFP_ATOMIC); | ||
1810 | |||
1811 | } while (bvec >= bio->bi_io_vec); | ||
1812 | |||
1813 | bio_put(bio); | ||
1814 | } | ||
1815 | |||
1816 | static struct bio * | ||
1817 | extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, | ||
1818 | gfp_t gfp_flags) | ||
1819 | { | ||
1820 | struct bio *bio; | ||
1821 | |||
1822 | bio = bio_alloc(gfp_flags, nr_vecs); | ||
1823 | |||
1824 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { | ||
1825 | while (!bio && (nr_vecs /= 2)) | ||
1826 | bio = bio_alloc(gfp_flags, nr_vecs); | ||
1827 | } | ||
1828 | |||
1829 | if (bio) { | ||
1830 | bio->bi_size = 0; | ||
1831 | bio->bi_bdev = bdev; | ||
1832 | bio->bi_sector = first_sector; | ||
1833 | } | ||
1834 | return bio; | ||
1835 | } | ||
1836 | |||
1837 | static int submit_one_bio(int rw, struct bio *bio, int mirror_num, | ||
1838 | unsigned long bio_flags) | ||
1839 | { | ||
1840 | int ret = 0; | ||
1841 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
1842 | struct page *page = bvec->bv_page; | ||
1843 | struct extent_io_tree *tree = bio->bi_private; | ||
1844 | u64 start; | ||
1845 | u64 end; | ||
1846 | |||
1847 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; | ||
1848 | end = start + bvec->bv_len - 1; | ||
1849 | |||
1850 | bio->bi_private = NULL; | ||
1851 | |||
1852 | bio_get(bio); | ||
1853 | |||
1854 | if (tree->ops && tree->ops->submit_bio_hook) | ||
1855 | tree->ops->submit_bio_hook(page->mapping->host, rw, bio, | ||
1856 | mirror_num, bio_flags); | ||
1857 | else | ||
1858 | submit_bio(rw, bio); | ||
1859 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | ||
1860 | ret = -EOPNOTSUPP; | ||
1861 | bio_put(bio); | ||
1862 | return ret; | ||
1863 | } | ||
1864 | |||
1865 | static int submit_extent_page(int rw, struct extent_io_tree *tree, | ||
1866 | struct page *page, sector_t sector, | ||
1867 | size_t size, unsigned long offset, | ||
1868 | struct block_device *bdev, | ||
1869 | struct bio **bio_ret, | ||
1870 | unsigned long max_pages, | ||
1871 | bio_end_io_t end_io_func, | ||
1872 | int mirror_num, | ||
1873 | unsigned long prev_bio_flags, | ||
1874 | unsigned long bio_flags) | ||
1875 | { | ||
1876 | int ret = 0; | ||
1877 | struct bio *bio; | ||
1878 | int nr; | ||
1879 | int contig = 0; | ||
1880 | int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; | ||
1881 | int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; | ||
1882 | size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); | ||
1883 | |||
1884 | if (bio_ret && *bio_ret) { | ||
1885 | bio = *bio_ret; | ||
1886 | if (old_compressed) | ||
1887 | contig = bio->bi_sector == sector; | ||
1888 | else | ||
1889 | contig = bio->bi_sector + (bio->bi_size >> 9) == | ||
1890 | sector; | ||
1891 | |||
1892 | if (prev_bio_flags != bio_flags || !contig || | ||
1893 | (tree->ops && tree->ops->merge_bio_hook && | ||
1894 | tree->ops->merge_bio_hook(page, offset, page_size, bio, | ||
1895 | bio_flags)) || | ||
1896 | bio_add_page(bio, page, page_size, offset) < page_size) { | ||
1897 | ret = submit_one_bio(rw, bio, mirror_num, | ||
1898 | prev_bio_flags); | ||
1899 | bio = NULL; | ||
1900 | } else { | ||
1901 | return 0; | ||
1902 | } | ||
1903 | } | ||
1904 | if (this_compressed) | ||
1905 | nr = BIO_MAX_PAGES; | ||
1906 | else | ||
1907 | nr = bio_get_nr_vecs(bdev); | ||
1908 | |||
1909 | bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); | ||
1910 | |||
1911 | bio_add_page(bio, page, page_size, offset); | ||
1912 | bio->bi_end_io = end_io_func; | ||
1913 | bio->bi_private = tree; | ||
1914 | |||
1915 | if (bio_ret) | ||
1916 | *bio_ret = bio; | ||
1917 | else | ||
1918 | ret = submit_one_bio(rw, bio, mirror_num, bio_flags); | ||
1919 | |||
1920 | return ret; | ||
1921 | } | ||
1922 | |||
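The merge test in submit_extent_page reduces to sector arithmetic: an uncompressed page is contiguous with the pending bio only when it begins exactly at bi_sector plus the sectors already queued (bi_size is in bytes, and >> 9 converts to 512-byte sectors). A self-contained check of that formula:

#include <stdio.h>

/* sectors are 512 bytes, so bytes >> 9 converts to sectors */
static int is_contiguous(unsigned long long bi_sector,
			 unsigned int bi_size_bytes,
			 unsigned long long next_sector)
{
	return bi_sector + (bi_size_bytes >> 9) == next_sector;
}

int main(void)
{
	/* a bio at sector 1000 holding 8K (16 sectors) merges with 1016 */
	printf("%d\n", is_contiguous(1000, 8192, 1016)); /* 1 */
	printf("%d\n", is_contiguous(1000, 8192, 1024)); /* 0 */
	return 0;
}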
1923 | void set_page_extent_mapped(struct page *page) | ||
1924 | { | ||
1925 | if (!PagePrivate(page)) { | ||
1926 | SetPagePrivate(page); | ||
1927 | page_cache_get(page); | ||
1928 | set_page_private(page, EXTENT_PAGE_PRIVATE); | ||
1929 | } | ||
1930 | } | ||
1931 | |||
1932 | static void set_page_extent_head(struct page *page, unsigned long len) | ||
1933 | { | ||
1934 | set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); | ||
1935 | } | ||
1936 | |||
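set_page_extent_mapped and set_page_extent_head overload page->private as a tag: an ordinary mapped page carries EXTENT_PAGE_PRIVATE, while the head page of an extent buffer stores the buffer length shifted up two bits, OR'd with a first-page marker in the low bits. A small sketch of that encoding; the constant values here are assumptions shown for illustration, the real definitions live in extent_io.h:

#define EXTENT_PAGE_PRIVATE            1 /* assumed value, see extent_io.h */
#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 /* assumed value, see extent_io.h */

/* head pages keep the buffer length in the bits above the tag */
static unsigned long encode_head_private(unsigned long len)
{
	return EXTENT_PAGE_PRIVATE_FIRST_PAGE | (len << 2);
}

static unsigned long head_private_to_len(unsigned long private)
{
	return private >> 2;
}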
1937 | /* | ||
1938 | * basic readpage implementation. Locked extent state structs are inserted | ||
1939 | * into the tree; they are removed when the IO is done (by the end_io | ||
1940 | * handlers). | ||
1941 | */ | ||
1942 | static int __extent_read_full_page(struct extent_io_tree *tree, | ||
1943 | struct page *page, | ||
1944 | get_extent_t *get_extent, | ||
1945 | struct bio **bio, int mirror_num, | ||
1946 | unsigned long *bio_flags) | ||
1947 | { | ||
1948 | struct inode *inode = page->mapping->host; | ||
1949 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
1950 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | ||
1951 | u64 end; | ||
1952 | u64 cur = start; | ||
1953 | u64 extent_offset; | ||
1954 | u64 last_byte = i_size_read(inode); | ||
1955 | u64 block_start; | ||
1956 | u64 cur_end; | ||
1957 | sector_t sector; | ||
1958 | struct extent_map *em; | ||
1959 | struct block_device *bdev; | ||
1960 | int ret; | ||
1961 | int nr = 0; | ||
1962 | size_t page_offset = 0; | ||
1963 | size_t iosize; | ||
1964 | size_t disk_io_size; | ||
1965 | size_t blocksize = inode->i_sb->s_blocksize; | ||
1966 | unsigned long this_bio_flag = 0; | ||
1967 | |||
1968 | set_page_extent_mapped(page); | ||
1969 | |||
1970 | end = page_end; | ||
1971 | lock_extent(tree, start, end, GFP_NOFS); | ||
1972 | |||
1973 | if (page->index == last_byte >> PAGE_CACHE_SHIFT) { | ||
1974 | char *userpage; | ||
1975 | size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); | ||
1976 | |||
1977 | if (zero_offset) { | ||
1978 | iosize = PAGE_CACHE_SIZE - zero_offset; | ||
1979 | userpage = kmap_atomic(page, KM_USER0); | ||
1980 | memset(userpage + zero_offset, 0, iosize); | ||
1981 | flush_dcache_page(page); | ||
1982 | kunmap_atomic(userpage, KM_USER0); | ||
1983 | } | ||
1984 | } | ||
1985 | while (cur <= end) { | ||
1986 | if (cur >= last_byte) { | ||
1987 | char *userpage; | ||
1988 | iosize = PAGE_CACHE_SIZE - page_offset; | ||
1989 | userpage = kmap_atomic(page, KM_USER0); | ||
1990 | memset(userpage + page_offset, 0, iosize); | ||
1991 | flush_dcache_page(page); | ||
1992 | kunmap_atomic(userpage, KM_USER0); | ||
1993 | set_extent_uptodate(tree, cur, cur + iosize - 1, | ||
1994 | GFP_NOFS); | ||
1995 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); | ||
1996 | break; | ||
1997 | } | ||
1998 | em = get_extent(inode, page, page_offset, cur, | ||
1999 | end - cur + 1, 0); | ||
2000 | if (IS_ERR(em) || !em) { | ||
2001 | SetPageError(page); | ||
2002 | unlock_extent(tree, cur, end, GFP_NOFS); | ||
2003 | break; | ||
2004 | } | ||
2005 | extent_offset = cur - em->start; | ||
2006 | BUG_ON(extent_map_end(em) <= cur); | ||
2007 | BUG_ON(end < cur); | ||
2008 | |||
2009 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) | ||
2010 | this_bio_flag = EXTENT_BIO_COMPRESSED; | ||
2011 | |||
2012 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | ||
2013 | cur_end = min(extent_map_end(em) - 1, end); | ||
2014 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | ||
2015 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { | ||
2016 | disk_io_size = em->block_len; | ||
2017 | sector = em->block_start >> 9; | ||
2018 | } else { | ||
2019 | sector = (em->block_start + extent_offset) >> 9; | ||
2020 | disk_io_size = iosize; | ||
2021 | } | ||
2022 | bdev = em->bdev; | ||
2023 | block_start = em->block_start; | ||
2024 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) | ||
2025 | block_start = EXTENT_MAP_HOLE; | ||
2026 | free_extent_map(em); | ||
2027 | em = NULL; | ||
2028 | |||
2029 | /* we've found a hole; just zero it and go on */ | ||
2030 | if (block_start == EXTENT_MAP_HOLE) { | ||
2031 | char *userpage; | ||
2032 | userpage = kmap_atomic(page, KM_USER0); | ||
2033 | memset(userpage + page_offset, 0, iosize); | ||
2034 | flush_dcache_page(page); | ||
2035 | kunmap_atomic(userpage, KM_USER0); | ||
2036 | |||
2037 | set_extent_uptodate(tree, cur, cur + iosize - 1, | ||
2038 | GFP_NOFS); | ||
2039 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); | ||
2040 | cur = cur + iosize; | ||
2041 | page_offset += iosize; | ||
2042 | continue; | ||
2043 | } | ||
2044 | /* the get_extent function already copied the data into the page */ | ||
2045 | if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { | ||
2046 | check_page_uptodate(tree, page); | ||
2047 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); | ||
2048 | cur = cur + iosize; | ||
2049 | page_offset += iosize; | ||
2050 | continue; | ||
2051 | } | ||
2052 | /* we have an inline extent but it didn't get marked | ||
2053 | * uptodate. Error out. | ||
2054 | */ | ||
2055 | if (block_start == EXTENT_MAP_INLINE) { | ||
2056 | SetPageError(page); | ||
2057 | unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); | ||
2058 | cur = cur + iosize; | ||
2059 | page_offset += iosize; | ||
2060 | continue; | ||
2061 | } | ||
2062 | |||
2063 | ret = 0; | ||
2064 | if (tree->ops && tree->ops->readpage_io_hook) { | ||
2065 | ret = tree->ops->readpage_io_hook(page, cur, | ||
2066 | cur + iosize - 1); | ||
2067 | } | ||
2068 | if (!ret) { | ||
2069 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | ||
2070 | pnr -= page->index; | ||
2071 | ret = submit_extent_page(READ, tree, page, | ||
2072 | sector, disk_io_size, page_offset, | ||
2073 | bdev, bio, pnr, | ||
2074 | end_bio_extent_readpage, mirror_num, | ||
2075 | *bio_flags, | ||
2076 | this_bio_flag); | ||
2077 | nr++; | ||
2078 | *bio_flags = this_bio_flag; | ||
2079 | } | ||
2080 | if (ret) | ||
2081 | SetPageError(page); | ||
2082 | cur = cur + iosize; | ||
2083 | page_offset += iosize; | ||
2084 | } | ||
2085 | if (!nr) { | ||
2086 | if (!PageError(page)) | ||
2087 | SetPageUptodate(page); | ||
2088 | unlock_page(page); | ||
2089 | } | ||
2090 | return 0; | ||
2091 | } | ||
2092 | |||
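Both the read loop above and the write loop below round each chunk to a block boundary with iosize = (iosize + blocksize - 1) & ~(blocksize - 1), which is correct only because blocksize is a power of two. A quick self-contained check of the identity:

#include <assert.h>

static unsigned long long round_up_pow2(unsigned long long x,
					unsigned long long blocksize)
{
	/* valid only when blocksize is a power of two */
	return (x + blocksize - 1) & ~(blocksize - 1);
}

int main(void)
{
	assert(round_up_pow2(1, 4096) == 4096);
	assert(round_up_pow2(4096, 4096) == 4096);
	assert(round_up_pow2(4097, 4096) == 8192);
	return 0;
}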
2093 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, | ||
2094 | get_extent_t *get_extent) | ||
2095 | { | ||
2096 | struct bio *bio = NULL; | ||
2097 | unsigned long bio_flags = 0; | ||
2098 | int ret; | ||
2099 | |||
2100 | ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, | ||
2101 | &bio_flags); | ||
2102 | if (bio) | ||
2103 | submit_one_bio(READ, bio, 0, bio_flags); | ||
2104 | return ret; | ||
2105 | } | ||
2106 | |||
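extent_read_full_page is shaped to slot straight into an address_space readpage operation; a filesystem wrapper only needs to find its io_tree and supply its get_extent callback. A hedged sketch of such a wrapper (btrfs's own callback is spelled btrfs_get_extent elsewhere in this series):

static int btrfs_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btrfs_get_extent);
}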
2107 | /* | ||
2108 | * the writepage semantics are similar to regular writepage. extent | ||
2109 | * records are inserted to lock ranges in the tree, and as dirty areas | ||
2110 | * are found, they are marked writeback. Then the lock bits are removed | ||
2111 | * and the end_io handler clears the writeback ranges | ||
2112 | */ | ||
2113 | static int __extent_writepage(struct page *page, struct writeback_control *wbc, | ||
2114 | void *data) | ||
2115 | { | ||
2116 | struct inode *inode = page->mapping->host; | ||
2117 | struct extent_page_data *epd = data; | ||
2118 | struct extent_io_tree *tree = epd->tree; | ||
2119 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
2120 | u64 delalloc_start; | ||
2121 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | ||
2122 | u64 end; | ||
2123 | u64 cur = start; | ||
2124 | u64 extent_offset; | ||
2125 | u64 last_byte = i_size_read(inode); | ||
2126 | u64 block_start; | ||
2127 | u64 iosize; | ||
2128 | u64 unlock_start; | ||
2129 | sector_t sector; | ||
2130 | struct extent_map *em; | ||
2131 | struct block_device *bdev; | ||
2132 | int ret; | ||
2133 | int nr = 0; | ||
2134 | size_t pg_offset = 0; | ||
2135 | size_t blocksize; | ||
2136 | loff_t i_size = i_size_read(inode); | ||
2137 | unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; | ||
2138 | u64 nr_delalloc; | ||
2139 | u64 delalloc_end; | ||
2140 | int page_started; | ||
2141 | int compressed; | ||
2142 | unsigned long nr_written = 0; | ||
2143 | |||
2144 | WARN_ON(!PageLocked(page)); | ||
2145 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); | ||
2146 | if (page->index > end_index || | ||
2147 | (page->index == end_index && !pg_offset)) { | ||
2148 | page->mapping->a_ops->invalidatepage(page, 0); | ||
2149 | unlock_page(page); | ||
2150 | return 0; | ||
2151 | } | ||
2152 | |||
2153 | if (page->index == end_index) { | ||
2154 | char *userpage; | ||
2155 | |||
2156 | userpage = kmap_atomic(page, KM_USER0); | ||
2157 | memset(userpage + pg_offset, 0, | ||
2158 | PAGE_CACHE_SIZE - pg_offset); | ||
2159 | kunmap_atomic(userpage, KM_USER0); | ||
2160 | flush_dcache_page(page); | ||
2161 | } | ||
2162 | pg_offset = 0; | ||
2163 | |||
2164 | set_page_extent_mapped(page); | ||
2165 | |||
2166 | delalloc_start = start; | ||
2167 | delalloc_end = 0; | ||
2168 | page_started = 0; | ||
2169 | if (!epd->extent_locked) { | ||
2170 | while (delalloc_end < page_end) { | ||
2171 | nr_delalloc = find_lock_delalloc_range(inode, tree, | ||
2172 | page, | ||
2173 | &delalloc_start, | ||
2174 | &delalloc_end, | ||
2175 | 128 * 1024 * 1024); | ||
2176 | if (nr_delalloc == 0) { | ||
2177 | delalloc_start = delalloc_end + 1; | ||
2178 | continue; | ||
2179 | } | ||
2180 | tree->ops->fill_delalloc(inode, page, delalloc_start, | ||
2181 | delalloc_end, &page_started, | ||
2182 | &nr_written); | ||
2183 | delalloc_start = delalloc_end + 1; | ||
2184 | } | ||
2185 | |||
2186 | /* did the fill delalloc function already unlock and start | ||
2187 | * the IO? | ||
2188 | */ | ||
2189 | if (page_started) { | ||
2190 | ret = 0; | ||
2191 | goto update_nr_written; | ||
2192 | } | ||
2193 | } | ||
2194 | lock_extent(tree, start, page_end, GFP_NOFS); | ||
2195 | |||
2196 | unlock_start = start; | ||
2197 | |||
2198 | if (tree->ops && tree->ops->writepage_start_hook) { | ||
2199 | ret = tree->ops->writepage_start_hook(page, start, | ||
2200 | page_end); | ||
2201 | if (ret == -EAGAIN) { | ||
2202 | unlock_extent(tree, start, page_end, GFP_NOFS); | ||
2203 | redirty_page_for_writepage(wbc, page); | ||
2204 | unlock_page(page); | ||
2205 | ret = 0; | ||
2206 | goto update_nr_written; | ||
2207 | } | ||
2208 | } | ||
2209 | |||
2210 | nr_written++; | ||
2211 | |||
2212 | end = page_end; | ||
2213 | if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) | ||
2214 | printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); | ||
2215 | |||
2216 | if (last_byte <= start) { | ||
2217 | clear_extent_dirty(tree, start, page_end, GFP_NOFS); | ||
2218 | unlock_extent(tree, start, page_end, GFP_NOFS); | ||
2219 | if (tree->ops && tree->ops->writepage_end_io_hook) | ||
2220 | tree->ops->writepage_end_io_hook(page, start, | ||
2221 | page_end, NULL, 1); | ||
2222 | unlock_start = page_end + 1; | ||
2223 | goto done; | ||
2224 | } | ||
2225 | |||
2226 | set_extent_uptodate(tree, start, page_end, GFP_NOFS); | ||
2227 | blocksize = inode->i_sb->s_blocksize; | ||
2228 | |||
2229 | while (cur <= end) { | ||
2230 | if (cur >= last_byte) { | ||
2231 | clear_extent_dirty(tree, cur, page_end, GFP_NOFS); | ||
2232 | unlock_extent(tree, unlock_start, page_end, GFP_NOFS); | ||
2233 | if (tree->ops && tree->ops->writepage_end_io_hook) | ||
2234 | tree->ops->writepage_end_io_hook(page, cur, | ||
2235 | page_end, NULL, 1); | ||
2236 | unlock_start = page_end + 1; | ||
2237 | break; | ||
2238 | } | ||
2239 | em = epd->get_extent(inode, page, pg_offset, cur, | ||
2240 | end - cur + 1, 1); | ||
2241 | if (IS_ERR(em) || !em) { | ||
2242 | SetPageError(page); | ||
2243 | break; | ||
2244 | } | ||
2245 | |||
2246 | extent_offset = cur - em->start; | ||
2247 | BUG_ON(extent_map_end(em) <= cur); | ||
2248 | BUG_ON(end < cur); | ||
2249 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | ||
2250 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | ||
2251 | sector = (em->block_start + extent_offset) >> 9; | ||
2252 | bdev = em->bdev; | ||
2253 | block_start = em->block_start; | ||
2254 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | ||
2255 | free_extent_map(em); | ||
2256 | em = NULL; | ||
2257 | |||
2258 | /* | ||
2259 | * compressed and inline extents are written through other | ||
2260 | * paths in the FS | ||
2261 | */ | ||
2262 | if (compressed || block_start == EXTENT_MAP_HOLE || | ||
2263 | block_start == EXTENT_MAP_INLINE) { | ||
2264 | clear_extent_dirty(tree, cur, | ||
2265 | cur + iosize - 1, GFP_NOFS); | ||
2266 | |||
2267 | unlock_extent(tree, unlock_start, cur + iosize - 1, | ||
2268 | GFP_NOFS); | ||
2269 | |||
2270 | /* | ||
2271 | * end_io notification does not happen here for | ||
2272 | * compressed extents | ||
2273 | */ | ||
2274 | if (!compressed && tree->ops && | ||
2275 | tree->ops->writepage_end_io_hook) | ||
2276 | tree->ops->writepage_end_io_hook(page, cur, | ||
2277 | cur + iosize - 1, | ||
2278 | NULL, 1); | ||
2279 | else if (compressed) { | ||
2280 | /* we don't want to end_page_writeback on | ||
2281 | * a compressed extent. That happens | ||
2282 | * elsewhere. | ||
2283 | */ | ||
2284 | nr++; | ||
2285 | } | ||
2286 | |||
2287 | cur += iosize; | ||
2288 | pg_offset += iosize; | ||
2289 | unlock_start = cur; | ||
2290 | continue; | ||
2291 | } | ||
2292 | /* leave this out until we have a page_mkwrite call */ | ||
2293 | if (0 && !test_range_bit(tree, cur, cur + iosize - 1, | ||
2294 | EXTENT_DIRTY, 0)) { | ||
2295 | cur = cur + iosize; | ||
2296 | pg_offset += iosize; | ||
2297 | continue; | ||
2298 | } | ||
2299 | |||
2300 | clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); | ||
2301 | if (tree->ops && tree->ops->writepage_io_hook) { | ||
2302 | ret = tree->ops->writepage_io_hook(page, cur, | ||
2303 | cur + iosize - 1); | ||
2304 | } else { | ||
2305 | ret = 0; | ||
2306 | } | ||
2307 | if (ret) { | ||
2308 | SetPageError(page); | ||
2309 | } else { | ||
2310 | unsigned long max_nr = end_index + 1; | ||
2311 | |||
2312 | set_range_writeback(tree, cur, cur + iosize - 1); | ||
2313 | if (!PageWriteback(page)) { | ||
2314 | printk(KERN_ERR "btrfs warning page %lu not " | ||
2315 | "writeback, cur %llu end %llu\n", | ||
2316 | page->index, (unsigned long long)cur, | ||
2317 | (unsigned long long)end); | ||
2318 | } | ||
2319 | |||
2320 | ret = submit_extent_page(WRITE, tree, page, sector, | ||
2321 | iosize, pg_offset, bdev, | ||
2322 | &epd->bio, max_nr, | ||
2323 | end_bio_extent_writepage, | ||
2324 | 0, 0, 0); | ||
2325 | if (ret) | ||
2326 | SetPageError(page); | ||
2327 | } | ||
2328 | cur = cur + iosize; | ||
2329 | pg_offset += iosize; | ||
2330 | nr++; | ||
2331 | } | ||
2332 | done: | ||
2333 | if (nr == 0) { | ||
2334 | /* make sure the mapping tag for page dirty gets cleared */ | ||
2335 | set_page_writeback(page); | ||
2336 | end_page_writeback(page); | ||
2337 | } | ||
2338 | if (unlock_start <= page_end) | ||
2339 | unlock_extent(tree, unlock_start, page_end, GFP_NOFS); | ||
2340 | unlock_page(page); | ||
2341 | |||
2342 | update_nr_written: | ||
2343 | wbc->nr_to_write -= nr_written; | ||
2344 | if (wbc->range_cyclic || (wbc->nr_to_write > 0 && | ||
2345 | wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) | ||
2346 | page->mapping->writeback_index = page->index + nr_written; | ||
2347 | return 0; | ||
2348 | } | ||
2349 | |||
2350 | /** | ||
2351 | * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. | ||
2352 | * @mapping: address space structure to write | ||
2353 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | ||
2354 | * @writepage: function called for each page | ||
2355 | * @data: data passed to writepage function | ||
2356 | * | ||
2357 | * If a page is already under I/O, write_cache_pages() skips it, even | ||
2358 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, | ||
2359 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() | ||
2360 | * and msync() need to guarantee that all the data which was dirty at the time | ||
2361 | * the call was made get new I/O started against them. If wbc->sync_mode is | ||
2362 | * WB_SYNC_ALL then we were called for data integrity and we must wait for | ||
2363 | * existing IO to complete. | ||
2364 | */ | ||
2365 | static int extent_write_cache_pages(struct extent_io_tree *tree, | ||
2366 | struct address_space *mapping, | ||
2367 | struct writeback_control *wbc, | ||
2368 | writepage_t writepage, void *data, | ||
2369 | void (*flush_fn)(void *)) | ||
2370 | { | ||
2371 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
2372 | int ret = 0; | ||
2373 | int done = 0; | ||
2374 | struct pagevec pvec; | ||
2375 | int nr_pages; | ||
2376 | pgoff_t index; | ||
2377 | pgoff_t end; /* Inclusive */ | ||
2378 | int scanned = 0; | ||
2379 | int range_whole = 0; | ||
2380 | |||
2381 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
2382 | wbc->encountered_congestion = 1; | ||
2383 | return 0; | ||
2384 | } | ||
2385 | |||
2386 | pagevec_init(&pvec, 0); | ||
2387 | if (wbc->range_cyclic) { | ||
2388 | index = mapping->writeback_index; /* Start from prev offset */ | ||
2389 | end = -1; | ||
2390 | } else { | ||
2391 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | ||
2392 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | ||
2393 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | ||
2394 | range_whole = 1; | ||
2395 | scanned = 1; | ||
2396 | } | ||
2397 | retry: | ||
2398 | while (!done && (index <= end) && | ||
2399 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
2400 | PAGECACHE_TAG_DIRTY, min(end - index, | ||
2401 | (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | ||
2402 | unsigned i; | ||
2403 | |||
2404 | scanned = 1; | ||
2405 | for (i = 0; i < nr_pages; i++) { | ||
2406 | struct page *page = pvec.pages[i]; | ||
2407 | |||
2408 | /* | ||
2409 | * At this point we hold neither mapping->tree_lock nor | ||
2410 | * lock on the page itself: the page may be truncated or | ||
2411 | * invalidated (changing page->mapping to NULL), or even | ||
2412 | * swizzled back from swapper_space to tmpfs file | ||
2413 | * mapping | ||
2414 | */ | ||
2415 | if (tree->ops && tree->ops->write_cache_pages_lock_hook) | ||
2416 | tree->ops->write_cache_pages_lock_hook(page); | ||
2417 | else | ||
2418 | lock_page(page); | ||
2419 | |||
2420 | if (unlikely(page->mapping != mapping)) { | ||
2421 | unlock_page(page); | ||
2422 | continue; | ||
2423 | } | ||
2424 | |||
2425 | if (!wbc->range_cyclic && page->index > end) { | ||
2426 | done = 1; | ||
2427 | unlock_page(page); | ||
2428 | continue; | ||
2429 | } | ||
2430 | |||
2431 | if (wbc->sync_mode != WB_SYNC_NONE) { | ||
2432 | if (PageWriteback(page)) | ||
2433 | flush_fn(data); | ||
2434 | wait_on_page_writeback(page); | ||
2435 | } | ||
2436 | |||
2437 | if (PageWriteback(page) || | ||
2438 | !clear_page_dirty_for_io(page)) { | ||
2439 | unlock_page(page); | ||
2440 | continue; | ||
2441 | } | ||
2442 | |||
2443 | ret = (*writepage)(page, wbc, data); | ||
2444 | |||
2445 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { | ||
2446 | unlock_page(page); | ||
2447 | ret = 0; | ||
2448 | } | ||
2449 | if (ret || wbc->nr_to_write <= 0) | ||
2450 | done = 1; | ||
2451 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
2452 | wbc->encountered_congestion = 1; | ||
2453 | done = 1; | ||
2454 | } | ||
2455 | } | ||
2456 | pagevec_release(&pvec); | ||
2457 | cond_resched(); | ||
2458 | } | ||
2459 | if (!scanned && !done) { | ||
2460 | /* | ||
2461 | * We hit the last page and there is more work to be done: wrap | ||
2462 | * back to the start of the file | ||
2463 | */ | ||
2464 | scanned = 1; | ||
2465 | index = 0; | ||
2466 | goto retry; | ||
2467 | } | ||
2468 | return ret; | ||
2469 | } | ||
2470 | |||
2471 | static noinline void flush_write_bio(void *data) | ||
2472 | { | ||
2473 | struct extent_page_data *epd = data; | ||
2474 | if (epd->bio) { | ||
2475 | submit_one_bio(WRITE, epd->bio, 0, 0); | ||
2476 | epd->bio = NULL; | ||
2477 | } | ||
2478 | } | ||
2479 | |||
2480 | int extent_write_full_page(struct extent_io_tree *tree, struct page *page, | ||
2481 | get_extent_t *get_extent, | ||
2482 | struct writeback_control *wbc) | ||
2483 | { | ||
2484 | int ret; | ||
2485 | struct address_space *mapping = page->mapping; | ||
2486 | struct extent_page_data epd = { | ||
2487 | .bio = NULL, | ||
2488 | .tree = tree, | ||
2489 | .get_extent = get_extent, | ||
2490 | .extent_locked = 0, | ||
2491 | }; | ||
2492 | struct writeback_control wbc_writepages = { | ||
2493 | .bdi = wbc->bdi, | ||
2494 | .sync_mode = WB_SYNC_NONE, | ||
2495 | .older_than_this = NULL, | ||
2496 | .nr_to_write = 64, | ||
2497 | .range_start = page_offset(page) + PAGE_CACHE_SIZE, | ||
2498 | .range_end = (loff_t)-1, | ||
2499 | }; | ||
2500 | |||
2501 | |||
2502 | ret = __extent_writepage(page, wbc, &epd); | ||
2503 | |||
2504 | extent_write_cache_pages(tree, mapping, &wbc_writepages, | ||
2505 | __extent_writepage, &epd, flush_write_bio); | ||
2506 | if (epd.bio) | ||
2507 | submit_one_bio(WRITE, epd.bio, 0, 0); | ||
2508 | return ret; | ||
2509 | } | ||
2510 | |||
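Note the opportunistic batching in extent_write_full_page: after writing the target page it runs a second, WB_SYNC_NONE pass over up to 64 pages starting just past the target (range_start is page_offset(page) + PAGE_CACHE_SIZE), so nearby dirty pages ride along in the same bio. A hedged sketch of the writepage wrapper a filesystem would install on top of it:

static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}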
2511 | int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, | ||
2512 | u64 start, u64 end, get_extent_t *get_extent, | ||
2513 | int mode) | ||
2514 | { | ||
2515 | int ret = 0; | ||
2516 | struct address_space *mapping = inode->i_mapping; | ||
2517 | struct page *page; | ||
2518 | unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> | ||
2519 | PAGE_CACHE_SHIFT; | ||
2520 | |||
2521 | struct extent_page_data epd = { | ||
2522 | .bio = NULL, | ||
2523 | .tree = tree, | ||
2524 | .get_extent = get_extent, | ||
2525 | .extent_locked = 1, | ||
2526 | }; | ||
2527 | struct writeback_control wbc_writepages = { | ||
2528 | .bdi = inode->i_mapping->backing_dev_info, | ||
2529 | .sync_mode = mode, | ||
2530 | .older_than_this = NULL, | ||
2531 | .nr_to_write = nr_pages * 2, | ||
2532 | .range_start = start, | ||
2533 | .range_end = end + 1, | ||
2534 | }; | ||
2535 | |||
2536 | while (start <= end) { | ||
2537 | page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); | ||
2538 | if (clear_page_dirty_for_io(page)) | ||
2539 | ret = __extent_writepage(page, &wbc_writepages, &epd); | ||
2540 | else { | ||
2541 | if (tree->ops && tree->ops->writepage_end_io_hook) | ||
2542 | tree->ops->writepage_end_io_hook(page, start, | ||
2543 | start + PAGE_CACHE_SIZE - 1, | ||
2544 | NULL, 1); | ||
2545 | unlock_page(page); | ||
2546 | } | ||
2547 | page_cache_release(page); | ||
2548 | start += PAGE_CACHE_SIZE; | ||
2549 | } | ||
2550 | |||
2551 | if (epd.bio) | ||
2552 | submit_one_bio(WRITE, epd.bio, 0, 0); | ||
2553 | return ret; | ||
2554 | } | ||
2555 | |||
2556 | int extent_writepages(struct extent_io_tree *tree, | ||
2557 | struct address_space *mapping, | ||
2558 | get_extent_t *get_extent, | ||
2559 | struct writeback_control *wbc) | ||
2560 | { | ||
2561 | int ret = 0; | ||
2562 | struct extent_page_data epd = { | ||
2563 | .bio = NULL, | ||
2564 | .tree = tree, | ||
2565 | .get_extent = get_extent, | ||
2566 | .extent_locked = 0, | ||
2567 | }; | ||
2568 | |||
2569 | ret = extent_write_cache_pages(tree, mapping, wbc, | ||
2570 | __extent_writepage, &epd, | ||
2571 | flush_write_bio); | ||
2572 | if (epd.bio) | ||
2573 | submit_one_bio(WRITE, epd.bio, 0, 0); | ||
2574 | return ret; | ||
2575 | } | ||
2576 | |||
2577 | int extent_readpages(struct extent_io_tree *tree, | ||
2578 | struct address_space *mapping, | ||
2579 | struct list_head *pages, unsigned nr_pages, | ||
2580 | get_extent_t get_extent) | ||
2581 | { | ||
2582 | struct bio *bio = NULL; | ||
2583 | unsigned page_idx; | ||
2584 | struct pagevec pvec; | ||
2585 | unsigned long bio_flags = 0; | ||
2586 | |||
2587 | pagevec_init(&pvec, 0); | ||
2588 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | ||
2589 | struct page *page = list_entry(pages->prev, struct page, lru); | ||
2590 | |||
2591 | prefetchw(&page->flags); | ||
2592 | list_del(&page->lru); | ||
2593 | /* | ||
2594 | * what we want to do here is call add_to_page_cache_lru, | ||
2595 | * but that isn't exported, so we reproduce it here | ||
2596 | */ | ||
2597 | if (!add_to_page_cache(page, mapping, | ||
2598 | page->index, GFP_KERNEL)) { | ||
2599 | |||
2600 | /* open coding of lru_cache_add, also not exported */ | ||
2601 | page_cache_get(page); | ||
2602 | if (!pagevec_add(&pvec, page)) | ||
2603 | __pagevec_lru_add_file(&pvec); | ||
2604 | __extent_read_full_page(tree, page, get_extent, | ||
2605 | &bio, 0, &bio_flags); | ||
2606 | } | ||
2607 | page_cache_release(page); | ||
2608 | } | ||
2609 | if (pagevec_count(&pvec)) | ||
2610 | __pagevec_lru_add_file(&pvec); | ||
2611 | BUG_ON(!list_empty(pages)); | ||
2612 | if (bio) | ||
2613 | submit_one_bio(READ, bio, 0, bio_flags); | ||
2614 | return 0; | ||
2615 | } | ||
2616 | |||
2617 | /* | ||
2618 | * basic invalidatepage code: it waits on any locked or writeback | ||
2619 | * ranges corresponding to the page, and then deletes any extent state | ||
2620 | * records from the tree. | ||
2621 | */ | ||
2622 | int extent_invalidatepage(struct extent_io_tree *tree, | ||
2623 | struct page *page, unsigned long offset) | ||
2624 | { | ||
2625 | u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); | ||
2626 | u64 end = start + PAGE_CACHE_SIZE - 1; | ||
2627 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; | ||
2628 | |||
2629 | start += (offset + blocksize - 1) & ~(blocksize - 1); | ||
2630 | if (start > end) | ||
2631 | return 0; | ||
2632 | |||
2633 | lock_extent(tree, start, end, GFP_NOFS); | ||
2634 | wait_on_extent_writeback(tree, start, end); | ||
2635 | clear_extent_bit(tree, start, end, | ||
2636 | EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, | ||
2637 | 1, 1, GFP_NOFS); | ||
2638 | return 0; | ||
2639 | } | ||
2640 | |||
2641 | /* | ||
2642 | * simple commit_write call; set_page_dirty is used to mark both | ||
2643 | * the page and the extent records as dirty | ||
2644 | */ | ||
2645 | int extent_commit_write(struct extent_io_tree *tree, | ||
2646 | struct inode *inode, struct page *page, | ||
2647 | unsigned from, unsigned to) | ||
2648 | { | ||
2649 | loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; | ||
2650 | |||
2651 | set_page_extent_mapped(page); | ||
2652 | set_page_dirty(page); | ||
2653 | |||
2654 | if (pos > inode->i_size) { | ||
2655 | i_size_write(inode, pos); | ||
2656 | mark_inode_dirty(inode); | ||
2657 | } | ||
2658 | return 0; | ||
2659 | } | ||
2660 | |||
2661 | int extent_prepare_write(struct extent_io_tree *tree, | ||
2662 | struct inode *inode, struct page *page, | ||
2663 | unsigned from, unsigned to, get_extent_t *get_extent) | ||
2664 | { | ||
2665 | u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
2666 | u64 page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
2667 | u64 block_start; | ||
2668 | u64 orig_block_start; | ||
2669 | u64 block_end; | ||
2670 | u64 cur_end; | ||
2671 | struct extent_map *em; | ||
2672 | unsigned blocksize = 1 << inode->i_blkbits; | ||
2673 | size_t page_offset = 0; | ||
2674 | size_t block_off_start; | ||
2675 | size_t block_off_end; | ||
2676 | int err = 0; | ||
2677 | int iocount = 0; | ||
2678 | int ret = 0; | ||
2679 | int isnew; | ||
2680 | |||
2681 | set_page_extent_mapped(page); | ||
2682 | |||
2683 | block_start = (page_start + from) & ~((u64)blocksize - 1); | ||
2684 | block_end = (page_start + to - 1) | (blocksize - 1); | ||
2685 | orig_block_start = block_start; | ||
2686 | |||
2687 | lock_extent(tree, page_start, page_end, GFP_NOFS); | ||
2688 | while (block_start <= block_end) { | ||
2689 | em = get_extent(inode, page, page_offset, block_start, | ||
2690 | block_end - block_start + 1, 1); | ||
2691 | if (IS_ERR(em) || !em) | ||
2692 | goto err; | ||
2693 | |||
2694 | cur_end = min(block_end, extent_map_end(em) - 1); | ||
2695 | block_off_start = block_start & (PAGE_CACHE_SIZE - 1); | ||
2696 | block_off_end = block_off_start + blocksize; | ||
2697 | isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); | ||
2698 | |||
2699 | if (!PageUptodate(page) && isnew && | ||
2700 | (block_off_end > to || block_off_start < from)) { | ||
2701 | void *kaddr; | ||
2702 | |||
2703 | kaddr = kmap_atomic(page, KM_USER0); | ||
2704 | if (block_off_end > to) | ||
2705 | memset(kaddr + to, 0, block_off_end - to); | ||
2706 | if (block_off_start < from) | ||
2707 | memset(kaddr + block_off_start, 0, | ||
2708 | from - block_off_start); | ||
2709 | flush_dcache_page(page); | ||
2710 | kunmap_atomic(kaddr, KM_USER0); | ||
2711 | } | ||
2712 | if ((em->block_start != EXTENT_MAP_HOLE && | ||
2713 | em->block_start != EXTENT_MAP_INLINE) && | ||
2714 | !isnew && !PageUptodate(page) && | ||
2715 | (block_off_end > to || block_off_start < from) && | ||
2716 | !test_range_bit(tree, block_start, cur_end, | ||
2717 | EXTENT_UPTODATE, 1)) { | ||
2718 | u64 sector; | ||
2719 | u64 extent_offset = block_start - em->start; | ||
2720 | size_t iosize; | ||
2721 | sector = (em->block_start + extent_offset) >> 9; | ||
2722 | iosize = (cur_end - block_start + blocksize) & | ||
2723 | ~((u64)blocksize - 1); | ||
2724 | /* | ||
2725 | * we've already got the extent locked, but we | ||
2726 | * need to split the state such that our end_bio | ||
2727 | * handler can clear the lock. | ||
2728 | */ | ||
2729 | set_extent_bit(tree, block_start, | ||
2730 | block_start + iosize - 1, | ||
2731 | EXTENT_LOCKED, 0, NULL, GFP_NOFS); | ||
2732 | ret = submit_extent_page(READ, tree, page, | ||
2733 | sector, iosize, page_offset, em->bdev, | ||
2734 | NULL, 1, | ||
2735 | end_bio_extent_preparewrite, 0, | ||
2736 | 0, 0); | ||
2737 | iocount++; | ||
2738 | block_start = block_start + iosize; | ||
2739 | } else { | ||
2740 | set_extent_uptodate(tree, block_start, cur_end, | ||
2741 | GFP_NOFS); | ||
2742 | unlock_extent(tree, block_start, cur_end, GFP_NOFS); | ||
2743 | block_start = cur_end + 1; | ||
2744 | } | ||
2745 | page_offset = block_start & (PAGE_CACHE_SIZE - 1); | ||
2746 | free_extent_map(em); | ||
2747 | } | ||
2748 | if (iocount) { | ||
2749 | wait_extent_bit(tree, orig_block_start, | ||
2750 | block_end, EXTENT_LOCKED); | ||
2751 | } | ||
2752 | check_page_uptodate(tree, page); | ||
2753 | err: | ||
2754 | /* FIXME, zero out newly allocated blocks on error */ | ||
2755 | return err; | ||
2756 | } | ||
2757 | |||
2758 | /* | ||
2759 | * a helper for releasepage: it tests for areas of the page that | ||
2760 | * are locked or under IO and drops the related state bits if it is safe | ||
2761 | * to drop the page. | ||
2762 | */ | ||
2763 | int try_release_extent_state(struct extent_map_tree *map, | ||
2764 | struct extent_io_tree *tree, struct page *page, | ||
2765 | gfp_t mask) | ||
2766 | { | ||
2767 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
2768 | u64 end = start + PAGE_CACHE_SIZE - 1; | ||
2769 | int ret = 1; | ||
2770 | |||
2771 | if (test_range_bit(tree, start, end, | ||
2772 | EXTENT_IOBITS | EXTENT_ORDERED, 0)) | ||
2773 | ret = 0; | ||
2774 | else { | ||
2775 | if ((mask & GFP_NOFS) == GFP_NOFS) | ||
2776 | mask = GFP_NOFS; | ||
2777 | clear_extent_bit(tree, start, end, EXTENT_UPTODATE, | ||
2778 | 1, 1, mask); | ||
2779 | } | ||
2780 | return ret; | ||
2781 | } | ||
2782 | |||
2783 | /* | ||
2784 | * a helper for releasepage. As long as there are no locked extents | ||
2785 | * in the range corresponding to the page, both state records and extent | ||
2786 | * map records are removed | ||
2787 | */ | ||
2788 | int try_release_extent_mapping(struct extent_map_tree *map, | ||
2789 | struct extent_io_tree *tree, struct page *page, | ||
2790 | gfp_t mask) | ||
2791 | { | ||
2792 | struct extent_map *em; | ||
2793 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
2794 | u64 end = start + PAGE_CACHE_SIZE - 1; | ||
2795 | |||
2796 | if ((mask & __GFP_WAIT) && | ||
2797 | page->mapping->host->i_size > 16 * 1024 * 1024) { | ||
2798 | u64 len; | ||
2799 | while (start <= end) { | ||
2800 | len = end - start + 1; | ||
2801 | spin_lock(&map->lock); | ||
2802 | em = lookup_extent_mapping(map, start, len); | ||
2803 | if (!em || IS_ERR(em)) { | ||
2804 | spin_unlock(&map->lock); | ||
2805 | break; | ||
2806 | } | ||
2807 | if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || | ||
2808 | em->start != start) { | ||
2809 | spin_unlock(&map->lock); | ||
2810 | free_extent_map(em); | ||
2811 | break; | ||
2812 | } | ||
2813 | if (!test_range_bit(tree, em->start, | ||
2814 | extent_map_end(em) - 1, | ||
2815 | EXTENT_LOCKED | EXTENT_WRITEBACK | | ||
2816 | EXTENT_ORDERED, | ||
2817 | 0)) { | ||
2818 | remove_extent_mapping(map, em); | ||
2819 | /* once for the rb tree */ | ||
2820 | free_extent_map(em); | ||
2821 | } | ||
2822 | start = extent_map_end(em); | ||
2823 | spin_unlock(&map->lock); | ||
2824 | |||
2825 | /* once for us */ | ||
2826 | free_extent_map(em); | ||
2827 | } | ||
2828 | } | ||
2829 | return try_release_extent_state(map, tree, page, mask); | ||
2830 | } | ||
2831 | |||
2832 | sector_t extent_bmap(struct address_space *mapping, sector_t iblock, | ||
2833 | get_extent_t *get_extent) | ||
2834 | { | ||
2835 | struct inode *inode = mapping->host; | ||
2836 | u64 start = iblock << inode->i_blkbits; | ||
2837 | sector_t sector = 0; | ||
2838 | size_t blksize = (1 << inode->i_blkbits); | ||
2839 | struct extent_map *em; | ||
2840 | |||
2841 | lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, | ||
2842 | GFP_NOFS); | ||
2843 | em = get_extent(inode, NULL, 0, start, blksize, 0); | ||
2844 | unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, | ||
2845 | GFP_NOFS); | ||
2846 | if (!em || IS_ERR(em)) | ||
2847 | return 0; | ||
2848 | |||
2849 | if (em->block_start > EXTENT_MAP_LAST_BYTE) | ||
2850 | goto out; | ||
2851 | |||
2852 | sector = (em->block_start + start - em->start) >> inode->i_blkbits; | ||
2853 | out: | ||
2854 | free_extent_map(em); | ||
2855 | return sector; | ||
2856 | } | ||
2857 | |||
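extent_bmap backs the legacy ->bmap address_space operation (reached from userspace through the FIBMAP ioctl): it maps a single logical block and returns 0 both for holes and for special block_start values past EXTENT_MAP_LAST_BYTE. A hedged wrapper sketch:

static sector_t btrfs_bmap(struct address_space *mapping, sector_t block)
{
	return extent_bmap(mapping, block, btrfs_get_extent);
}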
2858 | static inline struct page *extent_buffer_page(struct extent_buffer *eb, | ||
2859 | unsigned long i) | ||
2860 | { | ||
2861 | struct page *p; | ||
2862 | struct address_space *mapping; | ||
2863 | |||
2864 | if (i == 0) | ||
2865 | return eb->first_page; | ||
2866 | i += eb->start >> PAGE_CACHE_SHIFT; | ||
2867 | mapping = eb->first_page->mapping; | ||
2868 | if (!mapping) | ||
2869 | return NULL; | ||
2870 | |||
2871 | /* | ||
2872 | * extent_buffer_page is only called after pinning the page | ||
2873 | * by increasing the reference count. So we know the page must | ||
2874 | * be in the radix tree. | ||
2875 | */ | ||
2876 | rcu_read_lock(); | ||
2877 | p = radix_tree_lookup(&mapping->page_tree, i); | ||
2878 | rcu_read_unlock(); | ||
2879 | |||
2880 | return p; | ||
2881 | } | ||
2882 | |||
2883 | static inline unsigned long num_extent_pages(u64 start, u64 len) | ||
2884 | { | ||
2885 | return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - | ||
2886 | (start >> PAGE_CACHE_SHIFT); | ||
2887 | } | ||
2888 | |||
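num_extent_pages counts every page the byte range [start, start + len) touches, including partial pages at either end. A self-contained check of the arithmetic, assuming 4K pages for the example:

#include <assert.h>

#define PG_SHIFT 12
#define PG_SIZE  (1ULL << PG_SHIFT)

static unsigned long num_extent_pages(unsigned long long start,
				      unsigned long long len)
{
	return ((start + len + PG_SIZE - 1) >> PG_SHIFT) -
	       (start >> PG_SHIFT);
}

int main(void)
{
	/* an 8K buffer starting 2K into page 1 touches pages 1, 2 and 3 */
	assert(num_extent_pages(6144, 8192) == 3);
	/* a page-aligned 16K buffer needs exactly 4 pages */
	assert(num_extent_pages(0, 16384) == 4);
	return 0;
}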
2889 | static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, | ||
2890 | u64 start, | ||
2891 | unsigned long len, | ||
2892 | gfp_t mask) | ||
2893 | { | ||
2894 | struct extent_buffer *eb = NULL; | ||
2895 | #ifdef LEAK_DEBUG | ||
2896 | unsigned long flags; | ||
2897 | #endif | ||
2898 | |||
2899 | eb = kmem_cache_zalloc(extent_buffer_cache, mask); | ||
2900 | eb->start = start; | ||
2901 | eb->len = len; | ||
2902 | mutex_init(&eb->mutex); | ||
2903 | #ifdef LEAK_DEBUG | ||
2904 | spin_lock_irqsave(&leak_lock, flags); | ||
2905 | list_add(&eb->leak_list, &buffers); | ||
2906 | spin_unlock_irqrestore(&leak_lock, flags); | ||
2907 | #endif | ||
2908 | atomic_set(&eb->refs, 1); | ||
2909 | |||
2910 | return eb; | ||
2911 | } | ||
2912 | |||
2913 | static void __free_extent_buffer(struct extent_buffer *eb) | ||
2914 | { | ||
2915 | #ifdef LEAK_DEBUG | ||
2916 | unsigned long flags; | ||
2917 | spin_lock_irqsave(&leak_lock, flags); | ||
2918 | list_del(&eb->leak_list); | ||
2919 | spin_unlock_irqrestore(&leak_lock, flags); | ||
2920 | #endif | ||
2921 | kmem_cache_free(extent_buffer_cache, eb); | ||
2922 | } | ||
2923 | |||
2924 | struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, | ||
2925 | u64 start, unsigned long len, | ||
2926 | struct page *page0, | ||
2927 | gfp_t mask) | ||
2928 | { | ||
2929 | unsigned long num_pages = num_extent_pages(start, len); | ||
2930 | unsigned long i; | ||
2931 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
2932 | struct extent_buffer *eb; | ||
2933 | struct extent_buffer *exists = NULL; | ||
2934 | struct page *p; | ||
2935 | struct address_space *mapping = tree->mapping; | ||
2936 | int uptodate = 1; | ||
2937 | |||
2938 | spin_lock(&tree->buffer_lock); | ||
2939 | eb = buffer_search(tree, start); | ||
2940 | if (eb) { | ||
2941 | atomic_inc(&eb->refs); | ||
2942 | spin_unlock(&tree->buffer_lock); | ||
2943 | mark_page_accessed(eb->first_page); | ||
2944 | return eb; | ||
2945 | } | ||
2946 | spin_unlock(&tree->buffer_lock); | ||
2947 | |||
2948 | eb = __alloc_extent_buffer(tree, start, len, mask); | ||
2949 | if (!eb) | ||
2950 | return NULL; | ||
2951 | |||
2952 | if (page0) { | ||
2953 | eb->first_page = page0; | ||
2954 | i = 1; | ||
2955 | index++; | ||
2956 | page_cache_get(page0); | ||
2957 | mark_page_accessed(page0); | ||
2958 | set_page_extent_mapped(page0); | ||
2959 | set_page_extent_head(page0, len); | ||
2960 | uptodate = PageUptodate(page0); | ||
2961 | } else { | ||
2962 | i = 0; | ||
2963 | } | ||
2964 | for (; i < num_pages; i++, index++) { | ||
2965 | p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); | ||
2966 | if (!p) { | ||
2967 | WARN_ON(1); | ||
2968 | goto free_eb; | ||
2969 | } | ||
2970 | set_page_extent_mapped(p); | ||
2971 | mark_page_accessed(p); | ||
2972 | if (i == 0) { | ||
2973 | eb->first_page = p; | ||
2974 | set_page_extent_head(p, len); | ||
2975 | } else { | ||
2976 | set_page_private(p, EXTENT_PAGE_PRIVATE); | ||
2977 | } | ||
2978 | if (!PageUptodate(p)) | ||
2979 | uptodate = 0; | ||
2980 | unlock_page(p); | ||
2981 | } | ||
2982 | if (uptodate) | ||
2983 | eb->flags |= EXTENT_UPTODATE; | ||
2984 | eb->flags |= EXTENT_BUFFER_FILLED; | ||
2985 | |||
2986 | spin_lock(&tree->buffer_lock); | ||
2987 | exists = buffer_tree_insert(tree, start, &eb->rb_node); | ||
2988 | if (exists) { | ||
2989 | /* add one reference for the caller */ | ||
2990 | atomic_inc(&exists->refs); | ||
2991 | spin_unlock(&tree->buffer_lock); | ||
2992 | goto free_eb; | ||
2993 | } | ||
2994 | spin_unlock(&tree->buffer_lock); | ||
2995 | |||
2996 | /* add one reference for the tree */ | ||
2997 | atomic_inc(&eb->refs); | ||
2998 | return eb; | ||
2999 | |||
3000 | free_eb: | ||
3001 | if (!atomic_dec_and_test(&eb->refs)) | ||
3002 | return exists; | ||
3003 | for (index = 1; index < i; index++) | ||
3004 | page_cache_release(extent_buffer_page(eb, index)); | ||
3005 | page_cache_release(extent_buffer_page(eb, 0)); | ||
3006 | __free_extent_buffer(eb); | ||
3007 | return exists; | ||
3008 | } | ||
3009 | |||
3010 | struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, | ||
3011 | u64 start, unsigned long len, | ||
3012 | gfp_t mask) | ||
3013 | { | ||
3014 | struct extent_buffer *eb; | ||
3015 | |||
3016 | spin_lock(&tree->buffer_lock); | ||
3017 | eb = buffer_search(tree, start); | ||
3018 | if (eb) | ||
3019 | atomic_inc(&eb->refs); | ||
3020 | spin_unlock(&tree->buffer_lock); | ||
3021 | |||
3022 | if (eb) | ||
3023 | mark_page_accessed(eb->first_page); | ||
3024 | |||
3025 | return eb; | ||
3026 | } | ||
3027 | |||
3028 | void free_extent_buffer(struct extent_buffer *eb) | ||
3029 | { | ||
3030 | if (!eb) | ||
3031 | return; | ||
3032 | |||
3033 | if (!atomic_dec_and_test(&eb->refs)) | ||
3034 | return; | ||
3035 | |||
3036 | WARN_ON(1); | ||
3037 | } | ||
3038 | |||
3039 | int clear_extent_buffer_dirty(struct extent_io_tree *tree, | ||
3040 | struct extent_buffer *eb) | ||
3041 | { | ||
3042 | int set; | ||
3043 | unsigned long i; | ||
3044 | unsigned long num_pages; | ||
3045 | struct page *page; | ||
3046 | |||
3047 | u64 start = eb->start; | ||
3048 | u64 end = start + eb->len - 1; | ||
3049 | |||
3050 | set = clear_extent_dirty(tree, start, end, GFP_NOFS); | ||
3051 | num_pages = num_extent_pages(eb->start, eb->len); | ||
3052 | |||
3053 | for (i = 0; i < num_pages; i++) { | ||
3054 | page = extent_buffer_page(eb, i); | ||
3055 | if (!set && !PageDirty(page)) | ||
3056 | continue; | ||
3057 | |||
3058 | lock_page(page); | ||
3059 | if (i == 0) | ||
3060 | set_page_extent_head(page, eb->len); | ||
3061 | else | ||
3062 | set_page_private(page, EXTENT_PAGE_PRIVATE); | ||
3063 | |||
3064 | /* | ||
3065 | * if we're on the last page or the first page and the | ||
3066 | * block isn't aligned on a page boundary, do extra checks | ||
3067 | * to make sure we don't clean a page that is partially dirty | ||
3068 | */ | ||
3069 | if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || | ||
3070 | ((i == num_pages - 1) && | ||
3071 | ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { | ||
3072 | start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
3073 | end = start + PAGE_CACHE_SIZE - 1; | ||
3074 | if (test_range_bit(tree, start, end, | ||
3075 | EXTENT_DIRTY, 0)) { | ||
3076 | unlock_page(page); | ||
3077 | continue; | ||
3078 | } | ||
3079 | } | ||
3080 | clear_page_dirty_for_io(page); | ||
3081 | spin_lock_irq(&page->mapping->tree_lock); | ||
3082 | if (!PageDirty(page)) { | ||
3083 | radix_tree_tag_clear(&page->mapping->page_tree, | ||
3084 | page_index(page), | ||
3085 | PAGECACHE_TAG_DIRTY); | ||
3086 | } | ||
3087 | spin_unlock_irq(&page->mapping->tree_lock); | ||
3088 | unlock_page(page); | ||
3089 | } | ||
3090 | return 0; | ||
3091 | } | ||
3092 | |||
3093 | int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, | ||
3094 | struct extent_buffer *eb) | ||
3095 | { | ||
3096 | return wait_on_extent_writeback(tree, eb->start, | ||
3097 | eb->start + eb->len - 1); | ||
3098 | } | ||
3099 | |||
3100 | int set_extent_buffer_dirty(struct extent_io_tree *tree, | ||
3101 | struct extent_buffer *eb) | ||
3102 | { | ||
3103 | unsigned long i; | ||
3104 | unsigned long num_pages; | ||
3105 | |||
3106 | num_pages = num_extent_pages(eb->start, eb->len); | ||
3107 | for (i = 0; i < num_pages; i++) { | ||
3108 | struct page *page = extent_buffer_page(eb, i); | ||
3109 | /* writepage may need to do something special for the | ||
3110 | * first page, so we have to make sure page->private is | ||
3111 | * properly set. releasepage may drop page->private | ||
3112 | * on us if the page isn't already dirty. | ||
3113 | */ | ||
3114 | lock_page(page); | ||
3115 | if (i == 0) { | ||
3116 | set_page_extent_head(page, eb->len); | ||
3117 | } else if (PagePrivate(page) && | ||
3118 | page->private != EXTENT_PAGE_PRIVATE) { | ||
3119 | set_page_extent_mapped(page); | ||
3120 | } | ||
3121 | __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); | ||
3122 | set_extent_dirty(tree, page_offset(page), | ||
3123 | page_offset(page) + PAGE_CACHE_SIZE - 1, | ||
3124 | GFP_NOFS); | ||
3125 | unlock_page(page); | ||
3126 | } | ||
3127 | return 0; | ||
3128 | } | ||
3129 | |||
3130 | int clear_extent_buffer_uptodate(struct extent_io_tree *tree, | ||
3131 | struct extent_buffer *eb) | ||
3132 | { | ||
3133 | unsigned long i; | ||
3134 | struct page *page; | ||
3135 | unsigned long num_pages; | ||
3136 | |||
3137 | num_pages = num_extent_pages(eb->start, eb->len); | ||
3138 | eb->flags &= ~EXTENT_UPTODATE; | ||
3139 | |||
3140 | clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, | ||
3141 | GFP_NOFS); | ||
3142 | for (i = 0; i < num_pages; i++) { | ||
3143 | page = extent_buffer_page(eb, i); | ||
3144 | if (page) | ||
3145 | ClearPageUptodate(page); | ||
3146 | } | ||
3147 | return 0; | ||
3148 | } | ||
3149 | |||
3150 | int set_extent_buffer_uptodate(struct extent_io_tree *tree, | ||
3151 | struct extent_buffer *eb) | ||
3152 | { | ||
3153 | unsigned long i; | ||
3154 | struct page *page; | ||
3155 | unsigned long num_pages; | ||
3156 | |||
3157 | num_pages = num_extent_pages(eb->start, eb->len); | ||
3158 | |||
3159 | set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, | ||
3160 | GFP_NOFS); | ||
3161 | for (i = 0; i < num_pages; i++) { | ||
3162 | page = extent_buffer_page(eb, i); | ||
3163 | if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || | ||
3164 | ((i == num_pages - 1) && | ||
3165 | ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { | ||
3166 | check_page_uptodate(tree, page); | ||
3167 | continue; | ||
3168 | } | ||
3169 | SetPageUptodate(page); | ||
3170 | } | ||
3171 | return 0; | ||
3172 | } | ||
3173 | |||
3174 | int extent_range_uptodate(struct extent_io_tree *tree, | ||
3175 | u64 start, u64 end) | ||
3176 | { | ||
3177 | struct page *page; | ||
3178 | int ret; | ||
3179 | int pg_uptodate = 1; | ||
3180 | int uptodate; | ||
3181 | unsigned long index; | ||
3182 | |||
3183 | ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); | ||
3184 | if (ret) | ||
3185 | return 1; | ||
3186 | while (start <= end) { | ||
3187 | index = start >> PAGE_CACHE_SHIFT; | ||
3188 | page = find_get_page(tree->mapping, index); | ||
3189 | uptodate = PageUptodate(page); | ||
3190 | page_cache_release(page); | ||
3191 | if (!uptodate) { | ||
3192 | pg_uptodate = 0; | ||
3193 | break; | ||
3194 | } | ||
3195 | start += PAGE_CACHE_SIZE; | ||
3196 | } | ||
3197 | return pg_uptodate; | ||
3198 | } | ||
3199 | |||
3200 | int extent_buffer_uptodate(struct extent_io_tree *tree, | ||
3201 | struct extent_buffer *eb) | ||
3202 | { | ||
3203 | int ret = 0; | ||
3204 | unsigned long num_pages; | ||
3205 | unsigned long i; | ||
3206 | struct page *page; | ||
3207 | int pg_uptodate = 1; | ||
3208 | |||
3209 | if (eb->flags & EXTENT_UPTODATE) | ||
3210 | return 1; | ||
3211 | |||
3212 | ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, | ||
3213 | EXTENT_UPTODATE, 1); | ||
3214 | if (ret) | ||
3215 | return ret; | ||
3216 | |||
3217 | num_pages = num_extent_pages(eb->start, eb->len); | ||
3218 | for (i = 0; i < num_pages; i++) { | ||
3219 | page = extent_buffer_page(eb, i); | ||
3220 | if (!PageUptodate(page)) { | ||
3221 | pg_uptodate = 0; | ||
3222 | break; | ||
3223 | } | ||
3224 | } | ||
3225 | return pg_uptodate; | ||
3226 | } | ||
3227 | |||
3228 | int read_extent_buffer_pages(struct extent_io_tree *tree, | ||
3229 | struct extent_buffer *eb, | ||
3230 | u64 start, int wait, | ||
3231 | get_extent_t *get_extent, int mirror_num) | ||
3232 | { | ||
3233 | unsigned long i; | ||
3234 | unsigned long start_i; | ||
3235 | struct page *page; | ||
3236 | int err; | ||
3237 | int ret = 0; | ||
3238 | int locked_pages = 0; | ||
3239 | int all_uptodate = 1; | ||
3240 | int inc_all_pages = 0; | ||
3241 | unsigned long num_pages; | ||
3242 | struct bio *bio = NULL; | ||
3243 | unsigned long bio_flags = 0; | ||
3244 | |||
3245 | if (eb->flags & EXTENT_UPTODATE) | ||
3246 | return 0; | ||
3247 | |||
3248 | if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, | ||
3249 | EXTENT_UPTODATE, 1)) { | ||
3250 | return 0; | ||
3251 | } | ||
3252 | |||
3253 | if (start) { | ||
3254 | WARN_ON(start < eb->start); | ||
3255 | start_i = (start >> PAGE_CACHE_SHIFT) - | ||
3256 | (eb->start >> PAGE_CACHE_SHIFT); | ||
3257 | } else { | ||
3258 | start_i = 0; | ||
3259 | } | ||
3260 | |||
3261 | num_pages = num_extent_pages(eb->start, eb->len); | ||
3262 | for (i = start_i; i < num_pages; i++) { | ||
3263 | page = extent_buffer_page(eb, i); | ||
3264 | if (!wait) { | ||
3265 | if (!trylock_page(page)) | ||
3266 | goto unlock_exit; | ||
3267 | } else { | ||
3268 | lock_page(page); | ||
3269 | } | ||
3270 | locked_pages++; | ||
3271 | if (!PageUptodate(page)) | ||
3272 | all_uptodate = 0; | ||
3273 | } | ||
3274 | if (all_uptodate) { | ||
3275 | if (start_i == 0) | ||
3276 | eb->flags |= EXTENT_UPTODATE; | ||
3277 | goto unlock_exit; | ||
3278 | } | ||
3279 | |||
3280 | for (i = start_i; i < num_pages; i++) { | ||
3281 | page = extent_buffer_page(eb, i); | ||
3282 | if (inc_all_pages) | ||
3283 | page_cache_get(page); | ||
3284 | if (!PageUptodate(page)) { | ||
3285 | if (start_i == 0) | ||
3286 | inc_all_pages = 1; | ||
3287 | ClearPageError(page); | ||
3288 | err = __extent_read_full_page(tree, page, | ||
3289 | get_extent, &bio, | ||
3290 | mirror_num, &bio_flags); | ||
3291 | if (err) | ||
3292 | ret = err; | ||
3293 | } else { | ||
3294 | unlock_page(page); | ||
3295 | } | ||
3296 | } | ||
3297 | |||
3298 | if (bio) | ||
3299 | submit_one_bio(READ, bio, mirror_num, bio_flags); | ||
3300 | |||
3301 | if (ret || !wait) | ||
3302 | return ret; | ||
3303 | |||
3304 | for (i = start_i; i < num_pages; i++) { | ||
3305 | page = extent_buffer_page(eb, i); | ||
3306 | wait_on_page_locked(page); | ||
3307 | if (!PageUptodate(page)) | ||
3308 | ret = -EIO; | ||
3309 | } | ||
3310 | |||
3311 | if (!ret) | ||
3312 | eb->flags |= EXTENT_UPTODATE; | ||
3313 | return ret; | ||
3314 | |||
3315 | unlock_exit: | ||
3316 | i = start_i; | ||
3317 | while (locked_pages > 0) { | ||
3318 | page = extent_buffer_page(eb, i); | ||
3319 | i++; | ||
3320 | unlock_page(page); | ||
3321 | locked_pages--; | ||
3322 | } | ||
3323 | return ret; | ||
3324 | } | ||
3325 | |||
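A typical caller reads a whole tree block synchronously: pass start == 0 to cover the full buffer, wait == 1 to block until the pages unlock, and the metadata get_extent callback. A hedged usage sketch (the btree_get_extent callback name is illustrative of what a caller would supply):

static struct extent_buffer *
read_tree_block_sketch(struct extent_io_tree *tree,
		       struct extent_buffer *eb, int mirror_num)
{
	int err;

	/* start == 0 covers the whole buffer; wait == 1 blocks for the IO */
	err = read_extent_buffer_pages(tree, eb, 0, 1,
				       btree_get_extent, mirror_num);
	if (err) {
		/* a page failed to read; the buffer contents are not valid */
		free_extent_buffer(eb);
		return NULL;
	}
	return eb;
}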
3326 | void read_extent_buffer(struct extent_buffer *eb, void *dstv, | ||
3327 | unsigned long start, | ||
3328 | unsigned long len) | ||
3329 | { | ||
3330 | size_t cur; | ||
3331 | size_t offset; | ||
3332 | struct page *page; | ||
3333 | char *kaddr; | ||
3334 | char *dst = (char *)dstv; | ||
3335 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3336 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | ||
3337 | |||
3338 | WARN_ON(start > eb->len); | ||
3339 | WARN_ON(start + len > eb->start + eb->len); | ||
3340 | |||
3341 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3342 | |||
3343 | while (len > 0) { | ||
3344 | page = extent_buffer_page(eb, i); | ||
3345 | |||
3346 | cur = min(len, (PAGE_CACHE_SIZE - offset)); | ||
3347 | kaddr = kmap_atomic(page, KM_USER1); | ||
3348 | memcpy(dst, kaddr + offset, cur); | ||
3349 | kunmap_atomic(kaddr, KM_USER1); | ||
3350 | |||
3351 | dst += cur; | ||
3352 | len -= cur; | ||
3353 | offset = 0; | ||
3354 | i++; | ||
3355 | } | ||
3356 | } | ||
3357 | |||
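read_extent_buffer copies an arbitrary byte range out of a possibly multi-page buffer, so typed metadata accessors can be layered on top of it. A hedged sketch reading a little-endian u64 field at a byte offset:

static u64 read_eb_u64(struct extent_buffer *eb, unsigned long offset)
{
	__le64 raw;

	/* copies across page boundaries if the field straddles one */
	read_extent_buffer(eb, &raw, offset, sizeof(raw));
	return le64_to_cpu(raw);
}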
3358 | int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, | ||
3359 | unsigned long min_len, char **token, char **map, | ||
3360 | unsigned long *map_start, | ||
3361 | unsigned long *map_len, int km) | ||
3362 | { | ||
3363 | size_t offset = start & (PAGE_CACHE_SIZE - 1); | ||
3364 | char *kaddr; | ||
3365 | struct page *p; | ||
3366 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3367 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | ||
3368 | unsigned long end_i = (start_offset + start + min_len - 1) >> | ||
3369 | PAGE_CACHE_SHIFT; | ||
3370 | |||
3371 | if (i != end_i) | ||
3372 | return -EINVAL; | ||
3373 | |||
3374 | if (i == 0) { | ||
3375 | offset = start_offset; | ||
3376 | *map_start = 0; | ||
3377 | } else { | ||
3378 | offset = 0; | ||
3379 | *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; | ||
3380 | } | ||
3381 | |||
3382 | if (start + min_len > eb->len) { | ||
3383 | printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " | ||
3384 | "wanted %lu %lu\n", (unsigned long long)eb->start, | ||
3385 | eb->len, start, min_len); | ||
3386 | WARN_ON(1); | ||
3387 | } | ||
3388 | |||
3389 | p = extent_buffer_page(eb, i); | ||
3390 | kaddr = kmap_atomic(p, km); | ||
3391 | *token = kaddr; | ||
3392 | *map = kaddr + offset; | ||
3393 | *map_len = PAGE_CACHE_SIZE - offset; | ||
3394 | return 0; | ||
3395 | } | ||
3396 | |||
3397 | int map_extent_buffer(struct extent_buffer *eb, unsigned long start, | ||
3398 | unsigned long min_len, | ||
3399 | char **token, char **map, | ||
3400 | unsigned long *map_start, | ||
3401 | unsigned long *map_len, int km) | ||
3402 | { | ||
3403 | int err; | ||
3404 | int save = 0; | ||
3405 | if (eb->map_token) { | ||
3406 | unmap_extent_buffer(eb, eb->map_token, km); | ||
3407 | eb->map_token = NULL; | ||
3408 | save = 1; | ||
3409 | WARN_ON(!mutex_is_locked(&eb->mutex)); | ||
3410 | } | ||
3411 | err = map_private_extent_buffer(eb, start, min_len, token, map, | ||
3412 | map_start, map_len, km); | ||
3413 | if (!err && save) { | ||
3414 | eb->map_token = *token; | ||
3415 | eb->kaddr = *map; | ||
3416 | eb->map_start = *map_start; | ||
3417 | eb->map_len = *map_len; | ||
3418 | } | ||
3419 | return err; | ||
3420 | } | ||
3421 | |||
3422 | void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) | ||
3423 | { | ||
3424 | kunmap_atomic(token, km); | ||
3425 | } | ||
3426 | |||
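map_extent_buffer gives direct kmap_atomic access, but only within a single page: map_private_extent_buffer returns -EINVAL if the requested range straddles a page boundary, and the mapping must be released with unmap_extent_buffer before sleeping. A hedged usage sketch reading a 32-bit field in place:

static int peek_eb_u32(struct extent_buffer *eb, unsigned long offset,
		       u32 *out)
{
	char *token;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;

	err = map_extent_buffer(eb, offset, sizeof(u32), &token, &kaddr,
				&map_start, &map_len, KM_USER0);
	if (err)
		return err;

	/* kaddr points at 'offset' inside the buffer while mapped */
	*out = le32_to_cpu(*(__le32 *)kaddr);
	unmap_extent_buffer(eb, token, KM_USER0);
	return 0;
}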
3427 | int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, | ||
3428 | unsigned long start, | ||
3429 | unsigned long len) | ||
3430 | { | ||
3431 | size_t cur; | ||
3432 | size_t offset; | ||
3433 | struct page *page; | ||
3434 | char *kaddr; | ||
3435 | char *ptr = (char *)ptrv; | ||
3436 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3437 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | ||
3438 | int ret = 0; | ||
3439 | |||
3440 | WARN_ON(start > eb->len); | ||
3441 | WARN_ON(start + len > eb->start + eb->len); | ||
3442 | |||
3443 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3444 | |||
3445 | while (len > 0) { | ||
3446 | page = extent_buffer_page(eb, i); | ||
3447 | |||
3448 | cur = min(len, (PAGE_CACHE_SIZE - offset)); | ||
3449 | |||
3450 | kaddr = kmap_atomic(page, KM_USER0); | ||
3451 | ret = memcmp(ptr, kaddr + offset, cur); | ||
3452 | kunmap_atomic(kaddr, KM_USER0); | ||
3453 | if (ret) | ||
3454 | break; | ||
3455 | |||
3456 | ptr += cur; | ||
3457 | len -= cur; | ||
3458 | offset = 0; | ||
3459 | i++; | ||
3460 | } | ||
3461 | return ret; | ||
3462 | } | ||
3463 | |||
3464 | void write_extent_buffer(struct extent_buffer *eb, const void *srcv, | ||
3465 | unsigned long start, unsigned long len) | ||
3466 | { | ||
3467 | size_t cur; | ||
3468 | size_t offset; | ||
3469 | struct page *page; | ||
3470 | char *kaddr; | ||
3471 | char *src = (char *)srcv; | ||
3472 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3473 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | ||
3474 | |||
3475 | WARN_ON(start > eb->len); | ||
3476 | WARN_ON(start + len > eb->start + eb->len); | ||
3477 | |||
3478 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3479 | |||
3480 | while (len > 0) { | ||
3481 | page = extent_buffer_page(eb, i); | ||
3482 | WARN_ON(!PageUptodate(page)); | ||
3483 | |||
3484 | cur = min(len, PAGE_CACHE_SIZE - offset); | ||
3485 | kaddr = kmap_atomic(page, KM_USER1); | ||
3486 | memcpy(kaddr + offset, src, cur); | ||
3487 | kunmap_atomic(kaddr, KM_USER1); | ||
3488 | |||
3489 | src += cur; | ||
3490 | len -= cur; | ||
3491 | offset = 0; | ||
3492 | i++; | ||
3493 | } | ||
3494 | } | ||
3495 | |||
3496 | void memset_extent_buffer(struct extent_buffer *eb, char c, | ||
3497 | unsigned long start, unsigned long len) | ||
3498 | { | ||
3499 | size_t cur; | ||
3500 | size_t offset; | ||
3501 | struct page *page; | ||
3502 | char *kaddr; | ||
3503 | size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3504 | unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; | ||
3505 | |||
3506 | WARN_ON(start > eb->len); | ||
3507 | WARN_ON(start + len > eb->start + eb->len); | ||
3508 | |||
3509 | offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3510 | |||
3511 | while (len > 0) { | ||
3512 | page = extent_buffer_page(eb, i); | ||
3513 | WARN_ON(!PageUptodate(page)); | ||
3514 | |||
3515 | cur = min(len, PAGE_CACHE_SIZE - offset); | ||
3516 | kaddr = kmap_atomic(page, KM_USER0); | ||
3517 | memset(kaddr + offset, c, cur); | ||
3518 | kunmap_atomic(kaddr, KM_USER0); | ||
3519 | |||
3520 | len -= cur; | ||
3521 | offset = 0; | ||
3522 | i++; | ||
3523 | } | ||
3524 | } | ||
3525 | |||
3526 | void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, | ||
3527 | unsigned long dst_offset, unsigned long src_offset, | ||
3528 | unsigned long len) | ||
3529 | { | ||
3530 | u64 dst_len = dst->len; | ||
3531 | size_t cur; | ||
3532 | size_t offset; | ||
3533 | struct page *page; | ||
3534 | char *kaddr; | ||
3535 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3536 | unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; | ||
3537 | |||
3538 | WARN_ON(src->len != dst_len); | ||
3539 | |||
3540 | offset = (start_offset + dst_offset) & | ||
3541 | ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3542 | |||
3543 | while (len > 0) { | ||
3544 | page = extent_buffer_page(dst, i); | ||
3545 | WARN_ON(!PageUptodate(page)); | ||
3546 | |||
3547 | cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); | ||
3548 | |||
3549 | kaddr = kmap_atomic(page, KM_USER0); | ||
3550 | read_extent_buffer(src, kaddr + offset, src_offset, cur); | ||
3551 | kunmap_atomic(kaddr, KM_USER0); | ||
3552 | |||
3553 | src_offset += cur; | ||
3554 | len -= cur; | ||
3555 | offset = 0; | ||
3556 | i++; | ||
3557 | } | ||
3558 | } | ||
3559 | |||
3560 | static void move_pages(struct page *dst_page, struct page *src_page, | ||
3561 | unsigned long dst_off, unsigned long src_off, | ||
3562 | unsigned long len) | ||
3563 | { | ||
3564 | char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); | ||
3565 | if (dst_page == src_page) { | ||
3566 | memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); | ||
3567 | } else { | ||
3568 | char *src_kaddr = kmap_atomic(src_page, KM_USER1); | ||
3569 | char *p = dst_kaddr + dst_off + len; | ||
3570 | char *s = src_kaddr + src_off + len; | ||
3571 | |||
3572 | while (len--) | ||
3573 | *--p = *--s; | ||
3574 | |||
3575 | kunmap_atomic(src_kaddr, KM_USER1); | ||
3576 | } | ||
3577 | kunmap_atomic(dst_kaddr, KM_USER0); | ||
3578 | } | ||
3579 | |||
3580 | static void copy_pages(struct page *dst_page, struct page *src_page, | ||
3581 | unsigned long dst_off, unsigned long src_off, | ||
3582 | unsigned long len) | ||
3583 | { | ||
3584 | char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); | ||
3585 | char *src_kaddr; | ||
3586 | |||
3587 | if (dst_page != src_page) | ||
3588 | src_kaddr = kmap_atomic(src_page, KM_USER1); | ||
3589 | else | ||
3590 | src_kaddr = dst_kaddr; | ||
3591 | |||
3592 | memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); | ||
3593 | kunmap_atomic(dst_kaddr, KM_USER0); | ||
3594 | if (dst_page != src_page) | ||
3595 | kunmap_atomic(src_kaddr, KM_USER1); | ||
3596 | } | ||
3597 | |||
3598 | void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, | ||
3599 | unsigned long src_offset, unsigned long len) | ||
3600 | { | ||
3601 | size_t cur; | ||
3602 | size_t dst_off_in_page; | ||
3603 | size_t src_off_in_page; | ||
3604 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3605 | unsigned long dst_i; | ||
3606 | unsigned long src_i; | ||
3607 | |||
3608 | if (src_offset + len > dst->len) { | ||
3609 | printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " | ||
3610 | "len %lu dst len %lu\n", src_offset, len, dst->len); | ||
3611 | BUG_ON(1); | ||
3612 | } | ||
3613 | if (dst_offset + len > dst->len) { | ||
3614 | printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " | ||
3615 | "len %lu dst len %lu\n", dst_offset, len, dst->len); | ||
3616 | BUG_ON(1); | ||
3617 | } | ||
3618 | |||
3619 | while (len > 0) { | ||
3620 | dst_off_in_page = (start_offset + dst_offset) & | ||
3621 | ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3622 | src_off_in_page = (start_offset + src_offset) & | ||
3623 | ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3624 | |||
3625 | dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; | ||
3626 | src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; | ||
3627 | |||
3628 | cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - | ||
3629 | src_off_in_page)); | ||
3630 | cur = min_t(unsigned long, cur, | ||
3631 | (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); | ||
3632 | |||
3633 | copy_pages(extent_buffer_page(dst, dst_i), | ||
3634 | extent_buffer_page(dst, src_i), | ||
3635 | dst_off_in_page, src_off_in_page, cur); | ||
3636 | |||
3637 | src_offset += cur; | ||
3638 | dst_offset += cur; | ||
3639 | len -= cur; | ||
3640 | } | ||
3641 | } | ||
3642 | |||
3643 | void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, | ||
3644 | unsigned long src_offset, unsigned long len) | ||
3645 | { | ||
3646 | size_t cur; | ||
3647 | size_t dst_off_in_page; | ||
3648 | size_t src_off_in_page; | ||
3649 | unsigned long dst_end = dst_offset + len - 1; | ||
3650 | unsigned long src_end = src_offset + len - 1; | ||
3651 | size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); | ||
3652 | unsigned long dst_i; | ||
3653 | unsigned long src_i; | ||
3654 | |||
3655 | if (src_offset + len > dst->len) { | ||
3656 | printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " | ||
3657 | "len %lu len %lu\n", src_offset, len, dst->len); | ||
3658 | BUG_ON(1); | ||
3659 | } | ||
3660 | if (dst_offset + len > dst->len) { | ||
3661 | printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " | ||
3662 | "len %lu len %lu\n", dst_offset, len, dst->len); | ||
3663 | BUG_ON(1); | ||
3664 | } | ||
3665 | if (dst_offset < src_offset) { | ||
3666 | memcpy_extent_buffer(dst, dst_offset, src_offset, len); | ||
3667 | return; | ||
3668 | } | ||
3669 | while (len > 0) { | ||
3670 | dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; | ||
3671 | src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; | ||
3672 | |||
3673 | dst_off_in_page = (start_offset + dst_end) & | ||
3674 | ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3675 | src_off_in_page = (start_offset + src_end) & | ||
3676 | ((unsigned long)PAGE_CACHE_SIZE - 1); | ||
3677 | |||
3678 | cur = min_t(unsigned long, len, src_off_in_page + 1); | ||
3679 | cur = min(cur, dst_off_in_page + 1); | ||
3680 | move_pages(extent_buffer_page(dst, dst_i), | ||
3681 | extent_buffer_page(dst, src_i), | ||
3682 | dst_off_in_page - cur + 1, | ||
3683 | src_off_in_page - cur + 1, cur); | ||
3684 | |||
3685 | dst_end -= cur; | ||
3686 | src_end -= cur; | ||
3687 | len -= cur; | ||
3688 | } | ||
3689 | } | ||
3690 | |||
3691 | int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) | ||
3692 | { | ||
3693 | u64 start = page_offset(page); | ||
3694 | struct extent_buffer *eb; | ||
3695 | int ret = 1; | ||
3696 | unsigned long i; | ||
3697 | unsigned long num_pages; | ||
3698 | |||
3699 | spin_lock(&tree->buffer_lock); | ||
3700 | eb = buffer_search(tree, start); | ||
3701 | if (!eb) | ||
3702 | goto out; | ||
3703 | |||
3704 | if (atomic_read(&eb->refs) > 1) { | ||
3705 | ret = 0; | ||
3706 | goto out; | ||
3707 | } | ||
3708 | /* at this point we can safely release the extent buffer */ | ||
3709 | num_pages = num_extent_pages(eb->start, eb->len); | ||
3710 | for (i = 0; i < num_pages; i++) | ||
3711 | page_cache_release(extent_buffer_page(eb, i)); | ||
3712 | rb_erase(&eb->rb_node, &tree->buffer); | ||
3713 | __free_extent_buffer(eb); | ||
3714 | out: | ||
3715 | spin_unlock(&tree->buffer_lock); | ||
3716 | return ret; | ||
3717 | } | ||
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 000000000000..c5b483a79137 --- /dev/null +++ b/fs/btrfs/extent_io.h | |||
@@ -0,0 +1,269 @@ | |||
1 | #ifndef __EXTENTIO__ | ||
2 | #define __EXTENTIO__ | ||
3 | |||
4 | #include <linux/rbtree.h> | ||
5 | |||
6 | /* bits for the extent state */ | ||
7 | #define EXTENT_DIRTY 1 | ||
8 | #define EXTENT_WRITEBACK (1 << 1) | ||
9 | #define EXTENT_UPTODATE (1 << 2) | ||
10 | #define EXTENT_LOCKED (1 << 3) | ||
11 | #define EXTENT_NEW (1 << 4) | ||
12 | #define EXTENT_DELALLOC (1 << 5) | ||
13 | #define EXTENT_DEFRAG (1 << 6) | ||
14 | #define EXTENT_DEFRAG_DONE (1 << 7) | ||
15 | #define EXTENT_BUFFER_FILLED (1 << 8) | ||
16 | #define EXTENT_ORDERED (1 << 9) | ||
17 | #define EXTENT_ORDERED_METADATA (1 << 10) | ||
18 | #define EXTENT_BOUNDARY (1 << 11) | ||
19 | #define EXTENT_NODATASUM (1 << 12) | ||
20 | #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) | ||
21 | |||
22 | /* flags for bio submission */ | ||
23 | #define EXTENT_BIO_COMPRESSED 1 | ||
24 | |||
25 | /* | ||
26 | * page->private values. Every page that is controlled by the extent | ||
27 | * map has page->private set to one. | ||
28 | */ | ||
29 | #define EXTENT_PAGE_PRIVATE 1 | ||
30 | #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 | ||
31 | |||
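As a sketch of this convention (set_page_extent_mapped(), declared further down, is the real entry point; the tagging pattern below is an assumed illustration, not a copy of it):

	/* Illustrative tagging of a page handed to the extent code: take a
	 * page reference and stamp page->private so later callbacks can
	 * recognise pages this code controls. */
	static void example_tag_page(struct page *page)
	{
		if (!PagePrivate(page)) {
			SetPagePrivate(page);
			page_cache_get(page);
			set_page_private(page, EXTENT_PAGE_PRIVATE);
		}
	}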
32 | struct extent_state; | ||
33 | |||
34 | typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, | ||
35 | struct bio *bio, int mirror_num, | ||
36 | unsigned long bio_flags); | ||
37 | struct extent_io_ops { | ||
38 | int (*fill_delalloc)(struct inode *inode, struct page *locked_page, | ||
39 | u64 start, u64 end, int *page_started, | ||
40 | unsigned long *nr_written); | ||
41 | int (*writepage_start_hook)(struct page *page, u64 start, u64 end); | ||
42 | int (*writepage_io_hook)(struct page *page, u64 start, u64 end); | ||
43 | extent_submit_bio_hook_t *submit_bio_hook; | ||
44 | int (*merge_bio_hook)(struct page *page, unsigned long offset, | ||
45 | size_t size, struct bio *bio, | ||
46 | unsigned long bio_flags); | ||
47 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); | ||
48 | int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, | ||
49 | u64 start, u64 end, | ||
50 | struct extent_state *state); | ||
51 | int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, | ||
52 | u64 start, u64 end, | ||
53 | struct extent_state *state); | ||
54 | int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, | ||
55 | struct extent_state *state); | ||
56 | int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, | ||
57 | struct extent_state *state, int uptodate); | ||
58 | int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, | ||
59 | unsigned long old, unsigned long bits); | ||
60 | int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, | ||
61 | unsigned long old, unsigned long bits); | ||
62 | int (*write_cache_pages_lock_hook)(struct page *page); | ||
63 | }; | ||
64 | |||
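The hook table above is how the filesystem proper plugs policy into this otherwise generic I/O code; unset hooks are simply skipped. A minimal, purely illustrative user (the names below are hypothetical stand-ins, not btrfs's real handlers):

	static int example_readpage_end_io(struct page *page, u64 start, u64 end,
					   struct extent_state *state)
	{
		/* e.g. verify checksums for [start, end] here */
		return 0;
	}

	static struct extent_io_ops example_io_ops = {
		.readpage_end_io_hook = example_readpage_end_io,
	};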
65 | struct extent_io_tree { | ||
66 | struct rb_root state; | ||
67 | struct rb_root buffer; | ||
68 | struct address_space *mapping; | ||
69 | u64 dirty_bytes; | ||
70 | spinlock_t lock; | ||
71 | spinlock_t buffer_lock; | ||
72 | struct extent_io_ops *ops; | ||
73 | }; | ||
74 | |||
75 | struct extent_state { | ||
76 | u64 start; | ||
77 | u64 end; /* inclusive */ | ||
78 | struct rb_node rb_node; | ||
79 | struct extent_io_tree *tree; | ||
80 | wait_queue_head_t wq; | ||
81 | atomic_t refs; | ||
82 | unsigned long state; | ||
83 | |||
84 | /* for use by the FS */ | ||
85 | u64 private; | ||
86 | |||
87 | struct list_head leak_list; | ||
88 | }; | ||
89 | |||
90 | struct extent_buffer { | ||
91 | u64 start; | ||
92 | unsigned long len; | ||
93 | char *map_token; | ||
94 | char *kaddr; | ||
95 | unsigned long map_start; | ||
96 | unsigned long map_len; | ||
97 | struct page *first_page; | ||
98 | atomic_t refs; | ||
99 | int flags; | ||
100 | struct list_head leak_list; | ||
101 | struct rb_node rb_node; | ||
102 | struct mutex mutex; | ||
103 | }; | ||
104 | |||
105 | struct extent_map_tree; | ||
106 | |||
107 | static inline struct extent_state *extent_state_next(struct extent_state *state) | ||
108 | { | ||
109 | struct rb_node *node; | ||
110 | node = rb_next(&state->rb_node); | ||
111 | if (!node) | ||
112 | return NULL; | ||
113 | return rb_entry(node, struct extent_state, rb_node); | ||
114 | } | ||
115 | |||
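A hedged sketch of the iteration pattern this helper enables, using find_first_extent_bit_state() declared below; the walk must hold tree->lock since nothing else pins the states:

	static void example_walk_dirty(struct extent_io_tree *tree, u64 start)
	{
		struct extent_state *state;

		spin_lock(&tree->lock);
		state = find_first_extent_bit_state(tree, start, EXTENT_DIRTY);
		while (state) {
			/* [state->start, state->end] is an inclusive byte range */
			state = extent_state_next(state);
		}
		spin_unlock(&tree->lock);
	}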
116 | typedef struct extent_map *(get_extent_t)(struct inode *inode, | ||
117 | struct page *page, | ||
118 | size_t page_offset, | ||
119 | u64 start, u64 len, | ||
120 | int create); | ||
121 | |||
122 | void extent_io_tree_init(struct extent_io_tree *tree, | ||
123 | struct address_space *mapping, gfp_t mask); | ||
124 | int try_release_extent_mapping(struct extent_map_tree *map, | ||
125 | struct extent_io_tree *tree, struct page *page, | ||
126 | gfp_t mask); | ||
127 | int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); | ||
128 | int try_release_extent_state(struct extent_map_tree *map, | ||
129 | struct extent_io_tree *tree, struct page *page, | ||
130 | gfp_t mask); | ||
131 | int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); | ||
132 | int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); | ||
133 | int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, | ||
134 | gfp_t mask); | ||
135 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, | ||
136 | get_extent_t *get_extent); | ||
137 | int __init extent_io_init(void); | ||
138 | void extent_io_exit(void); | ||
139 | |||
140 | u64 count_range_bits(struct extent_io_tree *tree, | ||
141 | u64 *start, u64 search_end, | ||
142 | u64 max_bytes, unsigned long bits); | ||
143 | |||
144 | int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, | ||
145 | int bits, int filled); | ||
146 | int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, | ||
147 | int bits, gfp_t mask); | ||
148 | int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | ||
149 | int bits, int wake, int delete, gfp_t mask); | ||
150 | int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, | ||
151 | int bits, gfp_t mask); | ||
152 | int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, | ||
153 | gfp_t mask); | ||
154 | int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, | ||
155 | gfp_t mask); | ||
156 | int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | ||
157 | gfp_t mask); | ||
158 | int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | ||
159 | gfp_t mask); | ||
160 | int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, | ||
161 | gfp_t mask); | ||
162 | int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, | ||
163 | u64 end, gfp_t mask); | ||
164 | int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, | ||
165 | gfp_t mask); | ||
166 | int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, | ||
167 | gfp_t mask); | ||
168 | int find_first_extent_bit(struct extent_io_tree *tree, u64 start, | ||
169 | u64 *start_ret, u64 *end_ret, int bits); | ||
170 | struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, | ||
171 | u64 start, int bits); | ||
172 | int extent_invalidatepage(struct extent_io_tree *tree, | ||
173 | struct page *page, unsigned long offset); | ||
174 | int extent_write_full_page(struct extent_io_tree *tree, struct page *page, | ||
175 | get_extent_t *get_extent, | ||
176 | struct writeback_control *wbc); | ||
177 | int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, | ||
178 | u64 start, u64 end, get_extent_t *get_extent, | ||
179 | int mode); | ||
180 | int extent_writepages(struct extent_io_tree *tree, | ||
181 | struct address_space *mapping, | ||
182 | get_extent_t *get_extent, | ||
183 | struct writeback_control *wbc); | ||
184 | int extent_readpages(struct extent_io_tree *tree, | ||
185 | struct address_space *mapping, | ||
186 | struct list_head *pages, unsigned nr_pages, | ||
187 | get_extent_t get_extent); | ||
188 | int extent_prepare_write(struct extent_io_tree *tree, | ||
189 | struct inode *inode, struct page *page, | ||
190 | unsigned from, unsigned to, get_extent_t *get_extent); | ||
191 | int extent_commit_write(struct extent_io_tree *tree, | ||
192 | struct inode *inode, struct page *page, | ||
193 | unsigned from, unsigned to); | ||
194 | sector_t extent_bmap(struct address_space *mapping, sector_t iblock, | ||
195 | get_extent_t *get_extent); | ||
196 | int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); | ||
197 | int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); | ||
198 | int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); | ||
199 | void set_page_extent_mapped(struct page *page); | ||
200 | |||
201 | struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, | ||
202 | u64 start, unsigned long len, | ||
203 | struct page *page0, | ||
204 | gfp_t mask); | ||
205 | struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, | ||
206 | u64 start, unsigned long len, | ||
207 | gfp_t mask); | ||
208 | void free_extent_buffer(struct extent_buffer *eb); | ||
209 | int read_extent_buffer_pages(struct extent_io_tree *tree, | ||
210 | struct extent_buffer *eb, u64 start, int wait, | ||
211 | get_extent_t *get_extent, int mirror_num); | ||
212 | |||
213 | static inline void extent_buffer_get(struct extent_buffer *eb) | ||
214 | { | ||
215 | atomic_inc(&eb->refs); | ||
216 | } | ||
217 | |||
218 | int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, | ||
219 | unsigned long start, | ||
220 | unsigned long len); | ||
221 | void read_extent_buffer(struct extent_buffer *eb, void *dst, | ||
222 | unsigned long start, | ||
223 | unsigned long len); | ||
224 | void write_extent_buffer(struct extent_buffer *eb, const void *src, | ||
225 | unsigned long start, unsigned long len); | ||
226 | void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, | ||
227 | unsigned long dst_offset, unsigned long src_offset, | ||
228 | unsigned long len); | ||
229 | void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, | ||
230 | unsigned long src_offset, unsigned long len); | ||
231 | void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, | ||
232 | unsigned long src_offset, unsigned long len); | ||
233 | void memset_extent_buffer(struct extent_buffer *eb, char c, | ||
234 | unsigned long start, unsigned long len); | ||
235 | int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, | ||
236 | struct extent_buffer *eb); | ||
237 | int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end); | ||
238 | int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); | ||
239 | int clear_extent_buffer_dirty(struct extent_io_tree *tree, | ||
240 | struct extent_buffer *eb); | ||
241 | int set_extent_buffer_dirty(struct extent_io_tree *tree, | ||
242 | struct extent_buffer *eb); | ||
243 | int set_extent_buffer_uptodate(struct extent_io_tree *tree, | ||
244 | struct extent_buffer *eb); | ||
245 | int clear_extent_buffer_uptodate(struct extent_io_tree *tree, | ||
246 | struct extent_buffer *eb); | ||
247 | int extent_buffer_uptodate(struct extent_io_tree *tree, | ||
248 | struct extent_buffer *eb); | ||
249 | int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, | ||
250 | unsigned long min_len, char **token, char **map, | ||
251 | unsigned long *map_start, | ||
252 | unsigned long *map_len, int km); | ||
253 | int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, | ||
254 | unsigned long min_len, char **token, char **map, | ||
255 | unsigned long *map_start, | ||
256 | unsigned long *map_len, int km); | ||
257 | void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); | ||
258 | int release_extent_buffer_tail_pages(struct extent_buffer *eb); | ||
259 | int extent_range_uptodate(struct extent_io_tree *tree, | ||
260 | u64 start, u64 end); | ||
261 | int extent_clear_unlock_delalloc(struct inode *inode, | ||
262 | struct extent_io_tree *tree, | ||
263 | u64 start, u64 end, struct page *locked_page, | ||
264 | int unlock_page, | ||
265 | int clear_unlock, | ||
266 | int clear_delalloc, int clear_dirty, | ||
267 | int set_writeback, | ||
268 | int end_writeback); | ||
269 | #endif | ||
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 000000000000..4a83e33ada32 --- /dev/null +++ b/fs/btrfs/extent_map.c | |||
@@ -0,0 +1,351 @@ | |||
1 | #include <linux/err.h> | ||
2 | #include <linux/gfp.h> | ||
3 | #include <linux/slab.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/version.h> | ||
7 | #include <linux/hardirq.h> | ||
8 | #include "extent_map.h" | ||
9 | |||
10 | /* temporary define until extent_map moves out of btrfs */ | ||
11 | struct kmem_cache *btrfs_cache_create(const char *name, size_t size, | ||
12 | unsigned long extra_flags, | ||
13 | void (*ctor)(void *, struct kmem_cache *, | ||
14 | unsigned long)); | ||
15 | |||
16 | static struct kmem_cache *extent_map_cache; | ||
17 | |||
18 | int __init extent_map_init(void) | ||
19 | { | ||
20 | extent_map_cache = btrfs_cache_create("extent_map", | ||
21 | sizeof(struct extent_map), 0, | ||
22 | NULL); | ||
23 | if (!extent_map_cache) | ||
24 | return -ENOMEM; | ||
25 | return 0; | ||
26 | } | ||
27 | |||
28 | void extent_map_exit(void) | ||
29 | { | ||
30 | if (extent_map_cache) | ||
31 | kmem_cache_destroy(extent_map_cache); | ||
32 | } | ||
33 | |||
34 | /** | ||
35 | * extent_map_tree_init - initialize extent map tree | ||
36 | * @tree: tree to initialize | ||
37 | * @mask: flags for memory allocations during tree operations | ||
38 | * | ||
39 | * Initialize the extent tree @tree. Should be called for each new inode | ||
40 | * or other user of the extent_map interface. | ||
41 | */ | ||
42 | void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) | ||
43 | { | ||
44 | tree->map.rb_node = NULL; | ||
45 | spin_lock_init(&tree->lock); | ||
46 | } | ||
47 | EXPORT_SYMBOL(extent_map_tree_init); | ||
48 | |||
49 | /** | ||
50 | * alloc_extent_map - allocate new extent map structure | ||
51 | * @mask: memory allocation flags | ||
52 | * | ||
53 | * Allocate a new extent_map structure. The new structure is | ||
54 | * returned with a reference count of one and needs to be | ||
55 | * freed using free_extent_map() | ||
56 | */ | ||
57 | struct extent_map *alloc_extent_map(gfp_t mask) | ||
58 | { | ||
59 | struct extent_map *em; | ||
60 | em = kmem_cache_alloc(extent_map_cache, mask); | ||
61 | if (!em || IS_ERR(em)) | ||
62 | return em; | ||
63 | em->in_tree = 0; | ||
64 | em->flags = 0; | ||
65 | atomic_set(&em->refs, 1); | ||
66 | return em; | ||
67 | } | ||
68 | EXPORT_SYMBOL(alloc_extent_map); | ||
69 | |||
70 | /** | ||
71 | * free_extent_map - drop reference count of an extent_map | ||
72 | * @em: extent map being released | ||
73 | * | ||
74 | * Drops the reference count on @em by one and frees the structure | ||
75 | * if the reference count hits zero. | ||
76 | */ | ||
77 | void free_extent_map(struct extent_map *em) | ||
78 | { | ||
79 | if (!em) | ||
80 | return; | ||
81 | WARN_ON(atomic_read(&em->refs) == 0); | ||
82 | if (atomic_dec_and_test(&em->refs)) { | ||
83 | WARN_ON(em->in_tree); | ||
84 | kmem_cache_free(extent_map_cache, em); | ||
85 | } | ||
86 | } | ||
87 | EXPORT_SYMBOL(free_extent_map); | ||
88 | |||
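Taken together, the two helpers above give the usual lifecycle; a minimal sketch (GFP_NOFS and the hole mapping are illustrative choices):

	static int example_em_lifecycle(void)
	{
		struct extent_map *em;

		em = alloc_extent_map(GFP_NOFS);
		if (!em || IS_ERR(em))
			return -ENOMEM;
		em->start = 0;			/* a 4K hole at file offset 0 */
		em->len = 4096;
		em->block_start = EXTENT_MAP_HOLE;
		em->block_len = em->len;
		em->bdev = NULL;
		free_extent_map(em);		/* refcount 1 -> 0, struct freed */
		return 0;
	}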
89 | static struct rb_node *tree_insert(struct rb_root *root, u64 offset, | ||
90 | struct rb_node *node) | ||
91 | { | ||
92 | struct rb_node **p = &root->rb_node; | ||
93 | struct rb_node *parent = NULL; | ||
94 | struct extent_map *entry; | ||
95 | |||
96 | while (*p) { | ||
97 | parent = *p; | ||
98 | entry = rb_entry(parent, struct extent_map, rb_node); | ||
99 | |||
100 | WARN_ON(!entry->in_tree); | ||
101 | |||
102 | if (offset < entry->start) | ||
103 | p = &(*p)->rb_left; | ||
104 | else if (offset >= extent_map_end(entry)) | ||
105 | p = &(*p)->rb_right; | ||
106 | else | ||
107 | return parent; | ||
108 | } | ||
109 | |||
110 | entry = rb_entry(node, struct extent_map, rb_node); | ||
111 | entry->in_tree = 1; | ||
112 | rb_link_node(node, parent, p); | ||
113 | rb_insert_color(node, root); | ||
114 | return NULL; | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * search through the tree for an extent_map with a given offset. If | ||
119 | * it can't be found, try to find some neighboring extents | ||
120 | */ | ||
121 | static struct rb_node *__tree_search(struct rb_root *root, u64 offset, | ||
122 | struct rb_node **prev_ret, | ||
123 | struct rb_node **next_ret) | ||
124 | { | ||
125 | struct rb_node *n = root->rb_node; | ||
126 | struct rb_node *prev = NULL; | ||
127 | struct rb_node *orig_prev = NULL; | ||
128 | struct extent_map *entry; | ||
129 | struct extent_map *prev_entry = NULL; | ||
130 | |||
131 | while (n) { | ||
132 | entry = rb_entry(n, struct extent_map, rb_node); | ||
133 | prev = n; | ||
134 | prev_entry = entry; | ||
135 | |||
136 | WARN_ON(!entry->in_tree); | ||
137 | |||
138 | if (offset < entry->start) | ||
139 | n = n->rb_left; | ||
140 | else if (offset >= extent_map_end(entry)) | ||
141 | n = n->rb_right; | ||
142 | else | ||
143 | return n; | ||
144 | } | ||
145 | |||
146 | if (prev_ret) { | ||
147 | orig_prev = prev; | ||
148 | while (prev && offset >= extent_map_end(prev_entry)) { | ||
149 | prev = rb_next(prev); | ||
150 | prev_entry = rb_entry(prev, struct extent_map, rb_node); | ||
151 | } | ||
152 | *prev_ret = prev; | ||
153 | prev = orig_prev; | ||
154 | } | ||
155 | |||
156 | if (next_ret) { | ||
157 | prev_entry = rb_entry(prev, struct extent_map, rb_node); | ||
158 | while (prev && offset < prev_entry->start) { | ||
159 | prev = rb_prev(prev); | ||
160 | prev_entry = rb_entry(prev, struct extent_map, rb_node); | ||
161 | } | ||
162 | *next_ret = prev; | ||
163 | } | ||
164 | return NULL; | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * look for an offset in the tree, and if it can't be found, return | ||
169 | * the first offset we can find smaller than 'offset'. | ||
170 | */ | ||
171 | static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) | ||
172 | { | ||
173 | struct rb_node *prev; | ||
174 | struct rb_node *ret; | ||
175 | ret = __tree_search(root, offset, &prev, NULL); | ||
176 | if (!ret) | ||
177 | return prev; | ||
178 | return ret; | ||
179 | } | ||
180 | |||
181 | /* check to see if two extent_map structs are adjacent and safe to merge */ | ||
182 | static int mergable_maps(struct extent_map *prev, struct extent_map *next) | ||
183 | { | ||
184 | if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) | ||
185 | return 0; | ||
186 | |||
187 | /* | ||
188 | * don't merge compressed extents, we need to know their | ||
189 | * actual size | ||
190 | */ | ||
191 | if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) | ||
192 | return 0; | ||
193 | |||
194 | if (extent_map_end(prev) == next->start && | ||
195 | prev->flags == next->flags && | ||
196 | prev->bdev == next->bdev && | ||
197 | ((next->block_start == EXTENT_MAP_HOLE && | ||
198 | prev->block_start == EXTENT_MAP_HOLE) || | ||
199 | (next->block_start == EXTENT_MAP_INLINE && | ||
200 | prev->block_start == EXTENT_MAP_INLINE) || | ||
201 | (next->block_start == EXTENT_MAP_DELALLOC && | ||
202 | prev->block_start == EXTENT_MAP_DELALLOC) || | ||
203 | (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && | ||
204 | next->block_start == extent_map_block_end(prev)))) { | ||
205 | return 1; | ||
206 | } | ||
207 | return 0; | ||
208 | } | ||
209 | |||
210 | /** | ||
211 | * add_extent_mapping - add new extent map to the extent tree | ||
212 | * @tree: tree to insert new map in | ||
213 | * @em: map to insert | ||
214 | * | ||
215 | * Insert @em into @tree or perform a simple forward/backward merge with | ||
216 | * existing mappings. The extent_map struct passed in will be inserted | ||
217 | * into the tree directly, with an additional reference taken, or a | ||
218 | * reference dropped if the merge attempt was successful. | ||
219 | */ | ||
220 | int add_extent_mapping(struct extent_map_tree *tree, | ||
221 | struct extent_map *em) | ||
222 | { | ||
223 | int ret = 0; | ||
224 | struct extent_map *merge = NULL; | ||
225 | struct rb_node *rb; | ||
226 | struct extent_map *exist; | ||
227 | |||
228 | exist = lookup_extent_mapping(tree, em->start, em->len); | ||
229 | if (exist) { | ||
230 | free_extent_map(exist); | ||
231 | ret = -EEXIST; | ||
232 | goto out; | ||
233 | } | ||
234 | assert_spin_locked(&tree->lock); | ||
235 | rb = tree_insert(&tree->map, em->start, &em->rb_node); | ||
236 | if (rb) { | ||
237 | ret = -EEXIST; | ||
238 | free_extent_map(merge); | ||
239 | goto out; | ||
240 | } | ||
241 | atomic_inc(&em->refs); | ||
242 | if (em->start != 0) { | ||
243 | rb = rb_prev(&em->rb_node); | ||
244 | if (rb) | ||
245 | merge = rb_entry(rb, struct extent_map, rb_node); | ||
246 | if (rb && mergable_maps(merge, em)) { | ||
247 | em->start = merge->start; | ||
248 | em->len += merge->len; | ||
249 | em->block_len += merge->block_len; | ||
250 | em->block_start = merge->block_start; | ||
251 | merge->in_tree = 0; | ||
252 | rb_erase(&merge->rb_node, &tree->map); | ||
253 | free_extent_map(merge); | ||
254 | } | ||
255 | } | ||
256 | rb = rb_next(&em->rb_node); | ||
257 | if (rb) | ||
258 | merge = rb_entry(rb, struct extent_map, rb_node); | ||
259 | if (rb && mergable_maps(em, merge)) { | ||
260 | em->len += merge->len; | ||
261 | em->block_len += merge->len; | ||
262 | rb_erase(&merge->rb_node, &tree->map); | ||
263 | merge->in_tree = 0; | ||
264 | free_extent_map(merge); | ||
265 | } | ||
266 | out: | ||
267 | return ret; | ||
268 | } | ||
269 | EXPORT_SYMBOL(add_extent_mapping); | ||
270 | |||
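A sketch of the expected calling convention (the function name is hypothetical): callers take tree->lock themselves, as the assert above requires, and keep their own reference either way:

	static int example_add(struct extent_map_tree *tree, struct extent_map *em)
	{
		int ret;

		spin_lock(&tree->lock);
		ret = add_extent_mapping(tree, em); /* may merge with neighbours */
		spin_unlock(&tree->lock);
		/* on success the tree now holds its own reference; -EEXIST
		 * means a racing insert won and the caller should look that
		 * mapping up instead */
		return ret;
	}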
271 | /* simple helper to do math around the end of an extent, handling wrap */ | ||
272 | static u64 range_end(u64 start, u64 len) | ||
273 | { | ||
274 | if (start + len < start) | ||
275 | return (u64)-1; | ||
276 | return start + len; | ||
277 | } | ||
278 | |||
279 | /** | ||
280 | * lookup_extent_mapping - lookup extent_map | ||
281 | * @tree: tree to lookup in | ||
282 | * @start: byte offset to start the search | ||
283 | * @len: length of the lookup range | ||
284 | * | ||
285 | * Find and return the first extent_map struct in @tree that intersects the | ||
286 | * [start, start + len) range. There may be additional objects in the tree that | ||
287 | * intersect, so check the object returned carefully to make sure that no | ||
288 | * additional lookups are needed. | ||
289 | */ | ||
290 | struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, | ||
291 | u64 start, u64 len) | ||
292 | { | ||
293 | struct extent_map *em; | ||
294 | struct rb_node *rb_node; | ||
295 | struct rb_node *prev = NULL; | ||
296 | struct rb_node *next = NULL; | ||
297 | u64 end = range_end(start, len); | ||
298 | |||
299 | assert_spin_locked(&tree->lock); | ||
300 | rb_node = __tree_search(&tree->map, start, &prev, &next); | ||
301 | if (!rb_node && prev) { | ||
302 | em = rb_entry(prev, struct extent_map, rb_node); | ||
303 | if (end > em->start && start < extent_map_end(em)) | ||
304 | goto found; | ||
305 | } | ||
306 | if (!rb_node && next) { | ||
307 | em = rb_entry(next, struct extent_map, rb_node); | ||
308 | if (end > em->start && start < extent_map_end(em)) | ||
309 | goto found; | ||
310 | } | ||
311 | if (!rb_node) { | ||
312 | em = NULL; | ||
313 | goto out; | ||
314 | } | ||
315 | if (IS_ERR(rb_node)) { | ||
316 | em = ERR_PTR(PTR_ERR(rb_node)); | ||
317 | goto out; | ||
318 | } | ||
319 | em = rb_entry(rb_node, struct extent_map, rb_node); | ||
320 | if (end > em->start && start < extent_map_end(em)) | ||
321 | goto found; | ||
322 | |||
323 | em = NULL; | ||
324 | goto out; | ||
325 | |||
326 | found: | ||
327 | atomic_inc(&em->refs); | ||
328 | out: | ||
329 | return em; | ||
330 | } | ||
331 | EXPORT_SYMBOL(lookup_extent_mapping); | ||
332 | |||
333 | /** | ||
334 | * remove_extent_mapping - removes an extent_map from the extent tree | ||
335 | * @tree: extent tree to remove from | ||
336 | * @em: extent map being removed | ||
337 | * | ||
338 | * Removes @em from @tree. No reference counts are dropped, and no checks | ||
339 | * are done to see if the range is in use | ||
340 | */ | ||
341 | int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) | ||
342 | { | ||
343 | int ret = 0; | ||
344 | |||
345 | WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); | ||
346 | assert_spin_locked(&tree->lock); | ||
347 | rb_erase(&em->rb_node, &tree->map); | ||
348 | em->in_tree = 0; | ||
349 | return ret; | ||
350 | } | ||
351 | EXPORT_SYMBOL(remove_extent_mapping); | ||
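Since lookup_extent_mapping() takes a reference and remove_extent_mapping() drops none, tearing a mapping down needs two puts; a hedged sketch of that pattern:

	static void example_drop_range(struct extent_map_tree *tree,
				       u64 start, u64 len)
	{
		struct extent_map *em;

		spin_lock(&tree->lock);
		em = lookup_extent_mapping(tree, start, len);
		if (em && !IS_ERR(em))
			remove_extent_mapping(tree, em);
		spin_unlock(&tree->lock);

		if (em && !IS_ERR(em)) {
			free_extent_map(em); /* reference the tree held */
			free_extent_map(em); /* reference lookup took */
		}
	}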
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 000000000000..fb6eeef06bb0 --- /dev/null +++ b/fs/btrfs/extent_map.h | |||
@@ -0,0 +1,62 @@ | |||
1 | #ifndef __EXTENTMAP__ | ||
2 | #define __EXTENTMAP__ | ||
3 | |||
4 | #include <linux/rbtree.h> | ||
5 | |||
6 | #define EXTENT_MAP_LAST_BYTE (u64)-4 | ||
7 | #define EXTENT_MAP_HOLE (u64)-3 | ||
8 | #define EXTENT_MAP_INLINE (u64)-2 | ||
9 | #define EXTENT_MAP_DELALLOC (u64)-1 | ||
10 | |||
11 | /* bits for the flags field */ | ||
12 | #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ | ||
13 | #define EXTENT_FLAG_COMPRESSED 1 | ||
14 | #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ | ||
15 | #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ | ||
16 | |||
17 | struct extent_map { | ||
18 | struct rb_node rb_node; | ||
19 | |||
20 | /* all of these are in bytes */ | ||
21 | u64 start; | ||
22 | u64 len; | ||
23 | u64 orig_start; | ||
24 | u64 block_start; | ||
25 | u64 block_len; | ||
26 | unsigned long flags; | ||
27 | struct block_device *bdev; | ||
28 | atomic_t refs; | ||
29 | int in_tree; | ||
30 | }; | ||
31 | |||
32 | struct extent_map_tree { | ||
33 | struct rb_root map; | ||
34 | spinlock_t lock; | ||
35 | }; | ||
36 | |||
37 | static inline u64 extent_map_end(struct extent_map *em) | ||
38 | { | ||
39 | if (em->start + em->len < em->start) | ||
40 | return (u64)-1; | ||
41 | return em->start + em->len; | ||
42 | } | ||
43 | |||
44 | static inline u64 extent_map_block_end(struct extent_map *em) | ||
45 | { | ||
46 | if (em->block_start + em->block_len < em->block_start) | ||
47 | return (u64)-1; | ||
48 | return em->block_start + em->block_len; | ||
49 | } | ||
50 | |||
51 | void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); | ||
52 | struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, | ||
53 | u64 start, u64 len); | ||
54 | int add_extent_mapping(struct extent_map_tree *tree, | ||
55 | struct extent_map *em); | ||
56 | int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); | ||
57 | |||
58 | struct extent_map *alloc_extent_map(gfp_t mask); | ||
59 | void free_extent_map(struct extent_map *em); | ||
60 | int __init extent_map_init(void); | ||
61 | void extent_map_exit(void); | ||
62 | #endif | ||
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 000000000000..964652435fd1 --- /dev/null +++ b/fs/btrfs/file-item.c | |||
@@ -0,0 +1,831 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/bio.h> | ||
20 | #include <linux/pagemap.h> | ||
21 | #include <linux/highmem.h> | ||
22 | #include "ctree.h" | ||
23 | #include "disk-io.h" | ||
24 | #include "transaction.h" | ||
25 | #include "print-tree.h" | ||
26 | |||
27 | #define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ | ||
28 | sizeof(struct btrfs_item) * 2) / \ | ||
29 | size) - 1)) | ||
30 | |||
31 | #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ | ||
32 | sizeof(struct btrfs_ordered_sum)) / \ | ||
33 | sizeof(struct btrfs_sector_sum) * \ | ||
34 | (r)->sectorsize - (r)->sectorsize) | ||
35 | |||
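Rough numbers behind these bounds, purely illustrative (crc32c checksums of 4 bytes, 4K sectors, 4K leaves, and assumed on-disk header sizes of 101 bytes per leaf and 25 bytes per item header):

	/*
	 * BTRFS_LEAF_DATA_SIZE   ~ 4096 - 101                 = 3995 bytes
	 * MAX_CSUM_ITEMS         ~ (3995 - 2 * 25) / 4 - 1    =  985 csums
	 * data covered per item  ~ 985 * 4096                 ~ 3.85 MB
	 *
	 * i.e. one csum item can describe several megabytes of file data,
	 * which is why the delete path below can afford to split items
	 * in place.
	 */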
36 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, | ||
37 | struct btrfs_root *root, | ||
38 | u64 objectid, u64 pos, | ||
39 | u64 disk_offset, u64 disk_num_bytes, | ||
40 | u64 num_bytes, u64 offset, u64 ram_bytes, | ||
41 | u8 compression, u8 encryption, u16 other_encoding) | ||
42 | { | ||
43 | int ret = 0; | ||
44 | struct btrfs_file_extent_item *item; | ||
45 | struct btrfs_key file_key; | ||
46 | struct btrfs_path *path; | ||
47 | struct extent_buffer *leaf; | ||
48 | |||
49 | path = btrfs_alloc_path(); | ||
50 | BUG_ON(!path); | ||
51 | file_key.objectid = objectid; | ||
52 | file_key.offset = pos; | ||
53 | btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); | ||
54 | |||
55 | ret = btrfs_insert_empty_item(trans, root, path, &file_key, | ||
56 | sizeof(*item)); | ||
57 | if (ret < 0) | ||
58 | goto out; | ||
59 | BUG_ON(ret); | ||
60 | leaf = path->nodes[0]; | ||
61 | item = btrfs_item_ptr(leaf, path->slots[0], | ||
62 | struct btrfs_file_extent_item); | ||
63 | btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); | ||
64 | btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); | ||
65 | btrfs_set_file_extent_offset(leaf, item, offset); | ||
66 | btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); | ||
67 | btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); | ||
68 | btrfs_set_file_extent_generation(leaf, item, trans->transid); | ||
69 | btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); | ||
70 | btrfs_set_file_extent_compression(leaf, item, compression); | ||
71 | btrfs_set_file_extent_encryption(leaf, item, encryption); | ||
72 | btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); | ||
73 | |||
74 | btrfs_mark_buffer_dirty(leaf); | ||
75 | out: | ||
76 | btrfs_free_path(path); | ||
77 | return ret; | ||
78 | } | ||
79 | |||
80 | struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, | ||
81 | struct btrfs_root *root, | ||
82 | struct btrfs_path *path, | ||
83 | u64 bytenr, int cow) | ||
84 | { | ||
85 | int ret; | ||
86 | struct btrfs_key file_key; | ||
87 | struct btrfs_key found_key; | ||
88 | struct btrfs_csum_item *item; | ||
89 | struct extent_buffer *leaf; | ||
90 | u64 csum_offset = 0; | ||
91 | u16 csum_size = | ||
92 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
93 | int csums_in_item; | ||
94 | |||
95 | file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
96 | file_key.offset = bytenr; | ||
97 | btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); | ||
98 | ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); | ||
99 | if (ret < 0) | ||
100 | goto fail; | ||
101 | leaf = path->nodes[0]; | ||
102 | if (ret > 0) { | ||
103 | ret = 1; | ||
104 | if (path->slots[0] == 0) | ||
105 | goto fail; | ||
106 | path->slots[0]--; | ||
107 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
108 | if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) | ||
109 | goto fail; | ||
110 | |||
111 | csum_offset = (bytenr - found_key.offset) >> | ||
112 | root->fs_info->sb->s_blocksize_bits; | ||
113 | csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); | ||
114 | csums_in_item /= csum_size; | ||
115 | |||
116 | if (csum_offset >= csums_in_item) { | ||
117 | ret = -EFBIG; | ||
118 | goto fail; | ||
119 | } | ||
120 | } | ||
121 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); | ||
122 | item = (struct btrfs_csum_item *)((unsigned char *)item + | ||
123 | csum_offset * csum_size); | ||
124 | return item; | ||
125 | fail: | ||
126 | if (ret > 0) | ||
127 | ret = -ENOENT; | ||
128 | return ERR_PTR(ret); | ||
129 | } | ||
130 | |||
131 | |||
132 | int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, | ||
133 | struct btrfs_root *root, | ||
134 | struct btrfs_path *path, u64 objectid, | ||
135 | u64 offset, int mod) | ||
136 | { | ||
137 | int ret; | ||
138 | struct btrfs_key file_key; | ||
139 | int ins_len = mod < 0 ? -1 : 0; | ||
140 | int cow = mod != 0; | ||
141 | |||
142 | file_key.objectid = objectid; | ||
143 | file_key.offset = offset; | ||
144 | btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); | ||
145 | ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); | ||
146 | return ret; | ||
147 | } | ||
148 | |||
149 | |||
150 | int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, | ||
151 | struct bio *bio, u32 *dst) | ||
152 | { | ||
153 | u32 sum; | ||
154 | struct bio_vec *bvec = bio->bi_io_vec; | ||
155 | int bio_index = 0; | ||
156 | u64 offset; | ||
157 | u64 item_start_offset = 0; | ||
158 | u64 item_last_offset = 0; | ||
159 | u64 disk_bytenr; | ||
160 | u32 diff; | ||
161 | u16 csum_size = | ||
162 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
163 | int ret; | ||
164 | struct btrfs_path *path; | ||
165 | struct btrfs_csum_item *item = NULL; | ||
166 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
167 | |||
168 | path = btrfs_alloc_path(); | ||
169 | if (bio->bi_size > PAGE_CACHE_SIZE * 8) | ||
170 | path->reada = 2; | ||
171 | |||
172 | WARN_ON(bio->bi_vcnt <= 0); | ||
173 | |||
174 | disk_bytenr = (u64)bio->bi_sector << 9; | ||
175 | while (bio_index < bio->bi_vcnt) { | ||
176 | offset = page_offset(bvec->bv_page) + bvec->bv_offset; | ||
177 | ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); | ||
178 | if (ret == 0) | ||
179 | goto found; | ||
180 | |||
181 | if (!item || disk_bytenr < item_start_offset || | ||
182 | disk_bytenr >= item_last_offset) { | ||
183 | struct btrfs_key found_key; | ||
184 | u32 item_size; | ||
185 | |||
186 | if (item) | ||
187 | btrfs_release_path(root, path); | ||
188 | item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, | ||
189 | path, disk_bytenr, 0); | ||
190 | if (IS_ERR(item)) { | ||
191 | ret = PTR_ERR(item); | ||
192 | if (ret == -ENOENT || ret == -EFBIG) | ||
193 | ret = 0; | ||
194 | sum = 0; | ||
195 | if (BTRFS_I(inode)->root->root_key.objectid == | ||
196 | BTRFS_DATA_RELOC_TREE_OBJECTID) { | ||
197 | set_extent_bits(io_tree, offset, | ||
198 | offset + bvec->bv_len - 1, | ||
199 | EXTENT_NODATASUM, GFP_NOFS); | ||
200 | } else { | ||
201 | printk(KERN_INFO "btrfs no csum found " | ||
202 | "for inode %lu start %llu\n", | ||
203 | inode->i_ino, | ||
204 | (unsigned long long)offset); | ||
205 | } | ||
206 | item = NULL; | ||
207 | btrfs_release_path(root, path); | ||
208 | goto found; | ||
209 | } | ||
210 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
211 | path->slots[0]); | ||
212 | |||
213 | item_start_offset = found_key.offset; | ||
214 | item_size = btrfs_item_size_nr(path->nodes[0], | ||
215 | path->slots[0]); | ||
216 | item_last_offset = item_start_offset + | ||
217 | (item_size / csum_size) * | ||
218 | root->sectorsize; | ||
219 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
220 | struct btrfs_csum_item); | ||
221 | } | ||
222 | /* | ||
223 | * this byte range must be able to fit inside | ||
224 | * a single leaf so it will also fit inside a u32 | ||
225 | */ | ||
226 | diff = disk_bytenr - item_start_offset; | ||
227 | diff = diff / root->sectorsize; | ||
228 | diff = diff * csum_size; | ||
229 | |||
230 | read_extent_buffer(path->nodes[0], &sum, | ||
231 | ((unsigned long)item) + diff, | ||
232 | csum_size); | ||
233 | found: | ||
234 | if (dst) | ||
235 | *dst++ = sum; | ||
236 | else | ||
237 | set_state_private(io_tree, offset, sum); | ||
238 | disk_bytenr += bvec->bv_len; | ||
239 | bio_index++; | ||
240 | bvec++; | ||
241 | } | ||
242 | btrfs_free_path(path); | ||
243 | return 0; | ||
244 | } | ||
245 | |||
246 | int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, | ||
247 | struct list_head *list) | ||
248 | { | ||
249 | struct btrfs_key key; | ||
250 | struct btrfs_path *path; | ||
251 | struct extent_buffer *leaf; | ||
252 | struct btrfs_ordered_sum *sums; | ||
253 | struct btrfs_sector_sum *sector_sum; | ||
254 | struct btrfs_csum_item *item; | ||
255 | unsigned long offset; | ||
256 | int ret; | ||
257 | size_t size; | ||
258 | u64 csum_end; | ||
259 | u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); | ||
260 | |||
261 | path = btrfs_alloc_path(); | ||
262 | BUG_ON(!path); | ||
263 | |||
264 | key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
265 | key.offset = start; | ||
266 | key.type = BTRFS_EXTENT_CSUM_KEY; | ||
267 | |||
268 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
269 | if (ret < 0) | ||
270 | goto fail; | ||
271 | if (ret > 0 && path->slots[0] > 0) { | ||
272 | leaf = path->nodes[0]; | ||
273 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); | ||
274 | if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && | ||
275 | key.type == BTRFS_EXTENT_CSUM_KEY) { | ||
276 | offset = (start - key.offset) >> | ||
277 | root->fs_info->sb->s_blocksize_bits; | ||
278 | if (offset * csum_size < | ||
279 | btrfs_item_size_nr(leaf, path->slots[0] - 1)) | ||
280 | path->slots[0]--; | ||
281 | } | ||
282 | } | ||
283 | |||
284 | while (start <= end) { | ||
285 | leaf = path->nodes[0]; | ||
286 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
287 | ret = btrfs_next_leaf(root, path); | ||
288 | if (ret < 0) | ||
289 | goto fail; | ||
290 | if (ret > 0) | ||
291 | break; | ||
292 | leaf = path->nodes[0]; | ||
293 | } | ||
294 | |||
295 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
296 | if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || | ||
297 | key.type != BTRFS_EXTENT_CSUM_KEY) | ||
298 | break; | ||
299 | |||
300 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
301 | if (key.offset > end) | ||
302 | break; | ||
303 | |||
304 | if (key.offset > start) | ||
305 | start = key.offset; | ||
306 | |||
307 | size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
308 | csum_end = key.offset + (size / csum_size) * root->sectorsize; | ||
309 | if (csum_end <= start) { | ||
310 | path->slots[0]++; | ||
311 | continue; | ||
312 | } | ||
313 | |||
314 | csum_end = min(csum_end, end + 1); | ||
315 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
316 | struct btrfs_csum_item); | ||
317 | while (start < csum_end) { | ||
318 | size = min_t(size_t, csum_end - start, | ||
319 | MAX_ORDERED_SUM_BYTES(root)); | ||
320 | sums = kzalloc(btrfs_ordered_sum_size(root, size), | ||
321 | GFP_NOFS); | ||
322 | BUG_ON(!sums); | ||
323 | |||
324 | sector_sum = sums->sums; | ||
325 | sums->bytenr = start; | ||
326 | sums->len = size; | ||
327 | |||
328 | offset = (start - key.offset) >> | ||
329 | root->fs_info->sb->s_blocksize_bits; | ||
330 | offset *= csum_size; | ||
331 | |||
332 | while (size > 0) { | ||
333 | read_extent_buffer(path->nodes[0], | ||
334 | &sector_sum->sum, | ||
335 | ((unsigned long)item) + | ||
336 | offset, csum_size); | ||
337 | sector_sum->bytenr = start; | ||
338 | |||
339 | size -= root->sectorsize; | ||
340 | start += root->sectorsize; | ||
341 | offset += csum_size; | ||
342 | sector_sum++; | ||
343 | } | ||
344 | list_add_tail(&sums->list, list); | ||
345 | } | ||
346 | path->slots[0]++; | ||
347 | } | ||
348 | ret = 0; | ||
349 | fail: | ||
350 | btrfs_free_path(path); | ||
351 | return ret; | ||
352 | } | ||
353 | |||
354 | int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, | ||
355 | struct bio *bio, u64 file_start, int contig) | ||
356 | { | ||
357 | struct btrfs_ordered_sum *sums; | ||
358 | struct btrfs_sector_sum *sector_sum; | ||
359 | struct btrfs_ordered_extent *ordered; | ||
360 | char *data; | ||
361 | struct bio_vec *bvec = bio->bi_io_vec; | ||
362 | int bio_index = 0; | ||
363 | unsigned long total_bytes = 0; | ||
364 | unsigned long this_sum_bytes = 0; | ||
365 | u64 offset; | ||
366 | u64 disk_bytenr; | ||
367 | |||
368 | WARN_ON(bio->bi_vcnt <= 0); | ||
369 | sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); | ||
370 | if (!sums) | ||
371 | return -ENOMEM; | ||
372 | |||
373 | sector_sum = sums->sums; | ||
374 | disk_bytenr = (u64)bio->bi_sector << 9; | ||
375 | sums->len = bio->bi_size; | ||
376 | INIT_LIST_HEAD(&sums->list); | ||
377 | |||
378 | if (contig) | ||
379 | offset = file_start; | ||
380 | else | ||
381 | offset = page_offset(bvec->bv_page) + bvec->bv_offset; | ||
382 | |||
383 | ordered = btrfs_lookup_ordered_extent(inode, offset); | ||
384 | BUG_ON(!ordered); | ||
385 | sums->bytenr = ordered->start; | ||
386 | |||
387 | while (bio_index < bio->bi_vcnt) { | ||
388 | if (!contig) | ||
389 | offset = page_offset(bvec->bv_page) + bvec->bv_offset; | ||
390 | |||
391 | if (!contig && (offset >= ordered->file_offset + ordered->len || | ||
392 | offset < ordered->file_offset)) { | ||
393 | unsigned long bytes_left; | ||
394 | sums->len = this_sum_bytes; | ||
395 | this_sum_bytes = 0; | ||
396 | btrfs_add_ordered_sum(inode, ordered, sums); | ||
397 | btrfs_put_ordered_extent(ordered); | ||
398 | |||
399 | bytes_left = bio->bi_size - total_bytes; | ||
400 | |||
401 | sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), | ||
402 | GFP_NOFS); | ||
403 | BUG_ON(!sums); | ||
404 | sector_sum = sums->sums; | ||
405 | sums->len = bytes_left; | ||
406 | ordered = btrfs_lookup_ordered_extent(inode, offset); | ||
407 | BUG_ON(!ordered); | ||
408 | sums->bytenr = ordered->start; | ||
409 | } | ||
410 | |||
411 | data = kmap_atomic(bvec->bv_page, KM_USER0); | ||
412 | sector_sum->sum = ~(u32)0; | ||
413 | sector_sum->sum = btrfs_csum_data(root, | ||
414 | data + bvec->bv_offset, | ||
415 | sector_sum->sum, | ||
416 | bvec->bv_len); | ||
417 | kunmap_atomic(data, KM_USER0); | ||
418 | btrfs_csum_final(sector_sum->sum, | ||
419 | (char *)§or_sum->sum); | ||
420 | sector_sum->bytenr = disk_bytenr; | ||
421 | |||
422 | sector_sum++; | ||
423 | bio_index++; | ||
424 | total_bytes += bvec->bv_len; | ||
425 | this_sum_bytes += bvec->bv_len; | ||
426 | disk_bytenr += bvec->bv_len; | ||
427 | offset += bvec->bv_len; | ||
428 | bvec++; | ||
429 | } | ||
430 | this_sum_bytes = 0; | ||
431 | btrfs_add_ordered_sum(inode, ordered, sums); | ||
432 | btrfs_put_ordered_extent(ordered); | ||
433 | return 0; | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * helper function for csum removal, this expects the | ||
438 | * key to describe the csum pointed to by the path, and it expects | ||
439 | * the csum to overlap the range [bytenr, bytenr + len) | ||
440 | * | ||
441 | * The csum should not be entirely contained in the range and the | ||
442 | * range should not be entirely contained in the csum. | ||
443 | * | ||
444 | * This calls btrfs_truncate_item with the correct args based on the | ||
445 | * overlap, and fixes up the key as required. | ||
446 | */ | ||
447 | static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, | ||
448 | struct btrfs_root *root, | ||
449 | struct btrfs_path *path, | ||
450 | struct btrfs_key *key, | ||
451 | u64 bytenr, u64 len) | ||
452 | { | ||
453 | struct extent_buffer *leaf; | ||
454 | u16 csum_size = | ||
455 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
456 | u64 csum_end; | ||
457 | u64 end_byte = bytenr + len; | ||
458 | u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; | ||
459 | int ret; | ||
460 | |||
461 | leaf = path->nodes[0]; | ||
462 | csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; | ||
463 | csum_end <<= root->fs_info->sb->s_blocksize_bits; | ||
464 | csum_end += key->offset; | ||
465 | |||
466 | if (key->offset < bytenr && csum_end <= end_byte) { | ||
467 | /* | ||
468 | * [ bytenr - len ] | ||
469 | * [ ] | ||
470 | * [csum ] | ||
471 | * A simple truncate off the end of the item | ||
472 | */ | ||
473 | u32 new_size = (bytenr - key->offset) >> blocksize_bits; | ||
474 | new_size *= csum_size; | ||
475 | ret = btrfs_truncate_item(trans, root, path, new_size, 1); | ||
476 | BUG_ON(ret); | ||
477 | } else if (key->offset >= bytenr && csum_end > end_byte && | ||
478 | end_byte > key->offset) { | ||
479 | /* | ||
480 | * [ bytenr - len ] | ||
481 | * [ ] | ||
482 | * [csum ] | ||
483 | * we need to truncate from the beginning of the csum | ||
484 | */ | ||
485 | u32 new_size = (csum_end - end_byte) >> blocksize_bits; | ||
486 | new_size *= csum_size; | ||
487 | |||
488 | ret = btrfs_truncate_item(trans, root, path, new_size, 0); | ||
489 | BUG_ON(ret); | ||
490 | |||
491 | key->offset = end_byte; | ||
492 | ret = btrfs_set_item_key_safe(trans, root, path, key); | ||
493 | BUG_ON(ret); | ||
494 | } else { | ||
495 | BUG(); | ||
496 | } | ||
497 | return 0; | ||
498 | } | ||
499 | |||
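A worked example of the two legal overlaps, with illustrative numbers (csum_size of 4, 4K blocks, so blocksize_bits is 12):

	/*
	 * An item keyed at byte offset 0 holding 16 csums covers [0, 64K).
	 *
	 * Truncating away [48K, 64K) takes the first branch:
	 *   new_size = ((48K - 0) >> 12) * 4 = 12 * 4 = 48 bytes
	 *   (12 csums kept at the front of the item)
	 *
	 * Truncating away [0, 16K) takes the second branch:
	 *   new_size = ((64K - 16K) >> 12) * 4 = 48 bytes, and the item's
	 *   key moves from offset 0 to offset 16K via
	 *   btrfs_set_item_key_safe().
	 */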
500 | /* | ||
501 | * deletes the csum items from the csum tree for a given | ||
502 | * range of bytes. | ||
503 | */ | ||
504 | int btrfs_del_csums(struct btrfs_trans_handle *trans, | ||
505 | struct btrfs_root *root, u64 bytenr, u64 len) | ||
506 | { | ||
507 | struct btrfs_path *path; | ||
508 | struct btrfs_key key; | ||
509 | u64 end_byte = bytenr + len; | ||
510 | u64 csum_end; | ||
511 | struct extent_buffer *leaf; | ||
512 | int ret; | ||
513 | u16 csum_size = | ||
514 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
515 | int blocksize_bits = root->fs_info->sb->s_blocksize_bits; | ||
516 | |||
517 | root = root->fs_info->csum_root; | ||
518 | |||
519 | path = btrfs_alloc_path(); | ||
520 | |||
521 | while (1) { | ||
522 | key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
523 | key.offset = end_byte - 1; | ||
524 | key.type = BTRFS_EXTENT_CSUM_KEY; | ||
525 | |||
526 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
527 | if (ret > 0) { | ||
528 | if (path->slots[0] == 0) | ||
529 | goto out; | ||
530 | path->slots[0]--; | ||
531 | } | ||
532 | leaf = path->nodes[0]; | ||
533 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
534 | |||
535 | if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || | ||
536 | key.type != BTRFS_EXTENT_CSUM_KEY) { | ||
537 | break; | ||
538 | } | ||
539 | |||
540 | if (key.offset >= end_byte) | ||
541 | break; | ||
542 | |||
543 | csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; | ||
544 | csum_end <<= blocksize_bits; | ||
545 | csum_end += key.offset; | ||
546 | |||
547 | /* this csum ends before we start, we're done */ | ||
548 | if (csum_end <= bytenr) | ||
549 | break; | ||
550 | |||
551 | /* delete the entire item, it is inside our range */ | ||
552 | if (key.offset >= bytenr && csum_end <= end_byte) { | ||
553 | ret = btrfs_del_item(trans, root, path); | ||
554 | BUG_ON(ret); | ||
555 | if (key.offset == bytenr) | ||
556 | break; | ||
557 | } else if (key.offset < bytenr && csum_end > end_byte) { | ||
558 | unsigned long offset; | ||
559 | unsigned long shift_len; | ||
560 | unsigned long item_offset; | ||
561 | /* | ||
562 | * [ bytenr - len ] | ||
563 | * [csum ] | ||
564 | * | ||
565 | * Our bytes are in the middle of the csum, | ||
566 | * we need to split this item and insert a new one. | ||
567 | * | ||
568 | * But we can't drop the path because the | ||
569 | * csum could change, get removed, extended etc. | ||
570 | * | ||
571 | * The trick here is the max size of a csum item leaves | ||
572 | * enough room in the tree block for a single | ||
573 | * item header. So, we split the item in place, | ||
574 | * adding a new header pointing to the existing | ||
575 | * bytes. Then we loop around again and we have | ||
576 | * a nicely formed csum item that we can neatly | ||
577 | * truncate. | ||
578 | */ | ||
579 | offset = (bytenr - key.offset) >> blocksize_bits; | ||
580 | offset *= csum_size; | ||
581 | |||
582 | shift_len = (len >> blocksize_bits) * csum_size; | ||
583 | |||
584 | item_offset = btrfs_item_ptr_offset(leaf, | ||
585 | path->slots[0]); | ||
586 | |||
587 | memset_extent_buffer(leaf, 0, item_offset + offset, | ||
588 | shift_len); | ||
589 | key.offset = bytenr; | ||
590 | |||
591 | /* | ||
592 | * btrfs_split_item returns -EAGAIN when the | ||
593 | * item changed size or key | ||
594 | */ | ||
595 | ret = btrfs_split_item(trans, root, path, &key, offset); | ||
596 | BUG_ON(ret && ret != -EAGAIN); | ||
597 | |||
598 | key.offset = end_byte - 1; | ||
599 | } else { | ||
600 | ret = truncate_one_csum(trans, root, path, | ||
601 | &key, bytenr, len); | ||
602 | BUG_ON(ret); | ||
603 | if (key.offset < bytenr) | ||
604 | break; | ||
605 | } | ||
606 | btrfs_release_path(root, path); | ||
607 | } | ||
608 | out: | ||
609 | btrfs_free_path(path); | ||
610 | return 0; | ||
611 | } | ||
612 | |||
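The loop above distinguishes four ways a csum item can overlap the deletion range. A standalone restatement of that classification (illustrative only; intervals are treated as half-open, matching the byte math in the kernel code):

#include <stdint.h>

enum csum_overlap {
	CSUM_NO_OVERLAP,   /* item ends before, or starts after, the range */
	CSUM_DELETE_ALL,   /* item lies entirely inside [bytenr, end_byte) */
	CSUM_SPLIT,        /* range is strictly inside the item */
	CSUM_TRUNCATE,     /* partial overlap at one end */
};

/* mirrors the branches in btrfs_del_csums: the item covers
 * [item_start, item_end), the range to delete is [bytenr, end_byte) */
enum csum_overlap classify(uint64_t item_start, uint64_t item_end,
			   uint64_t bytenr, uint64_t end_byte)
{
	if (item_end <= bytenr || item_start >= end_byte)
		return CSUM_NO_OVERLAP;
	if (item_start >= bytenr && item_end <= end_byte)
		return CSUM_DELETE_ALL;
	if (item_start < bytenr && item_end > end_byte)
		return CSUM_SPLIT;
	return CSUM_TRUNCATE;
}

In the split case the surviving halves are handled on later passes through the loop, once btrfs_split_item has given each half its own item header.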
613 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, | ||
614 | struct btrfs_root *root, | ||
615 | struct btrfs_ordered_sum *sums) | ||
616 | { | ||
617 | u64 bytenr; | ||
618 | int ret; | ||
619 | struct btrfs_key file_key; | ||
620 | struct btrfs_key found_key; | ||
621 | u64 next_offset; | ||
622 | u64 total_bytes = 0; | ||
623 | int found_next; | ||
624 | struct btrfs_path *path; | ||
625 | struct btrfs_csum_item *item; | ||
626 | struct btrfs_csum_item *item_end; | ||
627 | struct extent_buffer *leaf = NULL; | ||
628 | u64 csum_offset; | ||
629 | struct btrfs_sector_sum *sector_sum; | ||
630 | u32 nritems; | ||
631 | u32 ins_size; | ||
632 | char *eb_map; | ||
633 | char *eb_token; | ||
634 | unsigned long map_len; | ||
635 | unsigned long map_start; | ||
636 | u16 csum_size = | ||
637 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
638 | |||
639 | path = btrfs_alloc_path(); | ||
640 | BUG_ON(!path); | ||
641 | sector_sum = sums->sums; | ||
642 | again: | ||
643 | next_offset = (u64)-1; | ||
644 | found_next = 0; | ||
645 | file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
646 | file_key.offset = sector_sum->bytenr; | ||
647 | bytenr = sector_sum->bytenr; | ||
648 | btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); | ||
649 | |||
650 | item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); | ||
651 | if (!IS_ERR(item)) { | ||
652 | leaf = path->nodes[0]; | ||
653 | ret = 0; | ||
654 | goto found; | ||
655 | } | ||
656 | ret = PTR_ERR(item); | ||
657 | if (ret == -EFBIG) { | ||
658 | u32 item_size; | ||
659 | /* we found one, but it isn't big enough yet */ | ||
660 | leaf = path->nodes[0]; | ||
661 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
662 | if ((item_size / csum_size) >= | ||
663 | MAX_CSUM_ITEMS(root, csum_size)) { | ||
664 | /* already at max size, make a new one */ | ||
665 | goto insert; | ||
666 | } | ||
667 | } else { | ||
668 | int slot = path->slots[0] + 1; | ||
669 | /* we didn't find a csum item, insert one */ | ||
670 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
671 | if (path->slots[0] >= nritems - 1) { | ||
672 | ret = btrfs_next_leaf(root, path); | ||
673 | if (ret == 1) | ||
674 | found_next = 1; | ||
675 | if (ret != 0) | ||
676 | goto insert; | ||
677 | slot = 0; | ||
678 | } | ||
679 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); | ||
680 | if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || | ||
681 | found_key.type != BTRFS_EXTENT_CSUM_KEY) { | ||
682 | found_next = 1; | ||
683 | goto insert; | ||
684 | } | ||
685 | next_offset = found_key.offset; | ||
686 | found_next = 1; | ||
687 | goto insert; | ||
688 | } | ||
689 | |||
690 | /* | ||
691 | * at this point, we know the tree has an item, but it isn't big | ||
692 | * enough yet to put our csum in. Grow it | ||
693 | */ | ||
694 | btrfs_release_path(root, path); | ||
695 | ret = btrfs_search_slot(trans, root, &file_key, path, | ||
696 | csum_size, 1); | ||
697 | if (ret < 0) | ||
698 | goto fail_unlock; | ||
699 | |||
700 | if (ret > 0) { | ||
701 | if (path->slots[0] == 0) | ||
702 | goto insert; | ||
703 | path->slots[0]--; | ||
704 | } | ||
705 | |||
706 | leaf = path->nodes[0]; | ||
707 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
708 | csum_offset = (bytenr - found_key.offset) >> | ||
709 | root->fs_info->sb->s_blocksize_bits; | ||
710 | |||
711 | if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || | ||
712 | found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || | ||
713 | csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { | ||
714 | goto insert; | ||
715 | } | ||
716 | |||
717 | if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / | ||
718 | csum_size) { | ||
719 | u32 diff = (csum_offset + 1) * csum_size; | ||
720 | |||
721 | /* | ||
722 | * is the item big enough already? we dropped our lock | ||
723 | * before and need to recheck | ||
724 | */ | ||
725 | if (diff < btrfs_item_size_nr(leaf, path->slots[0])) | ||
726 | goto csum; | ||
727 | |||
728 | diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); | ||
729 | if (diff != csum_size) | ||
730 | goto insert; | ||
731 | |||
732 | ret = btrfs_extend_item(trans, root, path, diff); | ||
733 | BUG_ON(ret); | ||
734 | goto csum; | ||
735 | } | ||
736 | |||
737 | insert: | ||
738 | btrfs_release_path(root, path); | ||
739 | csum_offset = 0; | ||
740 | if (found_next) { | ||
741 | u64 tmp = total_bytes + root->sectorsize; | ||
742 | u64 next_sector = sector_sum->bytenr; | ||
743 | struct btrfs_sector_sum *next = sector_sum + 1; | ||
744 | |||
745 | while (tmp < sums->len) { | ||
746 | if (next_sector + root->sectorsize != next->bytenr) | ||
747 | break; | ||
748 | tmp += root->sectorsize; | ||
749 | next_sector = next->bytenr; | ||
750 | next++; | ||
751 | } | ||
752 | tmp = min(tmp, next_offset - file_key.offset); | ||
753 | tmp >>= root->fs_info->sb->s_blocksize_bits; | ||
754 | tmp = max((u64)1, tmp); | ||
755 | tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); | ||
756 | ins_size = csum_size * tmp; | ||
757 | } else { | ||
758 | ins_size = csum_size; | ||
759 | } | ||
760 | ret = btrfs_insert_empty_item(trans, root, path, &file_key, | ||
761 | ins_size); | ||
762 | if (ret < 0) | ||
763 | goto fail_unlock; | ||
764 | if (ret != 0) { | ||
765 | WARN_ON(1); | ||
766 | goto fail_unlock; | ||
767 | } | ||
768 | csum: | ||
769 | leaf = path->nodes[0]; | ||
770 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); | ||
771 | ret = 0; | ||
772 | item = (struct btrfs_csum_item *)((unsigned char *)item + | ||
773 | csum_offset * csum_size); | ||
774 | found: | ||
775 | item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); | ||
776 | item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + | ||
777 | btrfs_item_size_nr(leaf, path->slots[0])); | ||
778 | eb_token = NULL; | ||
779 | cond_resched(); | ||
780 | next_sector: | ||
781 | |||
782 | if (!eb_token || | ||
783 | (unsigned long)item + csum_size >= map_start + map_len) { | ||
784 | int err; | ||
785 | |||
786 | if (eb_token) | ||
787 | unmap_extent_buffer(leaf, eb_token, KM_USER1); | ||
788 | eb_token = NULL; | ||
789 | err = map_private_extent_buffer(leaf, (unsigned long)item, | ||
790 | csum_size, | ||
791 | &eb_token, &eb_map, | ||
792 | &map_start, &map_len, KM_USER1); | ||
793 | if (err) | ||
794 | eb_token = NULL; | ||
795 | } | ||
796 | if (eb_token) { | ||
797 | memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), | ||
798 | &sector_sum->sum, csum_size); | ||

799 | } else { | ||
800 | write_extent_buffer(leaf, &sector_sum->sum, | ||
801 | (unsigned long)item, csum_size); | ||
802 | } | ||
803 | |||
804 | total_bytes += root->sectorsize; | ||
805 | sector_sum++; | ||
806 | if (total_bytes < sums->len) { | ||
807 | item = (struct btrfs_csum_item *)((char *)item + | ||
808 | csum_size); | ||
809 | if (item < item_end && bytenr + PAGE_CACHE_SIZE == | ||
810 | sector_sum->bytenr) { | ||
811 | bytenr = sector_sum->bytenr; | ||
812 | goto next_sector; | ||
813 | } | ||
814 | } | ||
815 | if (eb_token) { | ||
816 | unmap_extent_buffer(leaf, eb_token, KM_USER1); | ||
817 | eb_token = NULL; | ||
818 | } | ||
819 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
820 | cond_resched(); | ||
821 | if (total_bytes < sums->len) { | ||
822 | btrfs_release_path(root, path); | ||
823 | goto again; | ||
824 | } | ||
825 | out: | ||
826 | btrfs_free_path(path); | ||
827 | return ret; | ||
828 | |||
829 | fail_unlock: | ||
830 | goto out; | ||
831 | } | ||
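When a neighbouring csum item exists (found_next), the code above sizes the new item by counting contiguous sectors in the pending sums and clamping the result to MAX_CSUM_ITEMS. A standalone sketch of that contiguity scan, with the sums flattened into a plain array of byte numbers (an assumption made purely for illustration):

#include <stdint.h>
#include <stddef.h>

/* count how many leading entries of bytenrs[] form one contiguous run,
 * the way the insert path walks sector_sum entries to size a new item */
size_t contiguous_sectors(const uint64_t *bytenrs, size_t n,
			  uint64_t sectorsize)
{
	size_t i;

	if (n == 0)
		return 0;
	for (i = 1; i < n; i++) {
		if (bytenrs[i - 1] + sectorsize != bytenrs[i])
			break;
	}
	return i;
}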
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 000000000000..90268334145e --- /dev/null +++ b/fs/btrfs/file.c | |||
@@ -0,0 +1,1288 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/pagemap.h> | ||
21 | #include <linux/highmem.h> | ||
22 | #include <linux/time.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/string.h> | ||
25 | #include <linux/smp_lock.h> | ||
26 | #include <linux/backing-dev.h> | ||
27 | #include <linux/mpage.h> | ||
28 | #include <linux/swap.h> | ||
29 | #include <linux/writeback.h> | ||
30 | #include <linux/statfs.h> | ||
31 | #include <linux/compat.h> | ||
32 | #include <linux/version.h> | ||
33 | #include "ctree.h" | ||
34 | #include "disk-io.h" | ||
35 | #include "transaction.h" | ||
36 | #include "btrfs_inode.h" | ||
37 | #include "ioctl.h" | ||
38 | #include "print-tree.h" | ||
39 | #include "tree-log.h" | ||
40 | #include "locking.h" | ||
41 | #include "compat.h" | ||
42 | |||
43 | |||
44 | /* simple helper to fault in pages and copy. This should go away | ||
45 | * and be replaced with calls into generic code. | ||
46 | */ | ||
47 | static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, | ||
48 | int write_bytes, | ||
49 | struct page **prepared_pages, | ||
50 | const char __user *buf) | ||
51 | { | ||
52 | long page_fault = 0; | ||
53 | int i; | ||
54 | int offset = pos & (PAGE_CACHE_SIZE - 1); | ||
55 | |||
56 | for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { | ||
57 | size_t count = min_t(size_t, | ||
58 | PAGE_CACHE_SIZE - offset, write_bytes); | ||
59 | struct page *page = prepared_pages[i]; | ||
60 | fault_in_pages_readable(buf, count); | ||
61 | |||
62 | /* Copy data from userspace to the current page */ | ||
63 | kmap(page); | ||
64 | page_fault = __copy_from_user(page_address(page) + offset, | ||
65 | buf, count); | ||
66 | /* Flush processor's dcache for this page */ | ||
67 | flush_dcache_page(page); | ||
68 | kunmap(page); | ||
69 | buf += count; | ||
70 | write_bytes -= count; | ||
71 | |||
72 | if (page_fault) | ||
73 | break; | ||
74 | } | ||
75 | return page_fault ? -EFAULT : 0; | ||
76 | } | ||
77 | |||
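The helper above walks the write one page at a time, copying at most a page (minus the initial offset) per iteration. A self-contained sketch of the same offset/count walk, with plain buffers standing in for the page cache and a fixed 4K page size assumed:

#include <string.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SZ 4096

/* copy len bytes from src into an array of page-sized buffers, starting
 * at byte offset pos -- the same walk btrfs_copy_from_user does, minus
 * the user-space faulting and kmap */
void copy_into_pages(char pages[][PAGE_SZ], uint64_t pos,
		     const char *src, size_t len)
{
	size_t offset = pos & (PAGE_SZ - 1); /* only the first page is partial */
	size_t i = 0;

	while (len > 0) {
		size_t count = PAGE_SZ - offset;

		if (count > len)
			count = len;
		memcpy(pages[i] + offset, src, count);
		src += count;
		len -= count;
		offset = 0;
		i++;
	}
}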
78 | /* | ||
79 | * unlocks pages after btrfs_file_write is done with them | ||
80 | */ | ||
81 | static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) | ||
82 | { | ||
83 | size_t i; | ||
84 | for (i = 0; i < num_pages; i++) { | ||
85 | if (!pages[i]) | ||
86 | break; | ||
87 | /* page checked is some magic around finding pages that | ||
88 | * have been modified without going through btrfs_set_page_dirty; | ||
89 | * clear it here | ||
90 | */ | ||
91 | ClearPageChecked(pages[i]); | ||
92 | unlock_page(pages[i]); | ||
93 | mark_page_accessed(pages[i]); | ||
94 | page_cache_release(pages[i]); | ||
95 | } | ||
96 | } | ||
97 | |||
98 | /* | ||
99 | * after copy_from_user, pages need to be dirtied and we need to make | ||
100 | * sure holes are created between the current EOF and the start of | ||
101 | * any next extents (if required). | ||
102 | * | ||
103 | * this also makes the decision about creating an inline extent vs | ||
104 | * doing real data extents, marking pages dirty and delalloc as required. | ||
105 | */ | ||
106 | static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, | ||
107 | struct btrfs_root *root, | ||
108 | struct file *file, | ||
109 | struct page **pages, | ||
110 | size_t num_pages, | ||
111 | loff_t pos, | ||
112 | size_t write_bytes) | ||
113 | { | ||
114 | int err = 0; | ||
115 | int i; | ||
116 | struct inode *inode = fdentry(file)->d_inode; | ||
117 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
118 | u64 hint_byte; | ||
119 | u64 num_bytes; | ||
120 | u64 start_pos; | ||
121 | u64 end_of_last_block; | ||
122 | u64 end_pos = pos + write_bytes; | ||
123 | loff_t isize = i_size_read(inode); | ||
124 | |||
125 | start_pos = pos & ~((u64)root->sectorsize - 1); | ||
126 | num_bytes = (write_bytes + pos - start_pos + | ||
127 | root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | ||
128 | |||
129 | end_of_last_block = start_pos + num_bytes - 1; | ||
130 | |||
131 | lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); | ||
132 | trans = btrfs_join_transaction(root, 1); | ||
133 | if (!trans) { | ||
134 | err = -ENOMEM; | ||
135 | goto out_unlock; | ||
136 | } | ||
137 | btrfs_set_trans_block_group(trans, inode); | ||
138 | hint_byte = 0; | ||
139 | |||
140 | set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); | ||
141 | |||
142 | /* check for reserved extents on each page, we don't want | ||
143 | * to reset the delalloc bit on things that already have | ||
144 | * extents reserved. | ||
145 | */ | ||
146 | btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); | ||
147 | for (i = 0; i < num_pages; i++) { | ||
148 | struct page *p = pages[i]; | ||
149 | SetPageUptodate(p); | ||
150 | ClearPageChecked(p); | ||
151 | set_page_dirty(p); | ||
152 | } | ||
153 | if (end_pos > isize) { | ||
154 | i_size_write(inode, end_pos); | ||
155 | btrfs_update_inode(trans, root, inode); | ||
156 | } | ||
157 | err = btrfs_end_transaction(trans, root); | ||
158 | out_unlock: | ||
159 | unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); | ||
160 | return err; | ||
161 | } | ||
162 | |||
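The sector rounding at the top of dirty_and_release_pages() is easiest to see with a worked example; the sketch below replays it for a small unaligned write, assuming a 4K sectorsize (the mask trick requires sectorsize to be a power of two):

#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t pos = 5000, write_bytes = 100;

	/* round the start down and the length up to sector boundaries */
	uint64_t start_pos = pos & ~(sectorsize - 1);
	uint64_t num_bytes = (write_bytes + pos - start_pos +
			      sectorsize - 1) & ~(sectorsize - 1);

	assert(start_pos == 4096);
	assert(num_bytes == 4096); /* bytes 5000..5099 fit in one sector */
	return 0;
}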
163 | /* | ||
164 | * this drops all the extents in the cache that intersect the range | ||
165 | * [start, end]. Existing extents are split as required. | ||
166 | */ | ||
167 | int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | ||
168 | int skip_pinned) | ||
169 | { | ||
170 | struct extent_map *em; | ||
171 | struct extent_map *split = NULL; | ||
172 | struct extent_map *split2 = NULL; | ||
173 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
174 | u64 len = end - start + 1; | ||
175 | int ret; | ||
176 | int testend = 1; | ||
177 | unsigned long flags; | ||
178 | int compressed = 0; | ||
179 | |||
180 | WARN_ON(end < start); | ||
181 | if (end == (u64)-1) { | ||
182 | len = (u64)-1; | ||
183 | testend = 0; | ||
184 | } | ||
185 | while (1) { | ||
186 | if (!split) | ||
187 | split = alloc_extent_map(GFP_NOFS); | ||
188 | if (!split2) | ||
189 | split2 = alloc_extent_map(GFP_NOFS); | ||
190 | |||
191 | spin_lock(&em_tree->lock); | ||
192 | em = lookup_extent_mapping(em_tree, start, len); | ||
193 | if (!em) { | ||
194 | spin_unlock(&em_tree->lock); | ||
195 | break; | ||
196 | } | ||
197 | flags = em->flags; | ||
198 | if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { | ||
199 | spin_unlock(&em_tree->lock); | ||
200 | if (em->start <= start && | ||
201 | (!testend || em->start + em->len >= start + len)) { | ||
202 | free_extent_map(em); | ||
203 | break; | ||
204 | } | ||
205 | if (start < em->start) { | ||
206 | len = em->start - start; | ||
207 | } else { | ||
208 | len = start + len - (em->start + em->len); | ||
209 | start = em->start + em->len; | ||
210 | } | ||
211 | free_extent_map(em); | ||
212 | continue; | ||
213 | } | ||
214 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | ||
215 | clear_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
216 | remove_extent_mapping(em_tree, em); | ||
217 | |||
218 | if (em->block_start < EXTENT_MAP_LAST_BYTE && | ||
219 | em->start < start) { | ||
220 | split->start = em->start; | ||
221 | split->len = start - em->start; | ||
222 | split->orig_start = em->orig_start; | ||
223 | split->block_start = em->block_start; | ||
224 | |||
225 | if (compressed) | ||
226 | split->block_len = em->block_len; | ||
227 | else | ||
228 | split->block_len = split->len; | ||
229 | |||
230 | split->bdev = em->bdev; | ||
231 | split->flags = flags; | ||
232 | ret = add_extent_mapping(em_tree, split); | ||
233 | BUG_ON(ret); | ||
234 | free_extent_map(split); | ||
235 | split = split2; | ||
236 | split2 = NULL; | ||
237 | } | ||
238 | if (em->block_start < EXTENT_MAP_LAST_BYTE && | ||
239 | testend && em->start + em->len > start + len) { | ||
240 | u64 diff = start + len - em->start; | ||
241 | |||
242 | split->start = start + len; | ||
243 | split->len = em->start + em->len - (start + len); | ||
244 | split->bdev = em->bdev; | ||
245 | split->flags = flags; | ||
246 | |||
247 | if (compressed) { | ||
248 | split->block_len = em->block_len; | ||
249 | split->block_start = em->block_start; | ||
250 | split->orig_start = em->orig_start; | ||
251 | } else { | ||
252 | split->block_len = split->len; | ||
253 | split->block_start = em->block_start + diff; | ||
254 | split->orig_start = split->start; | ||
255 | } | ||
256 | |||
257 | ret = add_extent_mapping(em_tree, split); | ||
258 | BUG_ON(ret); | ||
259 | free_extent_map(split); | ||
260 | split = NULL; | ||
261 | } | ||
262 | spin_unlock(&em_tree->lock); | ||
263 | |||
264 | /* once for us */ | ||
265 | free_extent_map(em); | ||
266 | /* once for the tree */ | ||
267 | free_extent_map(em); | ||
268 | } | ||
269 | if (split) | ||
270 | free_extent_map(split); | ||
271 | if (split2) | ||
272 | free_extent_map(split2); | ||
273 | return 0; | ||
274 | } | ||
275 | |||
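Set aside the pinned/compressed bookkeeping and the splitting that btrfs_drop_extent_cache() performs reduces to interval arithmetic. A sketch of just that arithmetic, using a hypothetical range struct:

#include <stdint.h>

struct range { uint64_t start, len; };

/* split a cached extent [em.start, em.start + em.len) around a dropped
 * range [start, start + len), as the loop above does. Returns how many
 * surviving pieces were written to out[0..1]. */
int split_extent(struct range em, uint64_t start, uint64_t len,
		 struct range out[2])
{
	uint64_t end = start + len;
	uint64_t em_end = em.start + em.len;
	int n = 0;

	if (em.start < start) {             /* left piece survives */
		out[n].start = em.start;
		out[n].len = start - em.start;
		n++;
	}
	if (em_end > end) {                 /* right piece survives */
		out[n].start = end;
		out[n].len = em_end - end;
		n++;
	}
	return n;
}

In the real code each surviving piece also inherits block_start, block_len and flags from the original mapping, with the compressed case keeping the full on-disk length.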
276 | int btrfs_check_file(struct btrfs_root *root, struct inode *inode) | ||
277 | { | ||
278 | return 0; | ||
279 | #if 0 | ||
280 | struct btrfs_path *path; | ||
281 | struct btrfs_key found_key; | ||
282 | struct extent_buffer *leaf; | ||
283 | struct btrfs_file_extent_item *extent; | ||
284 | u64 last_offset = 0; | ||
285 | int nritems; | ||
286 | int slot; | ||
287 | int found_type; | ||
288 | int ret; | ||
289 | int err = 0; | ||
290 | u64 extent_end = 0; | ||
291 | |||
292 | path = btrfs_alloc_path(); | ||
293 | ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, | ||
294 | last_offset, 0); | ||
295 | while (1) { | ||
296 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
297 | if (path->slots[0] >= nritems) { | ||
298 | ret = btrfs_next_leaf(root, path); | ||
299 | if (ret) | ||
300 | goto out; | ||
301 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
302 | } | ||
303 | slot = path->slots[0]; | ||
304 | leaf = path->nodes[0]; | ||
305 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
306 | if (found_key.objectid != inode->i_ino) | ||
307 | break; | ||
308 | if (found_key.type != BTRFS_EXTENT_DATA_KEY) | ||
309 | goto out; | ||
310 | |||
311 | if (found_key.offset < last_offset) { | ||
312 | WARN_ON(1); | ||
313 | btrfs_print_leaf(root, leaf); | ||
314 | printk(KERN_ERR "inode %lu found offset %llu " | ||
315 | "expected %llu\n", inode->i_ino, | ||
316 | (unsigned long long)found_key.offset, | ||
317 | (unsigned long long)last_offset); | ||
318 | err = 1; | ||
319 | goto out; | ||
320 | } | ||
321 | extent = btrfs_item_ptr(leaf, slot, | ||
322 | struct btrfs_file_extent_item); | ||
323 | found_type = btrfs_file_extent_type(leaf, extent); | ||
324 | if (found_type == BTRFS_FILE_EXTENT_REG) { | ||
325 | extent_end = found_key.offset + | ||
326 | btrfs_file_extent_num_bytes(leaf, extent); | ||
327 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | ||
328 | struct btrfs_item *item; | ||
329 | item = btrfs_item_nr(leaf, slot); | ||
330 | extent_end = found_key.offset + | ||
331 | btrfs_file_extent_inline_len(leaf, extent); | ||
332 | extent_end = (extent_end + root->sectorsize - 1) & | ||
333 | ~((u64)root->sectorsize - 1); | ||
334 | } | ||
335 | last_offset = extent_end; | ||
336 | path->slots[0]++; | ||
337 | } | ||
338 | if (0 && last_offset < inode->i_size) { | ||
339 | WARN_ON(1); | ||
340 | btrfs_print_leaf(root, leaf); | ||
341 | printk(KERN_ERR "inode %lu found offset %llu size %llu\n", | ||
342 | inode->i_ino, (unsigned long long)last_offset, | ||
343 | (unsigned long long)inode->i_size); | ||
344 | err = 1; | ||
345 | |||
346 | } | ||
347 | out: | ||
348 | btrfs_free_path(path); | ||
349 | return err; | ||
350 | #endif | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * this is very complex, but the basic idea is to drop all extents | ||
355 | * in the range start - end. hint_byte is filled in with a disk byte that | ||
356 | * that would be a good hint to the block allocator for this file. | ||
357 | * | ||
358 | * If an extent intersects the range but is not entirely inside the range | ||
359 | * it is either truncated or split. Anything entirely inside the range | ||
360 | * is deleted from the tree. | ||
361 | * | ||
362 | * inline_limit is used to tell this code which offsets in the file to keep | ||
363 | * if they contain inline extents. | ||
364 | */ | ||
365 | noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, | ||
366 | struct btrfs_root *root, struct inode *inode, | ||
367 | u64 start, u64 end, u64 inline_limit, u64 *hint_byte) | ||
368 | { | ||
369 | u64 extent_end = 0; | ||
370 | u64 locked_end = end; | ||
371 | u64 search_start = start; | ||
372 | u64 leaf_start; | ||
373 | u64 ram_bytes = 0; | ||
374 | u64 orig_parent = 0; | ||
375 | u64 disk_bytenr = 0; | ||
376 | u8 compression; | ||
377 | u8 encryption; | ||
378 | u16 other_encoding = 0; | ||
379 | u64 root_gen; | ||
380 | u64 root_owner; | ||
381 | struct extent_buffer *leaf; | ||
382 | struct btrfs_file_extent_item *extent; | ||
383 | struct btrfs_path *path; | ||
384 | struct btrfs_key key; | ||
385 | struct btrfs_file_extent_item old; | ||
386 | int keep; | ||
387 | int slot; | ||
388 | int bookend; | ||
389 | int found_type = 0; | ||
390 | int found_extent; | ||
391 | int found_inline; | ||
392 | int recow; | ||
393 | int ret; | ||
394 | |||
395 | inline_limit = 0; | ||
396 | btrfs_drop_extent_cache(inode, start, end - 1, 0); | ||
397 | |||
398 | path = btrfs_alloc_path(); | ||
399 | if (!path) | ||
400 | return -ENOMEM; | ||
401 | while (1) { | ||
402 | recow = 0; | ||
403 | btrfs_release_path(root, path); | ||
404 | ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, | ||
405 | search_start, -1); | ||
406 | if (ret < 0) | ||
407 | goto out; | ||
408 | if (ret > 0) { | ||
409 | if (path->slots[0] == 0) { | ||
410 | ret = 0; | ||
411 | goto out; | ||
412 | } | ||
413 | path->slots[0]--; | ||
414 | } | ||
415 | next_slot: | ||
416 | keep = 0; | ||
417 | bookend = 0; | ||
418 | found_extent = 0; | ||
419 | found_inline = 0; | ||
420 | leaf_start = 0; | ||
421 | root_gen = 0; | ||
422 | root_owner = 0; | ||
423 | compression = 0; | ||
424 | encryption = 0; | ||
425 | extent = NULL; | ||
426 | leaf = path->nodes[0]; | ||
427 | slot = path->slots[0]; | ||
428 | ret = 0; | ||
429 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
430 | if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && | ||
431 | key.offset >= end) { | ||
432 | goto out; | ||
433 | } | ||
434 | if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || | ||
435 | key.objectid != inode->i_ino) { | ||
436 | goto out; | ||
437 | } | ||
438 | if (recow) { | ||
439 | search_start = max(key.offset, start); | ||
440 | continue; | ||
441 | } | ||
442 | if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { | ||
443 | extent = btrfs_item_ptr(leaf, slot, | ||
444 | struct btrfs_file_extent_item); | ||
445 | found_type = btrfs_file_extent_type(leaf, extent); | ||
446 | compression = btrfs_file_extent_compression(leaf, | ||
447 | extent); | ||
448 | encryption = btrfs_file_extent_encryption(leaf, | ||
449 | extent); | ||
450 | other_encoding = btrfs_file_extent_other_encoding(leaf, | ||
451 | extent); | ||
452 | if (found_type == BTRFS_FILE_EXTENT_REG || | ||
453 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
454 | extent_end = | ||
455 | btrfs_file_extent_disk_bytenr(leaf, | ||
456 | extent); | ||
457 | if (extent_end) | ||
458 | *hint_byte = extent_end; | ||
459 | |||
460 | extent_end = key.offset + | ||
461 | btrfs_file_extent_num_bytes(leaf, extent); | ||
462 | ram_bytes = btrfs_file_extent_ram_bytes(leaf, | ||
463 | extent); | ||
464 | found_extent = 1; | ||
465 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | ||
466 | found_inline = 1; | ||
467 | extent_end = key.offset + | ||
468 | btrfs_file_extent_inline_len(leaf, extent); | ||
469 | } | ||
470 | } else { | ||
471 | extent_end = search_start; | ||
472 | } | ||
473 | |||
474 | /* we found nothing we can drop */ | ||
475 | if ((!found_extent && !found_inline) || | ||
476 | search_start >= extent_end) { | ||
477 | int nextret; | ||
478 | u32 nritems; | ||
479 | nritems = btrfs_header_nritems(leaf); | ||
480 | if (slot >= nritems - 1) { | ||
481 | nextret = btrfs_next_leaf(root, path); | ||
482 | if (nextret) | ||
483 | goto out; | ||
484 | recow = 1; | ||
485 | } else { | ||
486 | path->slots[0]++; | ||
487 | } | ||
488 | goto next_slot; | ||
489 | } | ||
490 | |||
491 | if (end <= extent_end && start >= key.offset && found_inline) | ||
492 | *hint_byte = EXTENT_MAP_INLINE; | ||
493 | |||
494 | if (found_extent) { | ||
495 | read_extent_buffer(leaf, &old, (unsigned long)extent, | ||
496 | sizeof(old)); | ||
497 | root_gen = btrfs_header_generation(leaf); | ||
498 | root_owner = btrfs_header_owner(leaf); | ||
499 | leaf_start = leaf->start; | ||
500 | } | ||
501 | |||
502 | if (end < extent_end && end >= key.offset) { | ||
503 | bookend = 1; | ||
504 | if (found_inline && start <= key.offset) | ||
505 | keep = 1; | ||
506 | } | ||
507 | |||
508 | if (bookend && found_extent) { | ||
509 | if (locked_end < extent_end) { | ||
510 | ret = try_lock_extent(&BTRFS_I(inode)->io_tree, | ||
511 | locked_end, extent_end - 1, | ||
512 | GFP_NOFS); | ||
513 | if (!ret) { | ||
514 | btrfs_release_path(root, path); | ||
515 | lock_extent(&BTRFS_I(inode)->io_tree, | ||
516 | locked_end, extent_end - 1, | ||
517 | GFP_NOFS); | ||
518 | locked_end = extent_end; | ||
519 | continue; | ||
520 | } | ||
521 | locked_end = extent_end; | ||
522 | } | ||
523 | orig_parent = path->nodes[0]->start; | ||
524 | disk_bytenr = le64_to_cpu(old.disk_bytenr); | ||
525 | if (disk_bytenr != 0) { | ||
526 | ret = btrfs_inc_extent_ref(trans, root, | ||
527 | disk_bytenr, | ||
528 | le64_to_cpu(old.disk_num_bytes), | ||
529 | orig_parent, root->root_key.objectid, | ||
530 | trans->transid, inode->i_ino); | ||
531 | BUG_ON(ret); | ||
532 | } | ||
533 | } | ||
534 | |||
535 | if (found_inline) { | ||
536 | u64 mask = root->sectorsize - 1; | ||
537 | search_start = (extent_end + mask) & ~mask; | ||
538 | } else | ||
539 | search_start = extent_end; | ||
540 | |||
541 | /* truncate existing extent */ | ||
542 | if (start > key.offset) { | ||
543 | u64 new_num; | ||
544 | u64 old_num; | ||
545 | keep = 1; | ||
546 | WARN_ON(start & (root->sectorsize - 1)); | ||
547 | if (found_extent) { | ||
548 | new_num = start - key.offset; | ||
549 | old_num = btrfs_file_extent_num_bytes(leaf, | ||
550 | extent); | ||
551 | *hint_byte = | ||
552 | btrfs_file_extent_disk_bytenr(leaf, | ||
553 | extent); | ||
554 | if (btrfs_file_extent_disk_bytenr(leaf, | ||
555 | extent)) { | ||
556 | inode_sub_bytes(inode, old_num - | ||
557 | new_num); | ||
558 | } | ||
559 | btrfs_set_file_extent_num_bytes(leaf, | ||
560 | extent, new_num); | ||
561 | btrfs_mark_buffer_dirty(leaf); | ||
562 | } else if (key.offset < inline_limit && | ||
563 | (end > extent_end) && | ||
564 | (inline_limit < extent_end)) { | ||
565 | u32 new_size; | ||
566 | new_size = btrfs_file_extent_calc_inline_size( | ||
567 | inline_limit - key.offset); | ||
568 | inode_sub_bytes(inode, extent_end - | ||
569 | inline_limit); | ||
570 | btrfs_set_file_extent_ram_bytes(leaf, extent, | ||
571 | new_size); | ||
572 | if (!compression && !encryption) { | ||
573 | btrfs_truncate_item(trans, root, path, | ||
574 | new_size, 1); | ||
575 | } | ||
576 | } | ||
577 | } | ||
578 | /* delete the entire extent */ | ||
579 | if (!keep) { | ||
580 | if (found_inline) | ||
581 | inode_sub_bytes(inode, extent_end - | ||
582 | key.offset); | ||
583 | ret = btrfs_del_item(trans, root, path); | ||
584 | /* TODO update progress marker and return */ | ||
585 | BUG_ON(ret); | ||
586 | extent = NULL; | ||
587 | btrfs_release_path(root, path); | ||
588 | /* the extent will be freed later */ | ||
589 | } | ||
590 | if (bookend && found_inline && start <= key.offset) { | ||
591 | u32 new_size; | ||
592 | new_size = btrfs_file_extent_calc_inline_size( | ||
593 | extent_end - end); | ||
594 | inode_sub_bytes(inode, end - key.offset); | ||
595 | btrfs_set_file_extent_ram_bytes(leaf, extent, | ||
596 | new_size); | ||
597 | if (!compression && !encryption) | ||
598 | ret = btrfs_truncate_item(trans, root, path, | ||
599 | new_size, 0); | ||
600 | BUG_ON(ret); | ||
601 | } | ||
602 | /* create bookend, splitting the extent in two */ | ||
603 | if (bookend && found_extent) { | ||
604 | struct btrfs_key ins; | ||
605 | ins.objectid = inode->i_ino; | ||
606 | ins.offset = end; | ||
607 | btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); | ||
608 | |||
609 | btrfs_release_path(root, path); | ||
610 | ret = btrfs_insert_empty_item(trans, root, path, &ins, | ||
611 | sizeof(*extent)); | ||
612 | BUG_ON(ret); | ||
613 | |||
614 | leaf = path->nodes[0]; | ||
615 | extent = btrfs_item_ptr(leaf, path->slots[0], | ||
616 | struct btrfs_file_extent_item); | ||
617 | write_extent_buffer(leaf, &old, | ||
618 | (unsigned long)extent, sizeof(old)); | ||
619 | |||
620 | btrfs_set_file_extent_compression(leaf, extent, | ||
621 | compression); | ||
622 | btrfs_set_file_extent_encryption(leaf, extent, | ||
623 | encryption); | ||
624 | btrfs_set_file_extent_other_encoding(leaf, extent, | ||
625 | other_encoding); | ||
626 | btrfs_set_file_extent_offset(leaf, extent, | ||
627 | le64_to_cpu(old.offset) + end - key.offset); | ||
628 | WARN_ON(le64_to_cpu(old.num_bytes) < | ||
629 | (extent_end - end)); | ||
630 | btrfs_set_file_extent_num_bytes(leaf, extent, | ||
631 | extent_end - end); | ||
632 | |||
633 | /* | ||
634 | * set the ram bytes to the size of the full extent | ||
635 | * before splitting. This is a worst case flag, | ||
636 | * but it's the best we can do because we don't know | ||
637 | * how splitting affects compression | ||
638 | */ | ||
639 | btrfs_set_file_extent_ram_bytes(leaf, extent, | ||
640 | ram_bytes); | ||
641 | btrfs_set_file_extent_type(leaf, extent, found_type); | ||
642 | |||
643 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
644 | |||
645 | if (disk_bytenr != 0) { | ||
646 | ret = btrfs_update_extent_ref(trans, root, | ||
647 | disk_bytenr, orig_parent, | ||
648 | leaf->start, | ||
649 | root->root_key.objectid, | ||
650 | trans->transid, ins.objectid); | ||
651 | |||
652 | BUG_ON(ret); | ||
653 | } | ||
654 | btrfs_release_path(root, path); | ||
655 | if (disk_bytenr != 0) | ||
656 | inode_add_bytes(inode, extent_end - end); | ||
657 | } | ||
658 | |||
659 | if (found_extent && !keep) { | ||
660 | u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); | ||
661 | |||
662 | if (old_disk_bytenr != 0) { | ||
663 | inode_sub_bytes(inode, | ||
664 | le64_to_cpu(old.num_bytes)); | ||
665 | ret = btrfs_free_extent(trans, root, | ||
666 | old_disk_bytenr, | ||
667 | le64_to_cpu(old.disk_num_bytes), | ||
668 | leaf_start, root_owner, | ||
669 | root_gen, key.objectid, 0); | ||
670 | BUG_ON(ret); | ||
671 | *hint_byte = old_disk_bytenr; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | if (search_start >= end) { | ||
676 | ret = 0; | ||
677 | goto out; | ||
678 | } | ||
679 | } | ||
680 | out: | ||
681 | btrfs_free_path(path); | ||
682 | if (locked_end > end) { | ||
683 | unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, | ||
684 | GFP_NOFS); | ||
685 | } | ||
686 | btrfs_check_file(root, inode); | ||
687 | return ret; | ||
688 | } | ||
689 | |||
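The bookend extent built near the end of btrfs_drop_extents() reuses the old disk extent but points deeper into it. A small worked example of that offset math, with illustrative numbers:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint64_t key_offset = 0;      /* file offset of the old extent */
	uint64_t extent_end = 16384;  /* file offset past the old extent */
	uint64_t old_offset = 0;      /* old offset into the disk extent */
	uint64_t end = 8192;          /* where the drop range stops */

	/* the bookend starts at 'end' in the file and skips everything
	 * the drop range consumed within the shared disk extent */
	uint64_t new_offset = old_offset + end - key_offset;
	uint64_t new_num_bytes = extent_end - end;

	assert(new_offset == 8192);
	assert(new_num_bytes == 8192);
	return 0;
}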
690 | static int extent_mergeable(struct extent_buffer *leaf, int slot, | ||
691 | u64 objectid, u64 bytenr, u64 *start, u64 *end) | ||
692 | { | ||
693 | struct btrfs_file_extent_item *fi; | ||
694 | struct btrfs_key key; | ||
695 | u64 extent_end; | ||
696 | |||
697 | if (slot < 0 || slot >= btrfs_header_nritems(leaf)) | ||
698 | return 0; | ||
699 | |||
700 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
701 | if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) | ||
702 | return 0; | ||
703 | |||
704 | fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); | ||
705 | if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || | ||
706 | btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || | ||
707 | btrfs_file_extent_compression(leaf, fi) || | ||
708 | btrfs_file_extent_encryption(leaf, fi) || | ||
709 | btrfs_file_extent_other_encoding(leaf, fi)) | ||
710 | return 0; | ||
711 | |||
712 | extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); | ||
713 | if ((*start && *start != key.offset) || (*end && *end != extent_end)) | ||
714 | return 0; | ||
715 | |||
716 | *start = key.offset; | ||
717 | *end = extent_end; | ||
718 | return 1; | ||
719 | } | ||
720 | |||
721 | /* | ||
722 | * Mark extent in the range start - end as written. | ||
723 | * | ||
724 | * This changes extent type from 'pre-allocated' to 'regular'. If only | ||
725 | * part of extent is marked as written, the extent will be split into | ||
726 | * two or three. | ||
727 | */ | ||
728 | int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, | ||
729 | struct btrfs_root *root, | ||
730 | struct inode *inode, u64 start, u64 end) | ||
731 | { | ||
732 | struct extent_buffer *leaf; | ||
733 | struct btrfs_path *path; | ||
734 | struct btrfs_file_extent_item *fi; | ||
735 | struct btrfs_key key; | ||
736 | u64 bytenr; | ||
737 | u64 num_bytes; | ||
738 | u64 extent_end; | ||
739 | u64 extent_offset; | ||
740 | u64 other_start; | ||
741 | u64 other_end; | ||
742 | u64 split = start; | ||
743 | u64 locked_end = end; | ||
744 | u64 orig_parent; | ||
745 | int extent_type; | ||
746 | int split_end = 1; | ||
747 | int ret; | ||
748 | |||
749 | btrfs_drop_extent_cache(inode, start, end - 1, 0); | ||
750 | |||
751 | path = btrfs_alloc_path(); | ||
752 | BUG_ON(!path); | ||
753 | again: | ||
754 | key.objectid = inode->i_ino; | ||
755 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
756 | if (split == start) | ||
757 | key.offset = split; | ||
758 | else | ||
759 | key.offset = split - 1; | ||
760 | |||
761 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
762 | if (ret > 0 && path->slots[0] > 0) | ||
763 | path->slots[0]--; | ||
764 | |||
765 | leaf = path->nodes[0]; | ||
766 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
767 | BUG_ON(key.objectid != inode->i_ino || | ||
768 | key.type != BTRFS_EXTENT_DATA_KEY); | ||
769 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
770 | struct btrfs_file_extent_item); | ||
771 | extent_type = btrfs_file_extent_type(leaf, fi); | ||
772 | BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); | ||
773 | extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); | ||
774 | BUG_ON(key.offset > start || extent_end < end); | ||
775 | |||
776 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
777 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); | ||
778 | extent_offset = btrfs_file_extent_offset(leaf, fi); | ||
779 | |||
780 | if (key.offset == start) | ||
781 | split = end; | ||
782 | |||
783 | if (key.offset == start && extent_end == end) { | ||
784 | int del_nr = 0; | ||
785 | int del_slot = 0; | ||
786 | u64 leaf_owner = btrfs_header_owner(leaf); | ||
787 | u64 leaf_gen = btrfs_header_generation(leaf); | ||
788 | other_start = end; | ||
789 | other_end = 0; | ||
790 | if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, | ||
791 | bytenr, &other_start, &other_end)) { | ||
792 | extent_end = other_end; | ||
793 | del_slot = path->slots[0] + 1; | ||
794 | del_nr++; | ||
795 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | ||
796 | leaf->start, leaf_owner, | ||
797 | leaf_gen, inode->i_ino, 0); | ||
798 | BUG_ON(ret); | ||
799 | } | ||
800 | other_start = 0; | ||
801 | other_end = start; | ||
802 | if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, | ||
803 | bytenr, &other_start, &other_end)) { | ||
804 | key.offset = other_start; | ||
805 | del_slot = path->slots[0]; | ||
806 | del_nr++; | ||
807 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | ||
808 | leaf->start, leaf_owner, | ||
809 | leaf_gen, inode->i_ino, 0); | ||
810 | BUG_ON(ret); | ||
811 | } | ||
812 | split_end = 0; | ||
813 | if (del_nr == 0) { | ||
814 | btrfs_set_file_extent_type(leaf, fi, | ||
815 | BTRFS_FILE_EXTENT_REG); | ||
816 | goto done; | ||
817 | } | ||
818 | |||
819 | fi = btrfs_item_ptr(leaf, del_slot - 1, | ||
820 | struct btrfs_file_extent_item); | ||
821 | btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); | ||
822 | btrfs_set_file_extent_num_bytes(leaf, fi, | ||
823 | extent_end - key.offset); | ||
824 | btrfs_mark_buffer_dirty(leaf); | ||
825 | |||
826 | ret = btrfs_del_items(trans, root, path, del_slot, del_nr); | ||
827 | BUG_ON(ret); | ||
828 | goto done; | ||
829 | } else if (split == start) { | ||
830 | if (locked_end < extent_end) { | ||
831 | ret = try_lock_extent(&BTRFS_I(inode)->io_tree, | ||
832 | locked_end, extent_end - 1, GFP_NOFS); | ||
833 | if (!ret) { | ||
834 | btrfs_release_path(root, path); | ||
835 | lock_extent(&BTRFS_I(inode)->io_tree, | ||
836 | locked_end, extent_end - 1, GFP_NOFS); | ||
837 | locked_end = extent_end; | ||
838 | goto again; | ||
839 | } | ||
840 | locked_end = extent_end; | ||
841 | } | ||
842 | btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); | ||
843 | extent_offset += split - key.offset; | ||
844 | } else { | ||
845 | BUG_ON(key.offset != start); | ||
846 | btrfs_set_file_extent_offset(leaf, fi, extent_offset + | ||
847 | split - key.offset); | ||
848 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); | ||
849 | key.offset = split; | ||
850 | btrfs_set_item_key_safe(trans, root, path, &key); | ||
851 | extent_end = split; | ||
852 | } | ||
853 | |||
854 | if (extent_end == end) { | ||
855 | split_end = 0; | ||
856 | extent_type = BTRFS_FILE_EXTENT_REG; | ||
857 | } | ||
858 | if (extent_end == end && split == start) { | ||
859 | other_start = end; | ||
860 | other_end = 0; | ||
861 | if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, | ||
862 | bytenr, &other_start, &other_end)) { | ||
863 | path->slots[0]++; | ||
864 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
865 | struct btrfs_file_extent_item); | ||
866 | key.offset = split; | ||
867 | btrfs_set_item_key_safe(trans, root, path, &key); | ||
868 | btrfs_set_file_extent_offset(leaf, fi, extent_offset); | ||
869 | btrfs_set_file_extent_num_bytes(leaf, fi, | ||
870 | other_end - split); | ||
871 | goto done; | ||
872 | } | ||
873 | } | ||
874 | if (extent_end == end && split == end) { | ||
875 | other_start = 0; | ||
876 | other_end = start; | ||
877 | if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, | ||
878 | bytenr, &other_start, &other_end)) { | ||
879 | path->slots[0]--; | ||
880 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
881 | struct btrfs_file_extent_item); | ||
882 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - | ||
883 | other_start); | ||
884 | goto done; | ||
885 | } | ||
886 | } | ||
887 | |||
888 | btrfs_mark_buffer_dirty(leaf); | ||
889 | |||
890 | orig_parent = leaf->start; | ||
891 | ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, | ||
892 | orig_parent, root->root_key.objectid, | ||
893 | trans->transid, inode->i_ino); | ||
894 | BUG_ON(ret); | ||
895 | btrfs_release_path(root, path); | ||
896 | |||
897 | key.offset = start; | ||
898 | ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); | ||
899 | BUG_ON(ret); | ||
900 | |||
901 | leaf = path->nodes[0]; | ||
902 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
903 | struct btrfs_file_extent_item); | ||
904 | btrfs_set_file_extent_generation(leaf, fi, trans->transid); | ||
905 | btrfs_set_file_extent_type(leaf, fi, extent_type); | ||
906 | btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); | ||
907 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); | ||
908 | btrfs_set_file_extent_offset(leaf, fi, extent_offset); | ||
909 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); | ||
910 | btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); | ||
911 | btrfs_set_file_extent_compression(leaf, fi, 0); | ||
912 | btrfs_set_file_extent_encryption(leaf, fi, 0); | ||
913 | btrfs_set_file_extent_other_encoding(leaf, fi, 0); | ||
914 | |||
915 | if (orig_parent != leaf->start) { | ||
916 | ret = btrfs_update_extent_ref(trans, root, bytenr, | ||
917 | orig_parent, leaf->start, | ||
918 | root->root_key.objectid, | ||
919 | trans->transid, inode->i_ino); | ||
920 | BUG_ON(ret); | ||
921 | } | ||
922 | done: | ||
923 | btrfs_mark_buffer_dirty(leaf); | ||
924 | btrfs_release_path(root, path); | ||
925 | if (split_end && split == start) { | ||
926 | split = end; | ||
927 | goto again; | ||
928 | } | ||
929 | if (locked_end > end) { | ||
930 | unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, | ||
931 | GFP_NOFS); | ||
932 | } | ||
933 | btrfs_free_path(path); | ||
934 | return 0; | ||
935 | } | ||
936 | |||
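Marking [start, end) written inside a preallocated extent yields at most three pieces, which is why the function may loop twice (first with split == start, then split == end). A sketch of the resulting layout for example offsets:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ext_start = 0, ext_end = 12288; /* preallocated extent */
	uint64_t start = 4096, end = 8192;       /* range marked written */

	if (ext_start < start)
		printf("prealloc head: [%llu, %llu)\n",
		       (unsigned long long)ext_start,
		       (unsigned long long)start);
	printf("written:       [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)end);
	if (ext_end > end)
		printf("prealloc tail: [%llu, %llu)\n",
		       (unsigned long long)end,
		       (unsigned long long)ext_end);
	return 0;
}

When a neighbouring regular extent shares the same disk bytes, extent_mergeable() lets the code merge instead of split, dropping the now-redundant reference.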
937 | /* | ||
938 | * this gets pages into the page cache and locks them down, it also properly | ||
939 | * waits for data=ordered extents to finish before allowing the pages to be | ||
940 | * modified. | ||
941 | */ | ||
942 | static noinline int prepare_pages(struct btrfs_root *root, struct file *file, | ||
943 | struct page **pages, size_t num_pages, | ||
944 | loff_t pos, unsigned long first_index, | ||
945 | unsigned long last_index, size_t write_bytes) | ||
946 | { | ||
947 | int i; | ||
948 | unsigned long index = pos >> PAGE_CACHE_SHIFT; | ||
949 | struct inode *inode = fdentry(file)->d_inode; | ||
950 | int err = 0; | ||
951 | u64 start_pos; | ||
952 | u64 last_pos; | ||
953 | |||
954 | start_pos = pos & ~((u64)root->sectorsize - 1); | ||
955 | last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; | ||
956 | |||
957 | if (start_pos > inode->i_size) { | ||
958 | err = btrfs_cont_expand(inode, start_pos); | ||
959 | if (err) | ||
960 | return err; | ||
961 | } | ||
962 | |||
963 | memset(pages, 0, num_pages * sizeof(struct page *)); | ||
964 | again: | ||
965 | for (i = 0; i < num_pages; i++) { | ||
966 | pages[i] = grab_cache_page(inode->i_mapping, index + i); | ||
967 | if (!pages[i]) { | ||
968 | err = -ENOMEM; | ||
969 | BUG_ON(1); | ||
970 | } | ||
971 | wait_on_page_writeback(pages[i]); | ||
972 | } | ||
973 | if (start_pos < inode->i_size) { | ||
974 | struct btrfs_ordered_extent *ordered; | ||
975 | lock_extent(&BTRFS_I(inode)->io_tree, | ||
976 | start_pos, last_pos - 1, GFP_NOFS); | ||
977 | ordered = btrfs_lookup_first_ordered_extent(inode, | ||
978 | last_pos - 1); | ||
979 | if (ordered && | ||
980 | ordered->file_offset + ordered->len > start_pos && | ||
981 | ordered->file_offset < last_pos) { | ||
982 | btrfs_put_ordered_extent(ordered); | ||
983 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
984 | start_pos, last_pos - 1, GFP_NOFS); | ||
985 | for (i = 0; i < num_pages; i++) { | ||
986 | unlock_page(pages[i]); | ||
987 | page_cache_release(pages[i]); | ||
988 | } | ||
989 | btrfs_wait_ordered_range(inode, start_pos, | ||
990 | last_pos - start_pos); | ||
991 | goto again; | ||
992 | } | ||
993 | if (ordered) | ||
994 | btrfs_put_ordered_extent(ordered); | ||
995 | |||
996 | clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, | ||
997 | last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, | ||
998 | GFP_NOFS); | ||
999 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
1000 | start_pos, last_pos - 1, GFP_NOFS); | ||
1001 | } | ||
1002 | for (i = 0; i < num_pages; i++) { | ||
1003 | clear_page_dirty_for_io(pages[i]); | ||
1004 | set_page_extent_mapped(pages[i]); | ||
1005 | WARN_ON(!PageLocked(pages[i])); | ||
1006 | } | ||
1007 | return 0; | ||
1008 | } | ||
1009 | |||
1010 | static ssize_t btrfs_file_write(struct file *file, const char __user *buf, | ||
1011 | size_t count, loff_t *ppos) | ||
1012 | { | ||
1013 | loff_t pos; | ||
1014 | loff_t start_pos; | ||
1015 | ssize_t num_written = 0; | ||
1016 | ssize_t err = 0; | ||
1017 | int ret = 0; | ||
1018 | struct inode *inode = fdentry(file)->d_inode; | ||
1019 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1020 | struct page **pages = NULL; | ||
1021 | int nrptrs; | ||
1022 | struct page *pinned[2]; | ||
1023 | unsigned long first_index; | ||
1024 | unsigned long last_index; | ||
1025 | int will_write; | ||
1026 | |||
1027 | will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || | ||
1028 | (file->f_flags & O_DIRECT)); | ||
1029 | |||
1030 | nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, | ||
1031 | PAGE_CACHE_SIZE / (sizeof(struct page *))); | ||
1032 | pinned[0] = NULL; | ||
1033 | pinned[1] = NULL; | ||
1034 | |||
1035 | pos = *ppos; | ||
1036 | start_pos = pos; | ||
1037 | |||
1038 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
1039 | current->backing_dev_info = inode->i_mapping->backing_dev_info; | ||
1040 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | ||
1041 | if (err) | ||
1042 | goto out_nolock; | ||
1043 | if (count == 0) | ||
1044 | goto out_nolock; | ||
1045 | |||
1046 | err = file_remove_suid(file); | ||
1047 | if (err) | ||
1048 | goto out_nolock; | ||
1049 | file_update_time(file); | ||
1050 | |||
1051 | pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); | ||
1052 | |||
1053 | mutex_lock(&inode->i_mutex); | ||
1054 | BTRFS_I(inode)->sequence++; | ||
1055 | first_index = pos >> PAGE_CACHE_SHIFT; | ||
1056 | last_index = (pos + count) >> PAGE_CACHE_SHIFT; | ||
1057 | |||
1058 | /* | ||
1059 | * there are lots of better ways to do this, but this code | ||
1060 | * makes sure the first and last page in the file range are | ||
1061 | * up to date and ready for cow | ||
1062 | */ | ||
1063 | if ((pos & (PAGE_CACHE_SIZE - 1))) { | ||
1064 | pinned[0] = grab_cache_page(inode->i_mapping, first_index); | ||
1065 | if (!PageUptodate(pinned[0])) { | ||
1066 | ret = btrfs_readpage(NULL, pinned[0]); | ||
1067 | BUG_ON(ret); | ||
1068 | wait_on_page_locked(pinned[0]); | ||
1069 | } else { | ||
1070 | unlock_page(pinned[0]); | ||
1071 | } | ||
1072 | } | ||
1073 | if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { | ||
1074 | pinned[1] = grab_cache_page(inode->i_mapping, last_index); | ||
1075 | if (!PageUptodate(pinned[1])) { | ||
1076 | ret = btrfs_readpage(NULL, pinned[1]); | ||
1077 | BUG_ON(ret); | ||
1078 | wait_on_page_locked(pinned[1]); | ||
1079 | } else { | ||
1080 | unlock_page(pinned[1]); | ||
1081 | } | ||
1082 | } | ||
1083 | |||
1084 | while (count > 0) { | ||
1085 | size_t offset = pos & (PAGE_CACHE_SIZE - 1); | ||
1086 | size_t write_bytes = min(count, nrptrs * | ||
1087 | (size_t)PAGE_CACHE_SIZE - | ||
1088 | offset); | ||
1089 | size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> | ||
1090 | PAGE_CACHE_SHIFT; | ||
1091 | |||
1092 | WARN_ON(num_pages > nrptrs); | ||
1093 | memset(pages, 0, sizeof(struct page *) * nrptrs); | ||
1094 | |||
1095 | ret = btrfs_check_free_space(root, write_bytes, 0); | ||
1096 | if (ret) | ||
1097 | goto out; | ||
1098 | |||
1099 | ret = prepare_pages(root, file, pages, num_pages, | ||
1100 | pos, first_index, last_index, | ||
1101 | write_bytes); | ||
1102 | if (ret) | ||
1103 | goto out; | ||
1104 | |||
1105 | ret = btrfs_copy_from_user(pos, num_pages, | ||
1106 | write_bytes, pages, buf); | ||
1107 | if (ret) { | ||
1108 | btrfs_drop_pages(pages, num_pages); | ||
1109 | goto out; | ||
1110 | } | ||
1111 | |||
1112 | ret = dirty_and_release_pages(NULL, root, file, pages, | ||
1113 | num_pages, pos, write_bytes); | ||
1114 | btrfs_drop_pages(pages, num_pages); | ||
1115 | if (ret) | ||
1116 | goto out; | ||
1117 | |||
1118 | if (will_write) { | ||
1119 | btrfs_fdatawrite_range(inode->i_mapping, pos, | ||
1120 | pos + write_bytes - 1, | ||
1121 | WB_SYNC_NONE); | ||
1122 | } else { | ||
1123 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, | ||
1124 | num_pages); | ||
1125 | if (num_pages < | ||
1126 | (root->leafsize >> PAGE_CACHE_SHIFT) + 1) | ||
1127 | btrfs_btree_balance_dirty(root, 1); | ||
1128 | btrfs_throttle(root); | ||
1129 | } | ||
1130 | |||
1131 | buf += write_bytes; | ||
1132 | count -= write_bytes; | ||
1133 | pos += write_bytes; | ||
1134 | num_written += write_bytes; | ||
1135 | |||
1136 | cond_resched(); | ||
1137 | } | ||
1138 | out: | ||
1139 | mutex_unlock(&inode->i_mutex); | ||
1140 | |||
1141 | out_nolock: | ||
1142 | kfree(pages); | ||
1143 | if (pinned[0]) | ||
1144 | page_cache_release(pinned[0]); | ||
1145 | if (pinned[1]) | ||
1146 | page_cache_release(pinned[1]); | ||
1147 | *ppos = pos; | ||
1148 | |||
1149 | if (num_written > 0 && will_write) { | ||
1150 | struct btrfs_trans_handle *trans; | ||
1151 | |||
1152 | err = btrfs_wait_ordered_range(inode, start_pos, num_written); | ||
1153 | if (err) | ||
1154 | num_written = err; | ||
1155 | |||
1156 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { | ||
1157 | trans = btrfs_start_transaction(root, 1); | ||
1158 | ret = btrfs_log_dentry_safe(trans, root, | ||
1159 | file->f_dentry); | ||
1160 | if (ret == 0) { | ||
1161 | btrfs_sync_log(trans, root); | ||
1162 | btrfs_end_transaction(trans, root); | ||
1163 | } else { | ||
1164 | btrfs_commit_transaction(trans, root); | ||
1165 | } | ||
1166 | } | ||
1167 | if (file->f_flags & O_DIRECT) { | ||
1168 | invalidate_mapping_pages(inode->i_mapping, | ||
1169 | start_pos >> PAGE_CACHE_SHIFT, | ||
1170 | (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); | ||
1171 | } | ||
1172 | } | ||
1173 | current->backing_dev_info = NULL; | ||
1174 | return num_written ? num_written : err; | ||
1175 | } | ||
1176 | |||
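The nrptrs computation near the top of btrfs_file_write() caps the per-iteration page batch at however many page pointers fit in one page. A worked example, assuming 4K pages (the assertion holds for both 4- and 8-byte pointers):

#include <stdint.h>
#include <stddef.h>
#include <assert.h>

#define PAGE_SZ 4096UL

int main(void)
{
	size_t count = 1 << 20; /* a 1MB write */
	size_t nrptrs = (count + PAGE_SZ - 1) / PAGE_SZ; /* pages needed */
	size_t max = PAGE_SZ / sizeof(void *);           /* ptrs per page */

	if (nrptrs > max)
		nrptrs = max;
	assert(nrptrs == 256); /* 1MB needs 256 pages, under the cap */
	return 0;
}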
1177 | int btrfs_release_file(struct inode *inode, struct file *filp) | ||
1178 | { | ||
1179 | if (filp->private_data) | ||
1180 | btrfs_ioctl_trans_end(filp); | ||
1181 | return 0; | ||
1182 | } | ||
1183 | |||
1184 | /* | ||
1185 | * fsync call for both files and directories. This logs the inode into | ||
1186 | * the tree log instead of forcing full commits whenever possible. | ||
1187 | * | ||
1188 | * It needs to call filemap_fdatawait so that all ordered extent updates are | ||
1189 | * in the metadata btree are up to date for copying to the log. | ||
1190 | * | ||
1191 | * It drops the inode mutex before doing the tree log commit. This is an | ||
1192 | * important optimization for directories because holding the mutex prevents | ||
1193 | * new operations on the dir while we write to disk. | ||
1194 | */ | ||
1195 | int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) | ||
1196 | { | ||
1197 | struct inode *inode = dentry->d_inode; | ||
1198 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1199 | int ret = 0; | ||
1200 | struct btrfs_trans_handle *trans; | ||
1201 | |||
1202 | /* | ||
1203 | * check the transaction that last modified this inode | ||
1204 | * and see if its already been committed | ||
1205 | */ | ||
1206 | if (!BTRFS_I(inode)->last_trans) | ||
1207 | goto out; | ||
1208 | |||
1209 | mutex_lock(&root->fs_info->trans_mutex); | ||
1210 | if (BTRFS_I(inode)->last_trans <= | ||
1211 | root->fs_info->last_trans_committed) { | ||
1212 | BTRFS_I(inode)->last_trans = 0; | ||
1213 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1214 | goto out; | ||
1215 | } | ||
1216 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1217 | |||
1218 | root->fs_info->tree_log_batch++; | ||
1219 | filemap_fdatawrite(inode->i_mapping); | ||
1220 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | ||
1221 | root->fs_info->tree_log_batch++; | ||
1222 | |||
1223 | /* | ||
1224 | * ok we haven't committed the transaction yet, lets do a commit | ||
1225 | */ | ||
1226 | if (file->private_data) | ||
1227 | btrfs_ioctl_trans_end(file); | ||
1228 | |||
1229 | trans = btrfs_start_transaction(root, 1); | ||
1230 | if (!trans) { | ||
1231 | ret = -ENOMEM; | ||
1232 | goto out; | ||
1233 | } | ||
1234 | |||
1235 | ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); | ||
1236 | if (ret < 0) | ||
1237 | goto out; | ||
1238 | |||
1239 | /* we've logged all the items and now have a consistent | ||
1240 | * version of the file in the log. It is possible that | ||
1241 | * someone will come in and modify the file, but that's | ||
1242 | * fine because the log is consistent on disk, and we | ||
1243 | * have references to all of the file's extents | ||
1244 | * | ||
1245 | * It is possible that someone will come in and log the | ||
1246 | * file again, but that will end up using the synchronization | ||
1247 | * inside btrfs_sync_log to keep things safe. | ||
1248 | */ | ||
1249 | mutex_unlock(&file->f_dentry->d_inode->i_mutex); | ||
1250 | |||
1251 | if (ret > 0) { | ||
1252 | ret = btrfs_commit_transaction(trans, root); | ||
1253 | } else { | ||
1254 | btrfs_sync_log(trans, root); | ||
1255 | ret = btrfs_end_transaction(trans, root); | ||
1256 | } | ||
1257 | mutex_lock(&file->f_dentry->d_inode->i_mutex); | ||
1258 | out: | ||
1259 | return ret > 0 ? -EIO : ret; | ||
1260 | } | ||
1261 | |||
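The fast path at the top of btrfs_sync_file() reduces to a single comparison against the last committed transaction. A sketch of that predicate (illustrative, with the trans_mutex locking omitted):

#include <stdint.h>

/* fsync has work to do only if some transaction touched the inode and
 * that transaction has not yet been committed */
int fsync_needed(uint64_t last_trans, uint64_t last_trans_committed)
{
	return last_trans != 0 && last_trans > last_trans_committed;
}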
1262 | static struct vm_operations_struct btrfs_file_vm_ops = { | ||
1263 | .fault = filemap_fault, | ||
1264 | .page_mkwrite = btrfs_page_mkwrite, | ||
1265 | }; | ||
1266 | |||
1267 | static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) | ||
1268 | { | ||
1269 | vma->vm_ops = &btrfs_file_vm_ops; | ||
1270 | file_accessed(filp); | ||
1271 | return 0; | ||
1272 | } | ||
1273 | |||
1274 | struct file_operations btrfs_file_operations = { | ||
1275 | .llseek = generic_file_llseek, | ||
1276 | .read = do_sync_read, | ||
1277 | .aio_read = generic_file_aio_read, | ||
1278 | .splice_read = generic_file_splice_read, | ||
1279 | .write = btrfs_file_write, | ||
1280 | .mmap = btrfs_file_mmap, | ||
1281 | .open = generic_file_open, | ||
1282 | .release = btrfs_release_file, | ||
1283 | .fsync = btrfs_sync_file, | ||
1284 | .unlocked_ioctl = btrfs_ioctl, | ||
1285 | #ifdef CONFIG_COMPAT | ||
1286 | .compat_ioctl = btrfs_ioctl, | ||
1287 | #endif | ||
1288 | }; | ||
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 000000000000..d1e5f0e84c58 --- /dev/null +++ b/fs/btrfs/free-space-cache.c | |||
@@ -0,0 +1,495 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include "ctree.h" | ||
21 | |||
22 | static int tree_insert_offset(struct rb_root *root, u64 offset, | ||
23 | struct rb_node *node) | ||
24 | { | ||
25 | struct rb_node **p = &root->rb_node; | ||
26 | struct rb_node *parent = NULL; | ||
27 | struct btrfs_free_space *info; | ||
28 | |||
29 | while (*p) { | ||
30 | parent = *p; | ||
31 | info = rb_entry(parent, struct btrfs_free_space, offset_index); | ||
32 | |||
33 | if (offset < info->offset) | ||
34 | p = &(*p)->rb_left; | ||
35 | else if (offset > info->offset) | ||
36 | p = &(*p)->rb_right; | ||
37 | else | ||
38 | return -EEXIST; | ||
39 | } | ||
40 | |||
41 | rb_link_node(node, parent, p); | ||
42 | rb_insert_color(node, root); | ||
43 | |||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | static int tree_insert_bytes(struct rb_root *root, u64 bytes, | ||
48 | struct rb_node *node) | ||
49 | { | ||
50 | struct rb_node **p = &root->rb_node; | ||
51 | struct rb_node *parent = NULL; | ||
52 | struct btrfs_free_space *info; | ||
53 | |||
54 | while (*p) { | ||
55 | parent = *p; | ||
56 | info = rb_entry(parent, struct btrfs_free_space, bytes_index); | ||
57 | |||
58 | if (bytes < info->bytes) | ||
59 | p = &(*p)->rb_left; | ||
60 | else | ||
61 | p = &(*p)->rb_right; | ||
62 | } | ||
63 | |||
64 | rb_link_node(node, parent, p); | ||
65 | rb_insert_color(node, root); | ||
66 | |||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * searches the tree for the given offset. If contains is set we will return | ||
72 | * the free space that contains the given offset. If contains is not set we | ||
73 | * will return the free space that starts at or after the given offset and is | ||
74 | * at least bytes long. | ||
75 | */ | ||
76 | static struct btrfs_free_space *tree_search_offset(struct rb_root *root, | ||
77 | u64 offset, u64 bytes, | ||
78 | int contains) | ||
79 | { | ||
80 | struct rb_node *n = root->rb_node; | ||
81 | struct btrfs_free_space *entry, *ret = NULL; | ||
82 | |||
83 | while (n) { | ||
84 | entry = rb_entry(n, struct btrfs_free_space, offset_index); | ||
85 | |||
86 | if (offset < entry->offset) { | ||
87 | if (!contains && | ||
88 | (!ret || entry->offset < ret->offset) && | ||
89 | (bytes <= entry->bytes)) | ||
90 | ret = entry; | ||
91 | n = n->rb_left; | ||
92 | } else if (offset > entry->offset) { | ||
93 | if ((entry->offset + entry->bytes - 1) >= offset && | ||
94 | bytes <= entry->bytes) { | ||
95 | ret = entry; | ||
96 | break; | ||
97 | } | ||
98 | n = n->rb_right; | ||
99 | } else { | ||
100 | if (bytes > entry->bytes) { | ||
101 | n = n->rb_right; | ||
102 | continue; | ||
103 | } | ||
104 | ret = entry; | ||
105 | break; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | return ret; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * return a chunk of at least 'bytes' in size, as close to 'offset' as we can get. | ||
114 | */ | ||
115 | static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, | ||
116 | u64 offset, u64 bytes) | ||
117 | { | ||
118 | struct rb_node *n = root->rb_node; | ||
119 | struct btrfs_free_space *entry, *ret = NULL; | ||
120 | |||
121 | while (n) { | ||
122 | entry = rb_entry(n, struct btrfs_free_space, bytes_index); | ||
123 | |||
124 | if (bytes < entry->bytes) { | ||
125 | /* | ||
126 | * We prefer a hole as close in size to the one we are | ||
127 | * asking for, so we don't take small slivers out of | ||
128 | * huge holes, but we also want to get as close to the | ||
129 | * offset as possible so we don't have a whole lot of | ||
130 | * fragmentation. | ||
131 | */ | ||
132 | if (offset <= entry->offset) { | ||
133 | if (!ret) | ||
134 | ret = entry; | ||
135 | else if (entry->bytes < ret->bytes) | ||
136 | ret = entry; | ||
137 | else if (entry->offset < ret->offset) | ||
138 | ret = entry; | ||
139 | } | ||
140 | n = n->rb_left; | ||
141 | } else if (bytes > entry->bytes) { | ||
142 | n = n->rb_right; | ||
143 | } else { | ||
144 | /* | ||
145 | * Ok we may have multiple chunks of the wanted size, | ||
146 | * so we don't want to take the first one we find; we | ||
147 | * want the one closest to our given offset, so keep | ||
148 | * searching in case there's a better match. | ||
149 | */ | ||
150 | n = n->rb_right; | ||
151 | if (offset > entry->offset) | ||
152 | continue; | ||
153 | else if (!ret || entry->offset < ret->offset) | ||
154 | ret = entry; | ||
155 | } | ||
156 | } | ||
157 | |||
158 | return ret; | ||
159 | } | ||
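
Read together, the two lookups implement the cache's two allocation policies: search by position (the offset-indexed tree) and best fit (the size-indexed tree). A simplified userspace sketch of those policies over a plain sorted array standing in for the rbtrees; the struct and function names are invented for illustration, and the kernel version handles more cases (e.g. entries that merely contain the offset):

#include <stdint.h>
#include <stdio.h>

struct free_space { uint64_t offset, bytes; };

/* by position: first chunk starting at or after 'offset' with enough room */
static struct free_space *by_offset(struct free_space *s, int n,
				    uint64_t offset, uint64_t bytes)
{
	for (int i = 0; i < n; i++)
		if (s[i].offset >= offset && s[i].bytes >= bytes)
			return &s[i];
	return NULL;
}

/* by size: smallest sufficient chunk, preferring lower offsets */
static struct free_space *by_bytes(struct free_space *s, int n,
				   uint64_t offset, uint64_t bytes)
{
	struct free_space *ret = NULL;

	for (int i = 0; i < n; i++) {
		if (s[i].offset < offset || s[i].bytes < bytes)
			continue;
		if (!ret || s[i].bytes < ret->bytes ||
		    (s[i].bytes == ret->bytes && s[i].offset < ret->offset))
			ret = &s[i];
	}
	return ret;
}

int main(void)
{
	/* sorted by offset, as the offset-indexed tree would be */
	struct free_space s[] = { {0, 4096}, {8192, 65536}, {131072, 4096} };
	struct free_space *f = by_offset(s, 3, 4096, 4096);

	if (f)
		printf("by offset: %llu\n", (unsigned long long)f->offset);
	f = by_bytes(s, 3, 0, 4096);
	if (f)
		printf("best fit:  %llu\n", (unsigned long long)f->offset);
	return 0;
}
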
160 | |||
161 | static void unlink_free_space(struct btrfs_block_group_cache *block_group, | ||
162 | struct btrfs_free_space *info) | ||
163 | { | ||
164 | rb_erase(&info->offset_index, &block_group->free_space_offset); | ||
165 | rb_erase(&info->bytes_index, &block_group->free_space_bytes); | ||
166 | } | ||
167 | |||
168 | static int link_free_space(struct btrfs_block_group_cache *block_group, | ||
169 | struct btrfs_free_space *info) | ||
170 | { | ||
171 | int ret = 0; | ||
172 | |||
173 | |||
174 | ret = tree_insert_offset(&block_group->free_space_offset, info->offset, | ||
175 | &info->offset_index); | ||
176 | if (ret) | ||
177 | return ret; | ||
178 | |||
179 | ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, | ||
180 | &info->bytes_index); | ||
181 | if (ret) | ||
182 | return ret; | ||
183 | |||
184 | return ret; | ||
185 | } | ||
186 | |||
187 | static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, | ||
188 | u64 offset, u64 bytes) | ||
189 | { | ||
190 | struct btrfs_free_space *right_info; | ||
191 | struct btrfs_free_space *left_info; | ||
192 | struct btrfs_free_space *info = NULL; | ||
193 | struct btrfs_free_space *alloc_info; | ||
194 | int ret = 0; | ||
195 | |||
196 | alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); | ||
197 | if (!alloc_info) | ||
198 | return -ENOMEM; | ||
199 | |||
200 | /* | ||
201 | * first we want to see if there is free space adjacent to the range we | ||
202 | * are adding, if there is remove that struct and add a new one to | ||
203 | * cover the entire range | ||
204 | */ | ||
205 | right_info = tree_search_offset(&block_group->free_space_offset, | ||
206 | offset+bytes, 0, 1); | ||
207 | left_info = tree_search_offset(&block_group->free_space_offset, | ||
208 | offset-1, 0, 1); | ||
209 | |||
210 | if (right_info && right_info->offset == offset+bytes) { | ||
211 | unlink_free_space(block_group, right_info); | ||
212 | info = right_info; | ||
213 | info->offset = offset; | ||
214 | info->bytes += bytes; | ||
215 | } else if (right_info && right_info->offset != offset+bytes) { | ||
216 | printk(KERN_ERR "btrfs adding space in the middle of an " | ||
217 | "existing free space area. existing: " | ||
218 | "offset=%llu, bytes=%llu. new: offset=%llu, " | ||
219 | "bytes=%llu\n", (unsigned long long)right_info->offset, | ||
220 | (unsigned long long)right_info->bytes, | ||
221 | (unsigned long long)offset, | ||
222 | (unsigned long long)bytes); | ||
223 | BUG(); | ||
224 | } | ||
225 | |||
226 | if (left_info) { | ||
227 | unlink_free_space(block_group, left_info); | ||
228 | |||
229 | if (unlikely((left_info->offset + left_info->bytes) != | ||
230 | offset)) { | ||
231 | printk(KERN_ERR "btrfs free space to the left " | ||
232 | "of new free space isn't " | ||
233 | "quite right. existing: offset=%llu, " | ||
234 | "bytes=%llu. new: offset=%llu, bytes=%llu\n", | ||
235 | (unsigned long long)left_info->offset, | ||
236 | (unsigned long long)left_info->bytes, | ||
237 | (unsigned long long)offset, | ||
238 | (unsigned long long)bytes); | ||
239 | BUG(); | ||
240 | } | ||
241 | |||
242 | if (info) { | ||
243 | info->offset = left_info->offset; | ||
244 | info->bytes += left_info->bytes; | ||
245 | kfree(left_info); | ||
246 | } else { | ||
247 | info = left_info; | ||
248 | info->bytes += bytes; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | if (info) { | ||
253 | ret = link_free_space(block_group, info); | ||
254 | if (!ret) | ||
255 | info = NULL; | ||
256 | goto out; | ||
257 | } | ||
258 | |||
259 | info = alloc_info; | ||
260 | alloc_info = NULL; | ||
261 | info->offset = offset; | ||
262 | info->bytes = bytes; | ||
263 | |||
264 | ret = link_free_space(block_group, info); | ||
265 | if (ret) | ||
266 | kfree(info); | ||
267 | out: | ||
268 | if (ret) { | ||
269 | printk(KERN_ERR "btrfs: unable to add free space: %d\n", ret); | ||
270 | if (ret == -EEXIST) | ||
271 | BUG(); | ||
272 | } | ||
273 | |||
274 | kfree(alloc_info); | ||
275 | |||
276 | return ret; | ||
277 | } | ||
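
The merging rule above is easier to see in isolation: a newly freed range absorbs a neighbour that ends exactly at its start and/or one that begins exactly at its end, so adjacent free space never stays fragmented in the cache. A hedged userspace sketch of just that arithmetic (types and names invented for illustration):

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t offset, bytes; };

static void merge(const struct range *left, const struct range *right,
		  uint64_t offset, uint64_t bytes, struct range *out)
{
	out->offset = offset;
	out->bytes = bytes;

	if (right && right->offset == offset + bytes)
		out->bytes += right->bytes;	/* absorb right neighbour */
	if (left && left->offset + left->bytes == offset) {
		out->offset = left->offset;	/* absorb left neighbour */
		out->bytes += left->bytes;
	}
}

int main(void)
{
	struct range left = { 0, 4096 }, right = { 8192, 4096 }, out;

	merge(&left, &right, 4096, 4096, &out);	/* free the gap between them */
	printf("merged: offset=%llu bytes=%llu\n",	/* 0 and 12288 */
	       (unsigned long long)out.offset, (unsigned long long)out.bytes);
	return 0;
}
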
278 | |||
279 | static int | ||
280 | __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, | ||
281 | u64 offset, u64 bytes) | ||
282 | { | ||
283 | struct btrfs_free_space *info; | ||
284 | int ret = 0; | ||
285 | |||
286 | info = tree_search_offset(&block_group->free_space_offset, offset, 0, | ||
287 | 1); | ||
288 | |||
289 | if (info && info->offset == offset) { | ||
290 | if (info->bytes < bytes) { | ||
291 | printk(KERN_ERR "Found free space at %llu, size %llu, " | ||
292 | "trying to use %llu\n", | ||
293 | (unsigned long long)info->offset, | ||
294 | (unsigned long long)info->bytes, | ||
295 | (unsigned long long)bytes); | ||
296 | WARN_ON(1); | ||
297 | ret = -EINVAL; | ||
298 | goto out; | ||
299 | } | ||
300 | unlink_free_space(block_group, info); | ||
301 | |||
302 | if (info->bytes == bytes) { | ||
303 | kfree(info); | ||
304 | goto out; | ||
305 | } | ||
306 | |||
307 | info->offset += bytes; | ||
308 | info->bytes -= bytes; | ||
309 | |||
310 | ret = link_free_space(block_group, info); | ||
311 | BUG_ON(ret); | ||
312 | } else if (info && info->offset < offset && | ||
313 | info->offset + info->bytes >= offset + bytes) { | ||
314 | u64 old_start = info->offset; | ||
315 | /* | ||
316 | * we're freeing space in the middle of the info, | ||
317 | * this can happen during tree log replay | ||
318 | * | ||
319 | * first unlink the old info and then | ||
320 | * insert it again after the hole we're creating | ||
321 | */ | ||
322 | unlink_free_space(block_group, info); | ||
323 | if (offset + bytes < info->offset + info->bytes) { | ||
324 | u64 old_end = info->offset + info->bytes; | ||
325 | |||
326 | info->offset = offset + bytes; | ||
327 | info->bytes = old_end - info->offset; | ||
328 | ret = link_free_space(block_group, info); | ||
329 | BUG_ON(ret); | ||
330 | } else { | ||
331 | /* the hole we're creating ends at the end | ||
332 | * of the info struct, just free the info | ||
333 | */ | ||
334 | kfree(info); | ||
335 | } | ||
336 | |||
337 | /* step two, insert a new info struct to cover anything | ||
338 | * before the hole | ||
339 | */ | ||
340 | ret = __btrfs_add_free_space(block_group, old_start, | ||
341 | offset - old_start); | ||
342 | BUG_ON(ret); | ||
343 | } else { | ||
344 | WARN_ON(1); | ||
345 | } | ||
346 | out: | ||
347 | return ret; | ||
348 | } | ||
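
The middle-of-range case above is a split: carving the allocated hole out of one tracked free range leaves up to two pieces, a head before the hole and a tail after it. A userspace sketch of just that arithmetic (names invented; the kernel additionally re-links the surviving pieces into both trees):

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t offset, bytes; };

/* head is valid iff space remains before the hole, tail iff after it */
static int split(struct range in, uint64_t offset, uint64_t bytes,
		 struct range *head, struct range *tail)
{
	uint64_t end = in.offset + in.bytes;
	int pieces = 0;

	if (offset > in.offset) {		/* space left before the hole */
		head->offset = in.offset;
		head->bytes = offset - in.offset;
		pieces++;
	}
	if (offset + bytes < end) {		/* space left after the hole */
		tail->offset = offset + bytes;
		tail->bytes = end - (offset + bytes);
		pieces++;
	}
	return pieces;
}

int main(void)
{
	struct range head, tail, in = { 0, 16384 };
	int n = split(in, 4096, 4096, &head, &tail);

	printf("%d pieces remain\n", n);	/* [0,4096) and [8192,16384) */
	return 0;
}
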
349 | |||
350 | int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, | ||
351 | u64 offset, u64 bytes) | ||
352 | { | ||
353 | int ret; | ||
354 | struct btrfs_free_space *sp; | ||
355 | |||
356 | mutex_lock(&block_group->alloc_mutex); | ||
357 | ret = __btrfs_add_free_space(block_group, offset, bytes); | ||
358 | sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); | ||
359 | BUG_ON(!sp); | ||
360 | mutex_unlock(&block_group->alloc_mutex); | ||
361 | |||
362 | return ret; | ||
363 | } | ||
364 | |||
365 | int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, | ||
366 | u64 offset, u64 bytes) | ||
367 | { | ||
368 | int ret; | ||
369 | struct btrfs_free_space *sp; | ||
370 | |||
371 | ret = __btrfs_add_free_space(block_group, offset, bytes); | ||
372 | sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); | ||
373 | BUG_ON(!sp); | ||
374 | |||
375 | return ret; | ||
376 | } | ||
377 | |||
378 | int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, | ||
379 | u64 offset, u64 bytes) | ||
380 | { | ||
381 | int ret = 0; | ||
382 | |||
383 | mutex_lock(&block_group->alloc_mutex); | ||
384 | ret = __btrfs_remove_free_space(block_group, offset, bytes); | ||
385 | mutex_unlock(&block_group->alloc_mutex); | ||
386 | |||
387 | return ret; | ||
388 | } | ||
389 | |||
390 | int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, | ||
391 | u64 offset, u64 bytes) | ||
392 | { | ||
393 | int ret; | ||
394 | |||
395 | ret = __btrfs_remove_free_space(block_group, offset, bytes); | ||
396 | |||
397 | return ret; | ||
398 | } | ||
399 | |||
400 | void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, | ||
401 | u64 bytes) | ||
402 | { | ||
403 | struct btrfs_free_space *info; | ||
404 | struct rb_node *n; | ||
405 | int count = 0; | ||
406 | |||
407 | for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { | ||
408 | info = rb_entry(n, struct btrfs_free_space, offset_index); | ||
409 | if (info->bytes >= bytes) | ||
410 | count++; | ||
411 | } | ||
412 | printk(KERN_INFO "%d blocks of free space at or bigger than %llu " | ||
413 | "bytes\n", count, (unsigned long long)bytes); | ||
414 | } | ||
415 | |||
416 | u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) | ||
417 | { | ||
418 | struct btrfs_free_space *info; | ||
419 | struct rb_node *n; | ||
420 | u64 ret = 0; | ||
421 | |||
422 | for (n = rb_first(&block_group->free_space_offset); n; | ||
423 | n = rb_next(n)) { | ||
424 | info = rb_entry(n, struct btrfs_free_space, offset_index); | ||
425 | ret += info->bytes; | ||
426 | } | ||
427 | |||
428 | return ret; | ||
429 | } | ||
430 | |||
431 | void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) | ||
432 | { | ||
433 | struct btrfs_free_space *info; | ||
434 | struct rb_node *node; | ||
435 | |||
436 | mutex_lock(&block_group->alloc_mutex); | ||
437 | while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { | ||
438 | info = rb_entry(node, struct btrfs_free_space, bytes_index); | ||
439 | unlink_free_space(block_group, info); | ||
440 | kfree(info); | ||
441 | if (need_resched()) { | ||
442 | mutex_unlock(&block_group->alloc_mutex); | ||
443 | cond_resched(); | ||
444 | mutex_lock(&block_group->alloc_mutex); | ||
445 | } | ||
446 | } | ||
447 | mutex_unlock(&block_group->alloc_mutex); | ||
448 | } | ||
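
The teardown loop above periodically drops alloc_mutex and calls cond_resched() so that freeing a large cache never holds the lock, or the CPU, for too long at a stretch. An analogous userspace pattern with pthreads, where sched_yield() loosely stands in for cond_resched() (illustrative only; build with -lpthread):

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void drain(int *items, int n)
{
	pthread_mutex_lock(&lock);
	for (int i = 0; i < n; i++) {
		items[i] = 0;			/* tear down one entry */
		if (i % 1024 == 0) {		/* give other threads a turn */
			pthread_mutex_unlock(&lock);
			sched_yield();
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	static int items[4096];

	drain(items, 4096);
	return 0;
}
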
449 | |||
450 | #if 0 | ||
451 | static struct btrfs_free_space *btrfs_find_free_space_offset(struct | ||
452 | btrfs_block_group_cache | ||
453 | *block_group, u64 offset, | ||
454 | u64 bytes) | ||
455 | { | ||
456 | struct btrfs_free_space *ret; | ||
457 | |||
458 | mutex_lock(&block_group->alloc_mutex); | ||
459 | ret = tree_search_offset(&block_group->free_space_offset, offset, | ||
460 | bytes, 0); | ||
461 | mutex_unlock(&block_group->alloc_mutex); | ||
462 | |||
463 | return ret; | ||
464 | } | ||
465 | |||
466 | static struct btrfs_free_space *btrfs_find_free_space_bytes(struct | ||
467 | btrfs_block_group_cache | ||
468 | *block_group, u64 offset, | ||
469 | u64 bytes) | ||
470 | { | ||
471 | struct btrfs_free_space *ret; | ||
472 | |||
473 | mutex_lock(&block_group->alloc_mutex); | ||
474 | |||
475 | ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); | ||
476 | mutex_unlock(&block_group->alloc_mutex); | ||
477 | |||
478 | return ret; | ||
479 | } | ||
480 | #endif | ||
481 | |||
482 | struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache | ||
483 | *block_group, u64 offset, | ||
484 | u64 bytes) | ||
485 | { | ||
486 | struct btrfs_free_space *ret = NULL; | ||
487 | |||
488 | ret = tree_search_offset(&block_group->free_space_offset, offset, | ||
489 | bytes, 0); | ||
490 | if (!ret) | ||
491 | ret = tree_search_bytes(&block_group->free_space_bytes, | ||
492 | offset, bytes); | ||
493 | |||
494 | return ret; | ||
495 | } | ||
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 000000000000..2a020b276768 --- /dev/null +++ b/fs/btrfs/hash.h | |||
@@ -0,0 +1,27 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __HASH__ | ||
20 | #define __HASH__ | ||
21 | |||
22 | #include "crc32c.h" | ||
23 | static inline u64 btrfs_name_hash(const char *name, int len) | ||
24 | { | ||
25 | return btrfs_crc32c((u32)~1, name, len); | ||
26 | } | ||
27 | #endif | ||
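
Directory name hashes, then, are just crc32c over the name, seeded with (u32)~1. A self-contained userspace approximation using the bitwise crc32c algorithm (reversed polynomial 0x82F63B78); whether the value matches the kernel's byte for byte depends on the crc32c library's inversion conventions, so treat the printed number as illustrative, not authoritative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const char *buf, size_t len)
{
	while (len--) {
		crc ^= (unsigned char)*buf++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & (-(crc & 1)));
	}
	return crc;
}

int main(void)
{
	const char *name = "example";
	uint64_t hash = crc32c(~1U, name, strlen(name));	/* seed (u32)~1 */

	printf("name hash of \"%s\" ~ %llu\n", name,
	       (unsigned long long)hash);
	return 0;
}
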
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 000000000000..3d46fa1f29a4 --- /dev/null +++ b/fs/btrfs/inode-item.c | |||
@@ -0,0 +1,206 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include "ctree.h" | ||
20 | #include "disk-io.h" | ||
21 | #include "transaction.h" | ||
22 | |||
23 | static int find_name_in_backref(struct btrfs_path *path, const char *name, | ||
24 | int name_len, struct btrfs_inode_ref **ref_ret) | ||
25 | { | ||
26 | struct extent_buffer *leaf; | ||
27 | struct btrfs_inode_ref *ref; | ||
28 | unsigned long ptr; | ||
29 | unsigned long name_ptr; | ||
30 | u32 item_size; | ||
31 | u32 cur_offset = 0; | ||
32 | int len; | ||
33 | |||
34 | leaf = path->nodes[0]; | ||
35 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
36 | ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); | ||
37 | while (cur_offset < item_size) { | ||
38 | ref = (struct btrfs_inode_ref *)(ptr + cur_offset); | ||
39 | len = btrfs_inode_ref_name_len(leaf, ref); | ||
40 | name_ptr = (unsigned long)(ref + 1); | ||
41 | cur_offset += len + sizeof(*ref); | ||
42 | if (len != name_len) | ||
43 | continue; | ||
44 | if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { | ||
45 | *ref_ret = ref; | ||
46 | return 1; | ||
47 | } | ||
48 | } | ||
49 | return 0; | ||
50 | } | ||
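
find_name_in_backref() walks variable-length (header, name) records packed back to back inside a single btree item, advancing by the header size plus the name length each step. A userspace sketch of the same walking pattern; the record layout here (a bare u16 length) is invented for illustration and is smaller than a real btrfs_inode_ref:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int find_name(const uint8_t *item, uint32_t item_size,
		     const char *name, uint16_t name_len)
{
	uint32_t cur = 0;
	uint16_t nlen;

	while (cur + sizeof(nlen) <= item_size) {
		memcpy(&nlen, item + cur, sizeof(nlen));	/* record header */
		if (nlen == name_len &&
		    !memcmp(item + cur + sizeof(nlen), name, name_len))
			return 1;
		cur += sizeof(nlen) + nlen;	/* skip to the next record */
	}
	return 0;
}

int main(void)
{
	uint8_t item[64];
	uint16_t nlen = 3;
	uint32_t sz = 0;

	memcpy(item + sz, &nlen, sizeof(nlen)); sz += sizeof(nlen);
	memcpy(item + sz, "foo", 3); sz += 3;
	printf("found=%d\n", find_name(item, sz, "foo", 3));
	return 0;
}
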
51 | |||
52 | int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, | ||
53 | struct btrfs_root *root, | ||
54 | const char *name, int name_len, | ||
55 | u64 inode_objectid, u64 ref_objectid, u64 *index) | ||
56 | { | ||
57 | struct btrfs_path *path; | ||
58 | struct btrfs_key key; | ||
59 | struct btrfs_inode_ref *ref; | ||
60 | struct extent_buffer *leaf; | ||
61 | unsigned long ptr; | ||
62 | unsigned long item_start; | ||
63 | u32 item_size; | ||
64 | u32 sub_item_len; | ||
65 | int ret; | ||
66 | int del_len = name_len + sizeof(*ref); | ||
67 | |||
68 | key.objectid = inode_objectid; | ||
69 | key.offset = ref_objectid; | ||
70 | btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); | ||
71 | |||
72 | path = btrfs_alloc_path(); | ||
73 | if (!path) | ||
74 | return -ENOMEM; | ||
75 | |||
76 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
77 | if (ret > 0) { | ||
78 | ret = -ENOENT; | ||
79 | goto out; | ||
80 | } else if (ret < 0) { | ||
81 | goto out; | ||
82 | } | ||
83 | if (!find_name_in_backref(path, name, name_len, &ref)) { | ||
84 | ret = -ENOENT; | ||
85 | goto out; | ||
86 | } | ||
87 | leaf = path->nodes[0]; | ||
88 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
89 | |||
90 | if (index) | ||
91 | *index = btrfs_inode_ref_index(leaf, ref); | ||
92 | |||
93 | if (del_len == item_size) { | ||
94 | ret = btrfs_del_item(trans, root, path); | ||
95 | goto out; | ||
96 | } | ||
97 | ptr = (unsigned long)ref; | ||
98 | sub_item_len = name_len + sizeof(*ref); | ||
99 | item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); | ||
100 | memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, | ||
101 | item_size - (ptr + sub_item_len - item_start)); | ||
102 | ret = btrfs_truncate_item(trans, root, path, | ||
103 | item_size - sub_item_len, 1); | ||
104 | BUG_ON(ret); | ||
105 | out: | ||
106 | btrfs_free_path(path); | ||
107 | return ret; | ||
108 | } | ||
109 | |||
110 | int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, | ||
111 | struct btrfs_root *root, | ||
112 | const char *name, int name_len, | ||
113 | u64 inode_objectid, u64 ref_objectid, u64 index) | ||
114 | { | ||
115 | struct btrfs_path *path; | ||
116 | struct btrfs_key key; | ||
117 | struct btrfs_inode_ref *ref; | ||
118 | unsigned long ptr; | ||
119 | int ret; | ||
120 | int ins_len = name_len + sizeof(*ref); | ||
121 | |||
122 | key.objectid = inode_objectid; | ||
123 | key.offset = ref_objectid; | ||
124 | btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); | ||
125 | |||
126 | path = btrfs_alloc_path(); | ||
127 | if (!path) | ||
128 | return -ENOMEM; | ||
129 | |||
130 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
131 | ins_len); | ||
132 | if (ret == -EEXIST) { | ||
133 | u32 old_size; | ||
134 | |||
135 | if (find_name_in_backref(path, name, name_len, &ref)) | ||
136 | goto out; | ||
137 | |||
138 | old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); | ||
139 | ret = btrfs_extend_item(trans, root, path, ins_len); | ||
140 | BUG_ON(ret); | ||
141 | ref = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
142 | struct btrfs_inode_ref); | ||
143 | ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); | ||
144 | btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); | ||
145 | btrfs_set_inode_ref_index(path->nodes[0], ref, index); | ||
146 | ptr = (unsigned long)(ref + 1); | ||
147 | ret = 0; | ||
148 | } else if (ret < 0) { | ||
149 | goto out; | ||
150 | } else { | ||
151 | ref = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
152 | struct btrfs_inode_ref); | ||
153 | btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); | ||
154 | btrfs_set_inode_ref_index(path->nodes[0], ref, index); | ||
155 | ptr = (unsigned long)(ref + 1); | ||
156 | } | ||
157 | write_extent_buffer(path->nodes[0], name, ptr, name_len); | ||
158 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
159 | |||
160 | out: | ||
161 | btrfs_free_path(path); | ||
162 | return ret; | ||
163 | } | ||
164 | |||
165 | int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, | ||
166 | struct btrfs_root *root, | ||
167 | struct btrfs_path *path, u64 objectid) | ||
168 | { | ||
169 | struct btrfs_key key; | ||
170 | int ret; | ||
171 | key.objectid = objectid; | ||
172 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | ||
173 | key.offset = 0; | ||
174 | |||
175 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
176 | sizeof(struct btrfs_inode_item)); | ||
177 | if (ret == 0 && objectid > root->highest_inode) | ||
178 | root->highest_inode = objectid; | ||
179 | return ret; | ||
180 | } | ||
181 | |||
182 | int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root | ||
183 | *root, struct btrfs_path *path, | ||
184 | struct btrfs_key *location, int mod) | ||
185 | { | ||
186 | int ins_len = mod < 0 ? -1 : 0; | ||
187 | int cow = mod != 0; | ||
188 | int ret; | ||
189 | int slot; | ||
190 | struct extent_buffer *leaf; | ||
191 | struct btrfs_key found_key; | ||
192 | |||
193 | ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); | ||
194 | if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && | ||
195 | location->offset == (u64)-1 && path->slots[0] != 0) { | ||
196 | slot = path->slots[0] - 1; | ||
197 | leaf = path->nodes[0]; | ||
198 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
199 | if (found_key.objectid == location->objectid && | ||
200 | btrfs_key_type(&found_key) == btrfs_key_type(location)) { | ||
201 | path->slots[0]--; | ||
202 | return 0; | ||
203 | } | ||
204 | } | ||
205 | return ret; | ||
206 | } | ||
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 000000000000..2aa79873eb46 --- /dev/null +++ b/fs/btrfs/inode-map.c | |||
@@ -0,0 +1,144 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include "ctree.h" | ||
20 | #include "disk-io.h" | ||
21 | #include "transaction.h" | ||
22 | |||
23 | int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) | ||
24 | { | ||
25 | struct btrfs_path *path; | ||
26 | int ret; | ||
27 | struct extent_buffer *l; | ||
28 | struct btrfs_key search_key; | ||
29 | struct btrfs_key found_key; | ||
30 | int slot; | ||
31 | |||
32 | path = btrfs_alloc_path(); | ||
33 | BUG_ON(!path); | ||
34 | |||
35 | search_key.objectid = BTRFS_LAST_FREE_OBJECTID; | ||
36 | search_key.type = -1; | ||
37 | search_key.offset = (u64)-1; | ||
38 | ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); | ||
39 | if (ret < 0) | ||
40 | goto error; | ||
41 | BUG_ON(ret == 0); | ||
42 | if (path->slots[0] > 0) { | ||
43 | slot = path->slots[0] - 1; | ||
44 | l = path->nodes[0]; | ||
45 | btrfs_item_key_to_cpu(l, &found_key, slot); | ||
46 | *objectid = found_key.objectid; | ||
47 | } else { | ||
48 | *objectid = BTRFS_FIRST_FREE_OBJECTID; | ||
49 | } | ||
50 | ret = 0; | ||
51 | error: | ||
52 | btrfs_free_path(path); | ||
53 | return ret; | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * walks the btree of allocated inodes and finds a hole. | ||
58 | */ | ||
59 | int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, | ||
60 | struct btrfs_root *root, | ||
61 | u64 dirid, u64 *objectid) | ||
62 | { | ||
63 | struct btrfs_path *path; | ||
64 | struct btrfs_key key; | ||
65 | int ret; | ||
66 | int slot = 0; | ||
67 | u64 last_ino = 0; | ||
68 | int start_found; | ||
69 | struct extent_buffer *l; | ||
70 | struct btrfs_key search_key; | ||
71 | u64 search_start = dirid; | ||
72 | |||
73 | mutex_lock(&root->objectid_mutex); | ||
74 | if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && | ||
75 | root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { | ||
76 | *objectid = ++root->last_inode_alloc; | ||
77 | mutex_unlock(&root->objectid_mutex); | ||
78 | return 0; | ||
79 | } | ||
80 | path = btrfs_alloc_path(); | ||
81 | BUG_ON(!path); | ||
82 | search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); | ||
83 | search_key.objectid = search_start; | ||
84 | search_key.type = 0; | ||
85 | search_key.offset = 0; | ||
86 | |||
87 | btrfs_init_path(path); | ||
88 | start_found = 0; | ||
89 | ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); | ||
90 | if (ret < 0) | ||
91 | goto error; | ||
92 | |||
93 | while (1) { | ||
94 | l = path->nodes[0]; | ||
95 | slot = path->slots[0]; | ||
96 | if (slot >= btrfs_header_nritems(l)) { | ||
97 | ret = btrfs_next_leaf(root, path); | ||
98 | if (ret == 0) | ||
99 | continue; | ||
100 | if (ret < 0) | ||
101 | goto error; | ||
102 | if (!start_found) { | ||
103 | *objectid = search_start; | ||
104 | start_found = 1; | ||
105 | goto found; | ||
106 | } | ||
107 | *objectid = last_ino > search_start ? | ||
108 | last_ino : search_start; | ||
109 | goto found; | ||
110 | } | ||
111 | btrfs_item_key_to_cpu(l, &key, slot); | ||
112 | if (key.objectid >= search_start) { | ||
113 | if (start_found) { | ||
114 | if (last_ino < search_start) | ||
115 | last_ino = search_start; | ||
116 | if (key.objectid > last_ino) { | ||
117 | *objectid = last_ino; | ||
118 | goto found; | ||
119 | } | ||
120 | } else if (key.objectid > search_start) { | ||
121 | *objectid = search_start; | ||
122 | goto found; | ||
123 | } | ||
124 | } | ||
125 | if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) | ||
126 | break; | ||
127 | |||
128 | start_found = 1; | ||
129 | last_ino = key.objectid + 1; | ||
130 | path->slots[0]++; | ||
131 | } | ||
132 | BUG_ON(1); | ||
133 | found: | ||
134 | btrfs_release_path(root, path); | ||
135 | btrfs_free_path(path); | ||
136 | BUG_ON(*objectid < search_start); | ||
137 | mutex_unlock(&root->objectid_mutex); | ||
138 | return 0; | ||
139 | error: | ||
140 | btrfs_release_path(root, path); | ||
141 | btrfs_free_path(path); | ||
142 | mutex_unlock(&root->objectid_mutex); | ||
143 | return ret; | ||
144 | } | ||
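
Stripped of the btree mechanics, the search above finds the first unused id at or after a starting point in a sorted sequence of allocated ids. A userspace sketch of that core loop (names and numbers invented for illustration):

#include <stdint.h>
#include <stdio.h>

static uint64_t first_free(const uint64_t *used, int n, uint64_t start)
{
	uint64_t candidate = start;

	for (int i = 0; i < n; i++) {
		if (used[i] < candidate)
			continue;		/* before the search window */
		if (used[i] > candidate)
			break;			/* hole found */
		candidate = used[i] + 1;	/* taken, try the next id */
	}
	return candidate;
}

int main(void)
{
	uint64_t used[] = { 256, 257, 258, 300 };

	/* ids 256..258 are taken, so the first free id >= 256 is 259 */
	printf("%llu\n", (unsigned long long)first_free(used, 4, 256));
	return 0;
}
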
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 000000000000..8adfe059ab41 --- /dev/null +++ b/fs/btrfs/inode.c | |||
@@ -0,0 +1,5035 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/bio.h> | ||
21 | #include <linux/buffer_head.h> | ||
22 | #include <linux/file.h> | ||
23 | #include <linux/fs.h> | ||
24 | #include <linux/pagemap.h> | ||
25 | #include <linux/highmem.h> | ||
26 | #include <linux/time.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/string.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | #include <linux/backing-dev.h> | ||
31 | #include <linux/mpage.h> | ||
32 | #include <linux/swap.h> | ||
33 | #include <linux/writeback.h> | ||
34 | #include <linux/statfs.h> | ||
35 | #include <linux/compat.h> | ||
36 | #include <linux/bit_spinlock.h> | ||
37 | #include <linux/version.h> | ||
38 | #include <linux/xattr.h> | ||
39 | #include <linux/posix_acl.h> | ||
40 | #include <linux/falloc.h> | ||
41 | #include "compat.h" | ||
42 | #include "ctree.h" | ||
43 | #include "disk-io.h" | ||
44 | #include "transaction.h" | ||
45 | #include "btrfs_inode.h" | ||
46 | #include "ioctl.h" | ||
47 | #include "print-tree.h" | ||
48 | #include "volumes.h" | ||
49 | #include "ordered-data.h" | ||
50 | #include "xattr.h" | ||
51 | #include "tree-log.h" | ||
52 | #include "ref-cache.h" | ||
53 | #include "compression.h" | ||
54 | |||
55 | struct btrfs_iget_args { | ||
56 | u64 ino; | ||
57 | struct btrfs_root *root; | ||
58 | }; | ||
59 | |||
60 | static struct inode_operations btrfs_dir_inode_operations; | ||
61 | static struct inode_operations btrfs_symlink_inode_operations; | ||
62 | static struct inode_operations btrfs_dir_ro_inode_operations; | ||
63 | static struct inode_operations btrfs_special_inode_operations; | ||
64 | static struct inode_operations btrfs_file_inode_operations; | ||
65 | static struct address_space_operations btrfs_aops; | ||
66 | static struct address_space_operations btrfs_symlink_aops; | ||
67 | static struct file_operations btrfs_dir_file_operations; | ||
68 | static struct extent_io_ops btrfs_extent_io_ops; | ||
69 | |||
70 | static struct kmem_cache *btrfs_inode_cachep; | ||
71 | struct kmem_cache *btrfs_trans_handle_cachep; | ||
72 | struct kmem_cache *btrfs_transaction_cachep; | ||
73 | struct kmem_cache *btrfs_bit_radix_cachep; | ||
74 | struct kmem_cache *btrfs_path_cachep; | ||
75 | |||
76 | #define S_SHIFT 12 | ||
77 | static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { | ||
78 | [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, | ||
79 | [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, | ||
80 | [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, | ||
81 | [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, | ||
82 | [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, | ||
83 | [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, | ||
84 | [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, | ||
85 | }; | ||
86 | |||
87 | static void btrfs_truncate(struct inode *inode); | ||
88 | static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); | ||
89 | static noinline int cow_file_range(struct inode *inode, | ||
90 | struct page *locked_page, | ||
91 | u64 start, u64 end, int *page_started, | ||
92 | unsigned long *nr_written, int unlock); | ||
93 | |||
94 | /* | ||
95 | * a very lame attempt at stopping writes when the FS is 85% full. There | ||
96 | * are countless ways this is incorrect, but it is better than nothing. | ||
97 | */ | ||
98 | int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, | ||
99 | int for_del) | ||
100 | { | ||
101 | u64 total; | ||
102 | u64 used; | ||
103 | u64 thresh; | ||
104 | int ret = 0; | ||
105 | |||
106 | spin_lock(&root->fs_info->delalloc_lock); | ||
107 | total = btrfs_super_total_bytes(&root->fs_info->super_copy); | ||
108 | used = btrfs_super_bytes_used(&root->fs_info->super_copy); | ||
109 | if (for_del) | ||
110 | thresh = total * 90; | ||
111 | else | ||
112 | thresh = total * 85; | ||
113 | |||
114 | do_div(thresh, 100); | ||
115 | |||
116 | if (used + root->fs_info->delalloc_bytes + num_required > thresh) | ||
117 | ret = -ENOSPC; | ||
118 | spin_unlock(&root->fs_info->delalloc_lock); | ||
119 | return ret; | ||
120 | } | ||
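
In round numbers: on a 100 GiB filesystem the write ceiling is 85 GiB (90 GiB for deletions, which need headroom to make progress). The same arithmetic in plain userspace C with made-up sizes; do_div() exists in the kernel because 32-bit targets lack native 64-bit division:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t total = 100ULL << 30;		/* 100 GiB filesystem */
	uint64_t used = 80ULL << 30;		/* 80 GiB allocated */
	uint64_t delalloc = 2ULL << 30;		/* 2 GiB not yet written */
	uint64_t thresh = total * 85 / 100;	/* 85 GiB write ceiling */

	printf("%s\n", used + delalloc > thresh ? "-ENOSPC" : "ok");
	return 0;
}
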
121 | |||
122 | /* | ||
123 | * this does all the hard work for inserting an inline extent into | ||
124 | * the btree. The caller should have done a btrfs_drop_extents so that | ||
125 | * no overlapping inline items exist in the btree | ||
126 | */ | ||
127 | static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, | ||
128 | struct btrfs_root *root, struct inode *inode, | ||
129 | u64 start, size_t size, size_t compressed_size, | ||
130 | struct page **compressed_pages) | ||
131 | { | ||
132 | struct btrfs_key key; | ||
133 | struct btrfs_path *path; | ||
134 | struct extent_buffer *leaf; | ||
135 | struct page *page = NULL; | ||
136 | char *kaddr; | ||
137 | unsigned long ptr; | ||
138 | struct btrfs_file_extent_item *ei; | ||
139 | int err = 0; | ||
140 | int ret; | ||
141 | size_t cur_size = size; | ||
142 | size_t datasize; | ||
143 | unsigned long offset; | ||
144 | int use_compress = 0; | ||
145 | |||
146 | if (compressed_size && compressed_pages) { | ||
147 | use_compress = 1; | ||
148 | cur_size = compressed_size; | ||
149 | } | ||
150 | |||
151 | path = btrfs_alloc_path(); | ||
152 | if (!path) | ||
153 | return -ENOMEM; | ||
154 | |||
155 | btrfs_set_trans_block_group(trans, inode); | ||
156 | |||
157 | key.objectid = inode->i_ino; | ||
158 | key.offset = start; | ||
159 | btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); | ||
160 | datasize = btrfs_file_extent_calc_inline_size(cur_size); | ||
161 | |||
162 | inode_add_bytes(inode, size); | ||
163 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
164 | datasize); | ||
165 | BUG_ON(ret); | ||
166 | if (ret) { | ||
167 | err = ret; | ||
168 | goto fail; | ||
169 | } | ||
170 | leaf = path->nodes[0]; | ||
171 | ei = btrfs_item_ptr(leaf, path->slots[0], | ||
172 | struct btrfs_file_extent_item); | ||
173 | btrfs_set_file_extent_generation(leaf, ei, trans->transid); | ||
174 | btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); | ||
175 | btrfs_set_file_extent_encryption(leaf, ei, 0); | ||
176 | btrfs_set_file_extent_other_encoding(leaf, ei, 0); | ||
177 | btrfs_set_file_extent_ram_bytes(leaf, ei, size); | ||
178 | ptr = btrfs_file_extent_inline_start(ei); | ||
179 | |||
180 | if (use_compress) { | ||
181 | struct page *cpage; | ||
182 | int i = 0; | ||
183 | while (compressed_size > 0) { | ||
184 | cpage = compressed_pages[i]; | ||
185 | cur_size = min_t(unsigned long, compressed_size, | ||
186 | PAGE_CACHE_SIZE); | ||
187 | |||
188 | kaddr = kmap(cpage); | ||
189 | write_extent_buffer(leaf, kaddr, ptr, cur_size); | ||
190 | kunmap(cpage); | ||
191 | |||
192 | i++; | ||
193 | ptr += cur_size; | ||
194 | compressed_size -= cur_size; | ||
195 | } | ||
196 | btrfs_set_file_extent_compression(leaf, ei, | ||
197 | BTRFS_COMPRESS_ZLIB); | ||
198 | } else { | ||
199 | page = find_get_page(inode->i_mapping, | ||
200 | start >> PAGE_CACHE_SHIFT); | ||
201 | btrfs_set_file_extent_compression(leaf, ei, 0); | ||
202 | kaddr = kmap_atomic(page, KM_USER0); | ||
203 | offset = start & (PAGE_CACHE_SIZE - 1); | ||
204 | write_extent_buffer(leaf, kaddr + offset, ptr, size); | ||
205 | kunmap_atomic(kaddr, KM_USER0); | ||
206 | page_cache_release(page); | ||
207 | } | ||
208 | btrfs_mark_buffer_dirty(leaf); | ||
209 | btrfs_free_path(path); | ||
210 | |||
211 | BTRFS_I(inode)->disk_i_size = inode->i_size; | ||
212 | btrfs_update_inode(trans, root, inode); | ||
213 | return 0; | ||
214 | fail: | ||
215 | btrfs_free_path(path); | ||
216 | return err; | ||
217 | } | ||
218 | |||
219 | |||
220 | /* | ||
221 | * conditionally insert an inline extent into the file. This | ||
222 | * does the checks required to make sure the data is small enough | ||
223 | * to fit as an inline extent. | ||
224 | */ | ||
225 | static int cow_file_range_inline(struct btrfs_trans_handle *trans, | ||
226 | struct btrfs_root *root, | ||
227 | struct inode *inode, u64 start, u64 end, | ||
228 | size_t compressed_size, | ||
229 | struct page **compressed_pages) | ||
230 | { | ||
231 | u64 isize = i_size_read(inode); | ||
232 | u64 actual_end = min(end + 1, isize); | ||
233 | u64 inline_len = actual_end - start; | ||
234 | u64 aligned_end = (end + root->sectorsize - 1) & | ||
235 | ~((u64)root->sectorsize - 1); | ||
236 | u64 hint_byte; | ||
237 | u64 data_len = inline_len; | ||
238 | int ret; | ||
239 | |||
240 | if (compressed_size) | ||
241 | data_len = compressed_size; | ||
242 | |||
243 | if (start > 0 || | ||
244 | actual_end >= PAGE_CACHE_SIZE || | ||
245 | data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || | ||
246 | (!compressed_size && | ||
247 | (actual_end & (root->sectorsize - 1)) == 0) || | ||
248 | end + 1 < isize || | ||
249 | data_len > root->fs_info->max_inline) { | ||
250 | return 1; | ||
251 | } | ||
252 | |||
253 | ret = btrfs_drop_extents(trans, root, inode, start, | ||
254 | aligned_end, start, &hint_byte); | ||
255 | BUG_ON(ret); | ||
256 | |||
257 | if (isize > actual_end) | ||
258 | inline_len = min_t(u64, isize, actual_end); | ||
259 | ret = insert_inline_extent(trans, root, inode, start, | ||
260 | inline_len, compressed_size, | ||
261 | compressed_pages); | ||
262 | BUG_ON(ret); | ||
263 | btrfs_drop_extent_cache(inode, start, aligned_end, 0); | ||
264 | return 0; | ||
265 | } | ||
266 | |||
267 | struct async_extent { | ||
268 | u64 start; | ||
269 | u64 ram_size; | ||
270 | u64 compressed_size; | ||
271 | struct page **pages; | ||
272 | unsigned long nr_pages; | ||
273 | struct list_head list; | ||
274 | }; | ||
275 | |||
276 | struct async_cow { | ||
277 | struct inode *inode; | ||
278 | struct btrfs_root *root; | ||
279 | struct page *locked_page; | ||
280 | u64 start; | ||
281 | u64 end; | ||
282 | struct list_head extents; | ||
283 | struct btrfs_work work; | ||
284 | }; | ||
285 | |||
286 | static noinline int add_async_extent(struct async_cow *cow, | ||
287 | u64 start, u64 ram_size, | ||
288 | u64 compressed_size, | ||
289 | struct page **pages, | ||
290 | unsigned long nr_pages) | ||
291 | { | ||
292 | struct async_extent *async_extent; | ||
293 | |||
294 | async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); | ||
295 | async_extent->start = start; | ||
296 | async_extent->ram_size = ram_size; | ||
297 | async_extent->compressed_size = compressed_size; | ||
298 | async_extent->pages = pages; | ||
299 | async_extent->nr_pages = nr_pages; | ||
300 | list_add_tail(&async_extent->list, &cow->extents); | ||
301 | return 0; | ||
302 | } | ||
303 | |||
304 | /* | ||
305 | * we create compressed extents in two phases. The first | ||
306 | * phase compresses a range of pages that have already been | ||
307 | * locked (both pages and state bits are locked). | ||
308 | * | ||
309 | * This is done inside an ordered work queue, and the compression | ||
310 | * is spread across many cpus. The actual IO submission is step | ||
311 | * two, and the ordered work queue takes care of making sure that | ||
312 | * happens in the same order things were put onto the queue by | ||
313 | * writepages and friends. | ||
314 | * | ||
315 | * If this code finds it can't get good compression, it puts an | ||
316 | * entry onto the work queue to write the uncompressed bytes. This | ||
317 | * makes sure that both compressed inodes and uncompressed inodes | ||
318 | * are written in the same order that pdflush sent them down. | ||
319 | */ | ||
320 | static noinline int compress_file_range(struct inode *inode, | ||
321 | struct page *locked_page, | ||
322 | u64 start, u64 end, | ||
323 | struct async_cow *async_cow, | ||
324 | int *num_added) | ||
325 | { | ||
326 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
327 | struct btrfs_trans_handle *trans; | ||
328 | u64 num_bytes; | ||
329 | u64 orig_start; | ||
330 | u64 disk_num_bytes; | ||
331 | u64 blocksize = root->sectorsize; | ||
332 | u64 actual_end; | ||
333 | u64 isize = i_size_read(inode); | ||
334 | int ret = 0; | ||
335 | struct page **pages = NULL; | ||
336 | unsigned long nr_pages; | ||
337 | unsigned long nr_pages_ret = 0; | ||
338 | unsigned long total_compressed = 0; | ||
339 | unsigned long total_in = 0; | ||
340 | unsigned long max_compressed = 128 * 1024; | ||
341 | unsigned long max_uncompressed = 128 * 1024; | ||
342 | int i; | ||
343 | int will_compress; | ||
344 | |||
345 | orig_start = start; | ||
346 | |||
347 | actual_end = min_t(u64, isize, end + 1); | ||
348 | again: | ||
349 | will_compress = 0; | ||
350 | nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; | ||
351 | nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); | ||
352 | |||
353 | total_compressed = actual_end - start; | ||
354 | |||
355 | /* we want to make sure that the amount of ram required to uncompress | ||
356 | * an extent is reasonable, so we limit the total size in ram | ||
357 | * of a compressed extent to 128k. This is a crucial number | ||
358 | * because it also controls how easily we can spread reads across | ||
359 | * cpus for decompression. | ||
360 | * | ||
361 | * We also want to make sure the amount of IO required to do | ||
362 | * a random read is reasonably small, so we limit the size of | ||
363 | * a compressed extent to 128k. | ||
364 | */ | ||
365 | total_compressed = min(total_compressed, max_uncompressed); | ||
366 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | ||
367 | num_bytes = max(blocksize, num_bytes); | ||
368 | disk_num_bytes = num_bytes; | ||
369 | total_in = 0; | ||
370 | ret = 0; | ||
371 | |||
372 | /* | ||
373 | * we do compression for mount -o compress and when the | ||
374 | * inode has not been flagged as nocompress. This flag can | ||
375 | * change at any time if we discover bad compression ratios. | ||
376 | */ | ||
377 | if (!btrfs_test_flag(inode, NOCOMPRESS) && | ||
378 | btrfs_test_opt(root, COMPRESS)) { | ||
379 | WARN_ON(pages); | ||
380 | pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); | ||
381 | |||
382 | ret = btrfs_zlib_compress_pages(inode->i_mapping, start, | ||
383 | total_compressed, pages, | ||
384 | nr_pages, &nr_pages_ret, | ||
385 | &total_in, | ||
386 | &total_compressed, | ||
387 | max_compressed); | ||
388 | |||
389 | if (!ret) { | ||
390 | unsigned long offset = total_compressed & | ||
391 | (PAGE_CACHE_SIZE - 1); | ||
392 | struct page *page = pages[nr_pages_ret - 1]; | ||
393 | char *kaddr; | ||
394 | |||
395 | /* zero the tail end of the last page, we might be | ||
396 | * sending it down to disk | ||
397 | */ | ||
398 | if (offset) { | ||
399 | kaddr = kmap_atomic(page, KM_USER0); | ||
400 | memset(kaddr + offset, 0, | ||
401 | PAGE_CACHE_SIZE - offset); | ||
402 | kunmap_atomic(kaddr, KM_USER0); | ||
403 | } | ||
404 | will_compress = 1; | ||
405 | } | ||
406 | } | ||
407 | if (start == 0) { | ||
408 | trans = btrfs_join_transaction(root, 1); | ||
409 | BUG_ON(!trans); | ||
410 | btrfs_set_trans_block_group(trans, inode); | ||
411 | |||
412 | /* let's try to make an inline extent */ | ||
413 | if (ret || total_in < (actual_end - start)) { | ||
414 | /* we didn't compress the entire range, try | ||
415 | * to make an uncompressed inline extent. | ||
416 | */ | ||
417 | ret = cow_file_range_inline(trans, root, inode, | ||
418 | start, end, 0, NULL); | ||
419 | } else { | ||
420 | /* try making a compressed inline extent */ | ||
421 | ret = cow_file_range_inline(trans, root, inode, | ||
422 | start, end, | ||
423 | total_compressed, pages); | ||
424 | } | ||
425 | btrfs_end_transaction(trans, root); | ||
426 | if (ret == 0) { | ||
427 | /* | ||
428 | * inline extent creation worked, we don't need | ||
429 | * to create any more async work items. Unlock | ||
430 | * and free up our temp pages. | ||
431 | */ | ||
432 | extent_clear_unlock_delalloc(inode, | ||
433 | &BTRFS_I(inode)->io_tree, | ||
434 | start, end, NULL, 1, 0, | ||
435 | 0, 1, 1, 1); | ||
436 | ret = 0; | ||
437 | goto free_pages_out; | ||
438 | } | ||
439 | } | ||
440 | |||
441 | if (will_compress) { | ||
442 | /* | ||
443 | * we aren't doing an inline extent, so round the compressed size | ||
444 | * up to a block size boundary so the allocator does sane | ||
445 | * things | ||
446 | */ | ||
447 | total_compressed = (total_compressed + blocksize - 1) & | ||
448 | ~(blocksize - 1); | ||
449 | |||
450 | /* | ||
451 | * one last check to make sure the compression is really a | ||
452 | * win, compare the page count read with the blocks on disk | ||
453 | */ | ||
454 | total_in = (total_in + PAGE_CACHE_SIZE - 1) & | ||
455 | ~(PAGE_CACHE_SIZE - 1); | ||
456 | if (total_compressed >= total_in) { | ||
457 | will_compress = 0; | ||
458 | } else { | ||
459 | disk_num_bytes = total_compressed; | ||
460 | num_bytes = total_in; | ||
461 | } | ||
462 | } | ||
463 | if (!will_compress && pages) { | ||
464 | /* | ||
465 | * the compression code ran but failed to make things smaller, | ||
466 | * free any pages it allocated and our page pointer array | ||
467 | */ | ||
468 | for (i = 0; i < nr_pages_ret; i++) { | ||
469 | WARN_ON(pages[i]->mapping); | ||
470 | page_cache_release(pages[i]); | ||
471 | } | ||
472 | kfree(pages); | ||
473 | pages = NULL; | ||
474 | total_compressed = 0; | ||
475 | nr_pages_ret = 0; | ||
476 | |||
477 | /* flag the file so we don't compress in the future */ | ||
478 | btrfs_set_flag(inode, NOCOMPRESS); | ||
479 | } | ||
480 | if (will_compress) { | ||
481 | *num_added += 1; | ||
482 | |||
483 | /* the async work queues will take care of doing actual | ||
484 | * allocation on disk for these compressed pages, | ||
485 | * and will submit them to the elevator. | ||
486 | */ | ||
487 | add_async_extent(async_cow, start, num_bytes, | ||
488 | total_compressed, pages, nr_pages_ret); | ||
489 | |||
490 | if (start + num_bytes < end && start + num_bytes < actual_end) { | ||
491 | start += num_bytes; | ||
492 | pages = NULL; | ||
493 | cond_resched(); | ||
494 | goto again; | ||
495 | } | ||
496 | } else { | ||
497 | /* | ||
498 | * No compression, but we still need to write the pages in | ||
499 | * the file we've been given so far. redirty the locked | ||
500 | * page if it corresponds to our extent and set things up | ||
501 | * for the async work queue to run cow_file_range to do | ||
502 | * the normal delalloc dance | ||
503 | */ | ||
504 | if (page_offset(locked_page) >= start && | ||
505 | page_offset(locked_page) <= end) { | ||
506 | __set_page_dirty_nobuffers(locked_page); | ||
507 | /* unlocked later on in the async handlers */ | ||
508 | } | ||
509 | add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); | ||
510 | *num_added += 1; | ||
511 | } | ||
512 | |||
513 | out: | ||
514 | return 0; | ||
515 | |||
516 | free_pages_out: | ||
517 | for (i = 0; i < nr_pages_ret; i++) { | ||
518 | WARN_ON(pages[i]->mapping); | ||
519 | page_cache_release(pages[i]); | ||
520 | } | ||
521 | kfree(pages); | ||
522 | |||
523 | goto out; | ||
524 | } | ||
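
The win/lose test above rounds the compressed size up to a block and the uncompressed size up to a page before comparing, so compression only sticks when it actually saves disk blocks. A userspace sketch of that decision using zlib's one-shot API (the API is real; the buffer contents and sizes below are made up; link with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define BLOCKSIZE 4096UL
#define PAGESIZE  4096UL

int main(void)
{
	static unsigned char in[128 * 1024], out[160 * 1024];
	uLongf out_len = sizeof(out);

	memset(in, 'a', sizeof(in));	/* highly compressible input */
	if (compress2(out, &out_len, in, sizeof(in),
		      Z_DEFAULT_COMPRESSION) != Z_OK)
		return 1;

	/* round output to the block size, input to the page size */
	unsigned long disk = (out_len + BLOCKSIZE - 1) & ~(BLOCKSIZE - 1);
	unsigned long ram = (sizeof(in) + PAGESIZE - 1) & ~(PAGESIZE - 1);

	printf("in=%lu out=%lu -> %s\n", ram, disk,
	       disk >= ram ? "store uncompressed" : "store compressed");
	return 0;
}
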
525 | |||
526 | /* | ||
527 | * phase two of compressed writeback. This is the ordered portion | ||
528 | * of the code, which only gets called in the order the work was | ||
529 | * queued. We walk all the async extents created by compress_file_range | ||
530 | * and send them down to the disk. | ||
531 | */ | ||
532 | static noinline int submit_compressed_extents(struct inode *inode, | ||
533 | struct async_cow *async_cow) | ||
534 | { | ||
535 | struct async_extent *async_extent; | ||
536 | u64 alloc_hint = 0; | ||
537 | struct btrfs_trans_handle *trans; | ||
538 | struct btrfs_key ins; | ||
539 | struct extent_map *em; | ||
540 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
541 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
542 | struct extent_io_tree *io_tree; | ||
543 | int ret; | ||
544 | |||
545 | if (list_empty(&async_cow->extents)) | ||
546 | return 0; | ||
547 | |||
548 | trans = btrfs_join_transaction(root, 1); | ||
549 | |||
550 | while (!list_empty(&async_cow->extents)) { | ||
551 | async_extent = list_entry(async_cow->extents.next, | ||
552 | struct async_extent, list); | ||
553 | list_del(&async_extent->list); | ||
554 | |||
555 | io_tree = &BTRFS_I(inode)->io_tree; | ||
556 | |||
557 | /* did the compression code fall back to uncompressed IO? */ | ||
558 | if (!async_extent->pages) { | ||
559 | int page_started = 0; | ||
560 | unsigned long nr_written = 0; | ||
561 | |||
562 | lock_extent(io_tree, async_extent->start, | ||
563 | async_extent->start + | ||
564 | async_extent->ram_size - 1, GFP_NOFS); | ||
565 | |||
566 | /* allocate blocks */ | ||
567 | cow_file_range(inode, async_cow->locked_page, | ||
568 | async_extent->start, | ||
569 | async_extent->start + | ||
570 | async_extent->ram_size - 1, | ||
571 | &page_started, &nr_written, 0); | ||
572 | |||
573 | /* | ||
574 | * if page_started, cow_file_range inserted an | ||
575 | * inline extent and took care of all the unlocking | ||
576 | * and IO for us. Otherwise, we need to submit | ||
577 | * all those pages down to the drive. | ||
578 | */ | ||
579 | if (!page_started) | ||
580 | extent_write_locked_range(io_tree, | ||
581 | inode, async_extent->start, | ||
582 | async_extent->start + | ||
583 | async_extent->ram_size - 1, | ||
584 | btrfs_get_extent, | ||
585 | WB_SYNC_ALL); | ||
586 | kfree(async_extent); | ||
587 | cond_resched(); | ||
588 | continue; | ||
589 | } | ||
590 | |||
591 | lock_extent(io_tree, async_extent->start, | ||
592 | async_extent->start + async_extent->ram_size - 1, | ||
593 | GFP_NOFS); | ||
594 | /* | ||
595 | * here we're doing allocation and writeback of the | ||
596 | * compressed pages | ||
597 | */ | ||
598 | btrfs_drop_extent_cache(inode, async_extent->start, | ||
599 | async_extent->start + | ||
600 | async_extent->ram_size - 1, 0); | ||
601 | |||
602 | ret = btrfs_reserve_extent(trans, root, | ||
603 | async_extent->compressed_size, | ||
604 | async_extent->compressed_size, | ||
605 | 0, alloc_hint, | ||
606 | (u64)-1, &ins, 1); | ||
607 | BUG_ON(ret); | ||
608 | em = alloc_extent_map(GFP_NOFS); | ||
609 | em->start = async_extent->start; | ||
610 | em->len = async_extent->ram_size; | ||
611 | em->orig_start = em->start; | ||
612 | |||
613 | em->block_start = ins.objectid; | ||
614 | em->block_len = ins.offset; | ||
615 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
616 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
617 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | ||
618 | |||
619 | while (1) { | ||
620 | spin_lock(&em_tree->lock); | ||
621 | ret = add_extent_mapping(em_tree, em); | ||
622 | spin_unlock(&em_tree->lock); | ||
623 | if (ret != -EEXIST) { | ||
624 | free_extent_map(em); | ||
625 | break; | ||
626 | } | ||
627 | btrfs_drop_extent_cache(inode, async_extent->start, | ||
628 | async_extent->start + | ||
629 | async_extent->ram_size - 1, 0); | ||
630 | } | ||
631 | |||
632 | ret = btrfs_add_ordered_extent(inode, async_extent->start, | ||
633 | ins.objectid, | ||
634 | async_extent->ram_size, | ||
635 | ins.offset, | ||
636 | BTRFS_ORDERED_COMPRESSED); | ||
637 | BUG_ON(ret); | ||
638 | |||
639 | btrfs_end_transaction(trans, root); | ||
640 | |||
641 | /* | ||
642 | * clear dirty, set writeback and unlock the pages. | ||
643 | */ | ||
644 | extent_clear_unlock_delalloc(inode, | ||
645 | &BTRFS_I(inode)->io_tree, | ||
646 | async_extent->start, | ||
647 | async_extent->start + | ||
648 | async_extent->ram_size - 1, | ||
649 | NULL, 1, 1, 0, 1, 1, 0); | ||
650 | |||
651 | ret = btrfs_submit_compressed_write(inode, | ||
652 | async_extent->start, | ||
653 | async_extent->ram_size, | ||
654 | ins.objectid, | ||
655 | ins.offset, async_extent->pages, | ||
656 | async_extent->nr_pages); | ||
657 | |||
658 | BUG_ON(ret); | ||
659 | trans = btrfs_join_transaction(root, 1); | ||
660 | alloc_hint = ins.objectid + ins.offset; | ||
661 | kfree(async_extent); | ||
662 | cond_resched(); | ||
663 | } | ||
664 | |||
665 | btrfs_end_transaction(trans, root); | ||
666 | return 0; | ||
667 | } | ||
668 | |||
669 | /* | ||
670 | * when extent_io.c finds a delayed allocation range in the file, | ||
671 | * the callbacks end up in this code. The basic idea is to | ||
672 | * allocate extents on disk for the range, and create ordered data structs | ||
673 | * in ram to track those extents. | ||
674 | * | ||
675 | * locked_page is the page that writepage had locked already. We use | ||
676 | * it to make sure we don't do extra locks or unlocks. | ||
677 | * | ||
678 | * *page_started is set to one if we unlock locked_page and do everything | ||
679 | * required to start IO on it. It may be clean and already done with | ||
680 | * IO when we return. | ||
681 | */ | ||
682 | static noinline int cow_file_range(struct inode *inode, | ||
683 | struct page *locked_page, | ||
684 | u64 start, u64 end, int *page_started, | ||
685 | unsigned long *nr_written, | ||
686 | int unlock) | ||
687 | { | ||
688 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
689 | struct btrfs_trans_handle *trans; | ||
690 | u64 alloc_hint = 0; | ||
691 | u64 num_bytes; | ||
692 | unsigned long ram_size; | ||
693 | u64 disk_num_bytes; | ||
694 | u64 cur_alloc_size; | ||
695 | u64 blocksize = root->sectorsize; | ||
696 | u64 actual_end; | ||
697 | u64 isize = i_size_read(inode); | ||
698 | struct btrfs_key ins; | ||
699 | struct extent_map *em; | ||
700 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
701 | int ret = 0; | ||
702 | |||
703 | trans = btrfs_join_transaction(root, 1); | ||
704 | BUG_ON(!trans); | ||
705 | btrfs_set_trans_block_group(trans, inode); | ||
706 | |||
707 | actual_end = min_t(u64, isize, end + 1); | ||
708 | |||
709 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | ||
710 | num_bytes = max(blocksize, num_bytes); | ||
711 | disk_num_bytes = num_bytes; | ||
712 | ret = 0; | ||
713 | |||
714 | if (start == 0) { | ||
715 | /* let's try to make an inline extent */ | ||
716 | ret = cow_file_range_inline(trans, root, inode, | ||
717 | start, end, 0, NULL); | ||
718 | if (ret == 0) { | ||
719 | extent_clear_unlock_delalloc(inode, | ||
720 | &BTRFS_I(inode)->io_tree, | ||
721 | start, end, NULL, 1, 1, | ||
722 | 1, 1, 1, 1); | ||
723 | *nr_written = *nr_written + | ||
724 | (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; | ||
725 | *page_started = 1; | ||
726 | ret = 0; | ||
727 | goto out; | ||
728 | } | ||
729 | } | ||
730 | |||
731 | BUG_ON(disk_num_bytes > | ||
732 | btrfs_super_total_bytes(&root->fs_info->super_copy)); | ||
733 | |||
734 | btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); | ||
735 | |||
736 | while (disk_num_bytes > 0) { | ||
737 | cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); | ||
738 | ret = btrfs_reserve_extent(trans, root, cur_alloc_size, | ||
739 | root->sectorsize, 0, alloc_hint, | ||
740 | (u64)-1, &ins, 1); | ||
741 | BUG_ON(ret); | ||
742 | |||
743 | em = alloc_extent_map(GFP_NOFS); | ||
744 | em->start = start; | ||
745 | em->orig_start = em->start; | ||
746 | |||
747 | ram_size = ins.offset; | ||
748 | em->len = ins.offset; | ||
749 | |||
750 | em->block_start = ins.objectid; | ||
751 | em->block_len = ins.offset; | ||
752 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
753 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
754 | |||
755 | while (1) { | ||
756 | spin_lock(&em_tree->lock); | ||
757 | ret = add_extent_mapping(em_tree, em); | ||
758 | spin_unlock(&em_tree->lock); | ||
759 | if (ret != -EEXIST) { | ||
760 | free_extent_map(em); | ||
761 | break; | ||
762 | } | ||
763 | btrfs_drop_extent_cache(inode, start, | ||
764 | start + ram_size - 1, 0); | ||
765 | } | ||
766 | |||
767 | cur_alloc_size = ins.offset; | ||
768 | ret = btrfs_add_ordered_extent(inode, start, ins.objectid, | ||
769 | ram_size, cur_alloc_size, 0); | ||
770 | BUG_ON(ret); | ||
771 | |||
772 | if (root->root_key.objectid == | ||
773 | BTRFS_DATA_RELOC_TREE_OBJECTID) { | ||
774 | ret = btrfs_reloc_clone_csums(inode, start, | ||
775 | cur_alloc_size); | ||
776 | BUG_ON(ret); | ||
777 | } | ||
778 | |||
779 | if (disk_num_bytes < cur_alloc_size) | ||
780 | break; | ||
781 | |||
782 | /* we're not doing compressed IO, don't unlock the first | ||
783 | * page (which the caller expects to stay locked), don't | ||
784 | * clear any dirty bits and don't set any writeback bits | ||
785 | */ | ||
786 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, | ||
787 | start, start + ram_size - 1, | ||
788 | locked_page, unlock, 1, | ||
789 | 1, 0, 0, 0); | ||
790 | disk_num_bytes -= cur_alloc_size; | ||
791 | num_bytes -= cur_alloc_size; | ||
792 | alloc_hint = ins.objectid + ins.offset; | ||
793 | start += cur_alloc_size; | ||
794 | } | ||
795 | out: | ||
796 | ret = 0; | ||
797 | btrfs_end_transaction(trans, root); | ||
798 | |||
799 | return ret; | ||
800 | } | ||
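The `(end - start + blocksize) & ~(blocksize - 1)` round-up near the top of cow_file_range() works because `end` is inclusive and btrfs sector sizes are powers of two: `end - start + blocksize` equals `(bytes in range) + blocksize - 1`, so the mask rounds up to whole sectors. A minimal userspace sketch of just that arithmetic (a model, not kernel code):

    /* standalone model of the cow_file_range() round-up; assumes a
     * power-of-two blocksize, as btrfs requires for sectors */
    #include <assert.h>
    #include <stdint.h>

    static uint64_t delalloc_bytes_needed(uint64_t start, uint64_t end,
                                          uint64_t blocksize)
    {
            /* [start, end] is inclusive, so it covers end - start + 1 bytes */
            uint64_t num_bytes = (end - start + blocksize) & ~(blocksize - 1);

            return num_bytes > blocksize ? num_bytes : blocksize;
    }

    int main(void)
    {
            assert(delalloc_bytes_needed(0, 0, 4096) == 4096);    /* 1 byte */
            assert(delalloc_bytes_needed(0, 4095, 4096) == 4096); /* 1 sector */
            assert(delalloc_bytes_needed(0, 4096, 4096) == 8192); /* 2 sectors */
            return 0;
    }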
801 | |||
802 | /* | ||
803 | * work queue callback to start compression on a file and pages | ||
804 | */ | ||
805 | static noinline void async_cow_start(struct btrfs_work *work) | ||
806 | { | ||
807 | struct async_cow *async_cow; | ||
808 | int num_added = 0; | ||
809 | async_cow = container_of(work, struct async_cow, work); | ||
810 | |||
811 | compress_file_range(async_cow->inode, async_cow->locked_page, | ||
812 | async_cow->start, async_cow->end, async_cow, | ||
813 | &num_added); | ||
814 | if (num_added == 0) | ||
815 | async_cow->inode = NULL; | ||
816 | } | ||
817 | |||
818 | /* | ||
819 | * work queue callback to submit previously compressed pages | ||
820 | */ | ||
821 | static noinline void async_cow_submit(struct btrfs_work *work) | ||
822 | { | ||
823 | struct async_cow *async_cow; | ||
824 | struct btrfs_root *root; | ||
825 | unsigned long nr_pages; | ||
826 | |||
827 | async_cow = container_of(work, struct async_cow, work); | ||
828 | |||
829 | root = async_cow->root; | ||
830 | nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> | ||
831 | PAGE_CACHE_SHIFT; | ||
832 | |||
833 | atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); | ||
834 | |||
835 | if (atomic_read(&root->fs_info->async_delalloc_pages) < | ||
836 | 5 * 1024 * 1024 && | ||
837 | waitqueue_active(&root->fs_info->async_submit_wait)) | ||
838 | wake_up(&root->fs_info->async_submit_wait); | ||
839 | |||
840 | if (async_cow->inode) | ||
841 | submit_compressed_extents(async_cow->inode, async_cow); | ||
842 | } | ||
843 | |||
844 | static noinline void async_cow_free(struct btrfs_work *work) | ||
845 | { | ||
846 | struct async_cow *async_cow; | ||
847 | async_cow = container_of(work, struct async_cow, work); | ||
848 | kfree(async_cow); | ||
849 | } | ||
850 | |||
851 | static int cow_file_range_async(struct inode *inode, struct page *locked_page, | ||
852 | u64 start, u64 end, int *page_started, | ||
853 | unsigned long *nr_written) | ||
854 | { | ||
855 | struct async_cow *async_cow; | ||
856 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
857 | unsigned long nr_pages; | ||
858 | u64 cur_end; | ||
859 | int limit = 10 * 1024 * 1024; | ||
860 | |||
861 | if (!btrfs_test_opt(root, COMPRESS)) { | ||
862 | return cow_file_range(inode, locked_page, start, end, | ||
863 | page_started, nr_written, 1); | ||
864 | } | ||
865 | |||
866 | clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | | ||
867 | EXTENT_DELALLOC, 1, 0, GFP_NOFS); | ||
868 | while (start < end) { | ||
869 | async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); | ||
870 | async_cow->inode = inode; | ||
871 | async_cow->root = root; | ||
872 | async_cow->locked_page = locked_page; | ||
873 | async_cow->start = start; | ||
874 | |||
875 | if (btrfs_test_flag(inode, NOCOMPRESS)) | ||
876 | cur_end = end; | ||
877 | else | ||
878 | cur_end = min(end, start + 512 * 1024 - 1); | ||
879 | |||
880 | async_cow->end = cur_end; | ||
881 | INIT_LIST_HEAD(&async_cow->extents); | ||
882 | |||
883 | async_cow->work.func = async_cow_start; | ||
884 | async_cow->work.ordered_func = async_cow_submit; | ||
885 | async_cow->work.ordered_free = async_cow_free; | ||
886 | async_cow->work.flags = 0; | ||
887 | |||
888 | nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> | ||
889 | PAGE_CACHE_SHIFT; | ||
890 | atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); | ||
891 | |||
892 | btrfs_queue_worker(&root->fs_info->delalloc_workers, | ||
893 | &async_cow->work); | ||
894 | |||
895 | if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { | ||
896 | wait_event(root->fs_info->async_submit_wait, | ||
897 | (atomic_read(&root->fs_info->async_delalloc_pages) < | ||
898 | limit)); | ||
899 | } | ||
900 | |||
901 | while (atomic_read(&root->fs_info->async_submit_draining) && | ||
902 | atomic_read(&root->fs_info->async_delalloc_pages)) { | ||
903 | wait_event(root->fs_info->async_submit_wait, | ||
904 | (atomic_read(&root->fs_info->async_delalloc_pages) == | ||
905 | 0)); | ||
906 | } | ||
907 | |||
908 | *nr_written += nr_pages; | ||
909 | start = cur_end + 1; | ||
910 | } | ||
911 | *page_started = 1; | ||
912 | return 0; | ||
913 | } | ||
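cow_file_range_async() carves the delalloc range into inclusive chunks of at most 512K (or the whole range at once when the inode is flagged NOCOMPRESS) and queues one async_cow work item per chunk, throttling once too many pages are in flight. A simplified sketch of just the chunking, with submit_chunk() as a hypothetical stand-in for building an async_cow and queueing it on delalloc_workers:

    /* chunking model only; the async machinery is stubbed out */
    #include <stdint.h>
    #include <stdio.h>

    #define CHUNK (512 * 1024ULL)

    static void submit_chunk(uint64_t start, uint64_t end)
    {
            printf("chunk [%llu, %llu]\n", (unsigned long long)start,
                   (unsigned long long)end);
    }

    static void split_delalloc(uint64_t start, uint64_t end, int nocompress)
    {
            while (start < end) {
                    /* inclusive end, mirroring min(end, start + 512K - 1) */
                    uint64_t cur_end = end;

                    if (!nocompress && start + CHUNK - 1 < end)
                            cur_end = start + CHUNK - 1;
                    submit_chunk(start, cur_end);
                    start = cur_end + 1;
            }
    }

    int main(void)
    {
            split_delalloc(0, 3 * CHUNK / 2, 0);    /* emits two chunks */
            return 0;
    }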
914 | |||
915 | static noinline int csum_exist_in_range(struct btrfs_root *root, | ||
916 | u64 bytenr, u64 num_bytes) | ||
917 | { | ||
918 | int ret; | ||
919 | struct btrfs_ordered_sum *sums; | ||
920 | LIST_HEAD(list); | ||
921 | |||
922 | ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, | ||
923 | bytenr + num_bytes - 1, &list); | ||
924 | if (ret == 0 && list_empty(&list)) | ||
925 | return 0; | ||
926 | |||
927 | while (!list_empty(&list)) { | ||
928 | sums = list_entry(list.next, struct btrfs_ordered_sum, list); | ||
929 | list_del(&sums->list); | ||
930 | kfree(sums); | ||
931 | } | ||
932 | return 1; | ||
933 | } | ||
934 | |||
935 | /* | ||
936 | * the nocow writeback callback. This checks for snapshots or COW copies | ||
937 | * of the extents that exist in the file, and COWs the file as required. | ||
938 | * | ||
939 | * If no cow copies or snapshots exist, we write directly to the existing | ||
940 | * blocks on disk | ||
941 | */ | ||
942 | static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, | ||
943 | u64 start, u64 end, int *page_started, int force, | ||
944 | unsigned long *nr_written) | ||
945 | { | ||
946 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
947 | struct btrfs_trans_handle *trans; | ||
948 | struct extent_buffer *leaf; | ||
949 | struct btrfs_path *path; | ||
950 | struct btrfs_file_extent_item *fi; | ||
951 | struct btrfs_key found_key; | ||
952 | u64 cow_start; | ||
953 | u64 cur_offset; | ||
954 | u64 extent_end; | ||
955 | u64 disk_bytenr; | ||
956 | u64 num_bytes; | ||
957 | int extent_type; | ||
958 | int ret; | ||
959 | int type; | ||
960 | int nocow; | ||
961 | int check_prev = 1; | ||
962 | |||
963 | path = btrfs_alloc_path(); | ||
964 | BUG_ON(!path); | ||
965 | trans = btrfs_join_transaction(root, 1); | ||
966 | BUG_ON(!trans); | ||
967 | |||
968 | cow_start = (u64)-1; | ||
969 | cur_offset = start; | ||
970 | while (1) { | ||
971 | ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, | ||
972 | cur_offset, 0); | ||
973 | BUG_ON(ret < 0); | ||
974 | if (ret > 0 && path->slots[0] > 0 && check_prev) { | ||
975 | leaf = path->nodes[0]; | ||
976 | btrfs_item_key_to_cpu(leaf, &found_key, | ||
977 | path->slots[0] - 1); | ||
978 | if (found_key.objectid == inode->i_ino && | ||
979 | found_key.type == BTRFS_EXTENT_DATA_KEY) | ||
980 | path->slots[0]--; | ||
981 | } | ||
982 | check_prev = 0; | ||
983 | next_slot: | ||
984 | leaf = path->nodes[0]; | ||
985 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
986 | ret = btrfs_next_leaf(root, path); | ||
987 | if (ret < 0) | ||
988 | BUG_ON(1); | ||
989 | if (ret > 0) | ||
990 | break; | ||
991 | leaf = path->nodes[0]; | ||
992 | } | ||
993 | |||
994 | nocow = 0; | ||
995 | disk_bytenr = 0; | ||
996 | num_bytes = 0; | ||
997 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
998 | |||
999 | if (found_key.objectid > inode->i_ino || | ||
1000 | found_key.type > BTRFS_EXTENT_DATA_KEY || | ||
1001 | found_key.offset > end) | ||
1002 | break; | ||
1003 | |||
1004 | if (found_key.offset > cur_offset) { | ||
1005 | extent_end = found_key.offset; | ||
1006 | goto out_check; | ||
1007 | } | ||
1008 | |||
1009 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
1010 | struct btrfs_file_extent_item); | ||
1011 | extent_type = btrfs_file_extent_type(leaf, fi); | ||
1012 | |||
1013 | if (extent_type == BTRFS_FILE_EXTENT_REG || | ||
1014 | extent_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
1015 | disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
1016 | extent_end = found_key.offset + | ||
1017 | btrfs_file_extent_num_bytes(leaf, fi); | ||
1018 | if (extent_end <= start) { | ||
1019 | path->slots[0]++; | ||
1020 | goto next_slot; | ||
1021 | } | ||
1022 | if (disk_bytenr == 0) | ||
1023 | goto out_check; | ||
1024 | if (btrfs_file_extent_compression(leaf, fi) || | ||
1025 | btrfs_file_extent_encryption(leaf, fi) || | ||
1026 | btrfs_file_extent_other_encoding(leaf, fi)) | ||
1027 | goto out_check; | ||
1028 | if (extent_type == BTRFS_FILE_EXTENT_REG && !force) | ||
1029 | goto out_check; | ||
1030 | if (btrfs_extent_readonly(root, disk_bytenr)) | ||
1031 | goto out_check; | ||
1032 | if (btrfs_cross_ref_exist(trans, root, inode->i_ino, | ||
1033 | disk_bytenr)) | ||
1034 | goto out_check; | ||
1035 | disk_bytenr += btrfs_file_extent_offset(leaf, fi); | ||
1036 | disk_bytenr += cur_offset - found_key.offset; | ||
1037 | num_bytes = min(end + 1, extent_end) - cur_offset; | ||
1038 | /* | ||
1039 | * force cow if csum exists in the range. | ||
1040 | * this ensures that csums for a given extent are | ||
1041 | * either valid or do not exist. | ||
1042 | */ | ||
1043 | if (csum_exist_in_range(root, disk_bytenr, num_bytes)) | ||
1044 | goto out_check; | ||
1045 | nocow = 1; | ||
1046 | } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { | ||
1047 | extent_end = found_key.offset + | ||
1048 | btrfs_file_extent_inline_len(leaf, fi); | ||
1049 | extent_end = ALIGN(extent_end, root->sectorsize); | ||
1050 | } else { | ||
1051 | BUG_ON(1); | ||
1052 | } | ||
1053 | out_check: | ||
1054 | if (extent_end <= start) { | ||
1055 | path->slots[0]++; | ||
1056 | goto next_slot; | ||
1057 | } | ||
1058 | if (!nocow) { | ||
1059 | if (cow_start == (u64)-1) | ||
1060 | cow_start = cur_offset; | ||
1061 | cur_offset = extent_end; | ||
1062 | if (cur_offset > end) | ||
1063 | break; | ||
1064 | path->slots[0]++; | ||
1065 | goto next_slot; | ||
1066 | } | ||
1067 | |||
1068 | btrfs_release_path(root, path); | ||
1069 | if (cow_start != (u64)-1) { | ||
1070 | ret = cow_file_range(inode, locked_page, cow_start, | ||
1071 | found_key.offset - 1, page_started, | ||
1072 | nr_written, 1); | ||
1073 | BUG_ON(ret); | ||
1074 | cow_start = (u64)-1; | ||
1075 | } | ||
1076 | |||
1077 | if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
1078 | struct extent_map *em; | ||
1079 | struct extent_map_tree *em_tree; | ||
1080 | em_tree = &BTRFS_I(inode)->extent_tree; | ||
1081 | em = alloc_extent_map(GFP_NOFS); | ||
1082 | em->start = cur_offset; | ||
1083 | em->orig_start = em->start; | ||
1084 | em->len = num_bytes; | ||
1085 | em->block_len = num_bytes; | ||
1086 | em->block_start = disk_bytenr; | ||
1087 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
1088 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
1089 | while (1) { | ||
1090 | spin_lock(&em_tree->lock); | ||
1091 | ret = add_extent_mapping(em_tree, em); | ||
1092 | spin_unlock(&em_tree->lock); | ||
1093 | if (ret != -EEXIST) { | ||
1094 | free_extent_map(em); | ||
1095 | break; | ||
1096 | } | ||
1097 | btrfs_drop_extent_cache(inode, em->start, | ||
1098 | em->start + em->len - 1, 0); | ||
1099 | } | ||
1100 | type = BTRFS_ORDERED_PREALLOC; | ||
1101 | } else { | ||
1102 | type = BTRFS_ORDERED_NOCOW; | ||
1103 | } | ||
1104 | |||
1105 | ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, | ||
1106 | num_bytes, num_bytes, type); | ||
1107 | BUG_ON(ret); | ||
1108 | |||
1109 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, | ||
1110 | cur_offset, cur_offset + num_bytes - 1, | ||
1111 | locked_page, 1, 1, 1, 0, 0, 0); | ||
1112 | cur_offset = extent_end; | ||
1113 | if (cur_offset > end) | ||
1114 | break; | ||
1115 | } | ||
1116 | btrfs_release_path(root, path); | ||
1117 | |||
1118 | if (cur_offset <= end && cow_start == (u64)-1) | ||
1119 | cow_start = cur_offset; | ||
1120 | if (cow_start != (u64)-1) { | ||
1121 | ret = cow_file_range(inode, locked_page, cow_start, end, | ||
1122 | page_started, nr_written, 1); | ||
1123 | BUG_ON(ret); | ||
1124 | } | ||
1125 | |||
1126 | ret = btrfs_end_transaction(trans, root); | ||
1127 | BUG_ON(ret); | ||
1128 | btrfs_free_path(path); | ||
1129 | return 0; | ||
1130 | } | ||
1131 | |||
1132 | /* | ||
1133 | * extent_io.c call back to do delayed allocation processing | ||
1134 | */ | ||
1135 | static int run_delalloc_range(struct inode *inode, struct page *locked_page, | ||
1136 | u64 start, u64 end, int *page_started, | ||
1137 | unsigned long *nr_written) | ||
1138 | { | ||
1139 | int ret; | ||
1140 | |||
1141 | if (btrfs_test_flag(inode, NODATACOW)) | ||
1142 | ret = run_delalloc_nocow(inode, locked_page, start, end, | ||
1143 | page_started, 1, nr_written); | ||
1144 | else if (btrfs_test_flag(inode, PREALLOC)) | ||
1145 | ret = run_delalloc_nocow(inode, locked_page, start, end, | ||
1146 | page_started, 0, nr_written); | ||
1147 | else | ||
1148 | ret = cow_file_range_async(inode, locked_page, start, end, | ||
1149 | page_started, nr_written); | ||
1150 | |||
1151 | return ret; | ||
1152 | } | ||
1153 | |||
1154 | /* | ||
1155 | * extent_io.c set_bit_hook, used to track delayed allocation | ||
1156 | * bytes in this file, and to maintain the list of inodes that | ||
1157 | * have pending delalloc work to be done. | ||
1158 | */ | ||
1159 | static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, | ||
1160 | unsigned long old, unsigned long bits) | ||
1161 | { | ||
1162 | /* | ||
1163 | * set_bit and clear_bit hooks normally require _irqsave/restore | ||
1164 | * but in this case, we are only testing for the DELALLOC | ||
1165 | * bit, which is only set or cleared with irqs on | ||
1166 | */ | ||
1167 | if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { | ||
1168 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1169 | spin_lock(&root->fs_info->delalloc_lock); | ||
1170 | BTRFS_I(inode)->delalloc_bytes += end - start + 1; | ||
1171 | root->fs_info->delalloc_bytes += end - start + 1; | ||
1172 | if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | ||
1173 | list_add_tail(&BTRFS_I(inode)->delalloc_inodes, | ||
1174 | &root->fs_info->delalloc_inodes); | ||
1175 | } | ||
1176 | spin_unlock(&root->fs_info->delalloc_lock); | ||
1177 | } | ||
1178 | return 0; | ||
1179 | } | ||
1180 | |||
1181 | /* | ||
1182 | * extent_io.c clear_bit_hook, see set_bit_hook for why | ||
1183 | */ | ||
1184 | static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, | ||
1185 | unsigned long old, unsigned long bits) | ||
1186 | { | ||
1187 | /* | ||
1188 | * set_bit and clear_bit hooks normally require _irqsave/restore | ||
1189 | * but in this case, we are only testing for the DELALLOC | ||
1190 | * bit, which is only set or cleared with irqs on | ||
1191 | */ | ||
1192 | if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { | ||
1193 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1194 | |||
1195 | spin_lock(&root->fs_info->delalloc_lock); | ||
1196 | if (end - start + 1 > root->fs_info->delalloc_bytes) { | ||
1197 | printk(KERN_INFO "btrfs warning: delalloc account " | ||
1198 | "%llu %llu\n", | ||
1199 | (unsigned long long)end - start + 1, | ||
1200 | (unsigned long long) | ||
1201 | root->fs_info->delalloc_bytes); | ||
1202 | root->fs_info->delalloc_bytes = 0; | ||
1203 | BTRFS_I(inode)->delalloc_bytes = 0; | ||
1204 | } else { | ||
1205 | root->fs_info->delalloc_bytes -= end - start + 1; | ||
1206 | BTRFS_I(inode)->delalloc_bytes -= end - start + 1; | ||
1207 | } | ||
1208 | if (BTRFS_I(inode)->delalloc_bytes == 0 && | ||
1209 | !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | ||
1210 | list_del_init(&BTRFS_I(inode)->delalloc_inodes); | ||
1211 | } | ||
1212 | spin_unlock(&root->fs_info->delalloc_lock); | ||
1213 | } | ||
1214 | return 0; | ||
1215 | } | ||
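These two hooks are a matched pair: set adds `end - start + 1` to the per-inode and per-fs delalloc counters under delalloc_lock, and clear subtracts it, zeroing with a warning instead of letting the u64 wrap if the accounting ever goes negative. A userspace model of that discipline (one counter and a pthread mutex standing in for the spinlock):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t delalloc_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t delalloc_bytes;

    static void delalloc_set(uint64_t start, uint64_t end)
    {
            pthread_mutex_lock(&delalloc_lock);
            delalloc_bytes += end - start + 1;
            pthread_mutex_unlock(&delalloc_lock);
    }

    static void delalloc_clear(uint64_t start, uint64_t end)
    {
            uint64_t len = end - start + 1;

            pthread_mutex_lock(&delalloc_lock);
            if (len > delalloc_bytes) {
                    /* underflow: warn and reset instead of wrapping */
                    fprintf(stderr, "warning: delalloc account %llu %llu\n",
                            (unsigned long long)len,
                            (unsigned long long)delalloc_bytes);
                    delalloc_bytes = 0;
            } else {
                    delalloc_bytes -= len;
            }
            pthread_mutex_unlock(&delalloc_lock);
    }

    int main(void)
    {
            delalloc_set(0, 4095);
            delalloc_clear(0, 4095);
            delalloc_clear(0, 4095);        /* triggers the warning path */
            return 0;
    }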
1216 | |||
1217 | /* | ||
1218 | * extent_io.c merge_bio_hook, this must check the chunk tree to make sure | ||
1219 | * we don't create bios that span stripes or chunks | ||
1220 | */ | ||
1221 | int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | ||
1222 | size_t size, struct bio *bio, | ||
1223 | unsigned long bio_flags) | ||
1224 | { | ||
1225 | struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; | ||
1226 | struct btrfs_mapping_tree *map_tree; | ||
1227 | u64 logical = (u64)bio->bi_sector << 9; | ||
1228 | u64 length = 0; | ||
1229 | u64 map_length; | ||
1230 | int ret; | ||
1231 | |||
1232 | if (bio_flags & EXTENT_BIO_COMPRESSED) | ||
1233 | return 0; | ||
1234 | |||
1235 | length = bio->bi_size; | ||
1236 | map_tree = &root->fs_info->mapping_tree; | ||
1237 | map_length = length; | ||
1238 | ret = btrfs_map_block(map_tree, READ, logical, | ||
1239 | &map_length, NULL, 0); | ||
1240 | |||
1241 | if (map_length < length + size) | ||
1242 | return 1; | ||
1243 | return 0; | ||
1244 | } | ||
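btrfs_merge_bio_hook() rejects a merge whenever the bio's current size plus the incoming page would extend past what btrfs_map_block() can map contiguously, i.e. past the end of the current stripe or chunk. A toy model with a fixed 64K stripe (the real code consults the chunk tree):

    #include <stdint.h>
    #include <stdio.h>

    #define STRIPE_LEN (64 * 1024ULL)

    /* contiguous bytes mappable from 'logical' in this toy layout */
    static uint64_t map_length(uint64_t logical)
    {
            return STRIPE_LEN - (logical % STRIPE_LEN);
    }

    /* 1 means "don't merge": the grown bio would span a boundary */
    static int must_split(uint64_t logical, uint64_t bio_size, uint64_t size)
    {
            return map_length(logical) < bio_size + size;
    }

    int main(void)
    {
            printf("%d\n", must_split(60 * 1024, 0, 4096));    /* 0: fits */
            printf("%d\n", must_split(60 * 1024, 4096, 4096)); /* 1: split */
            return 0;
    }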
1245 | |||
1246 | /* | ||
1247 | * in order to insert checksums into the metadata in large chunks, | ||
1248 | * we wait until bio submission time. All the pages in the bio are | ||
1249 | * checksummed and sums are attached onto the ordered extent record. | ||
1250 | * | ||
1251 | * At IO completion time the csums attached to the ordered extent record | ||
1252 | * are inserted into the btree | ||
1253 | */ | ||
1254 | static int __btrfs_submit_bio_start(struct inode *inode, int rw, | ||
1255 | struct bio *bio, int mirror_num, | ||
1256 | unsigned long bio_flags) | ||
1257 | { | ||
1258 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1259 | int ret = 0; | ||
1260 | |||
1261 | ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); | ||
1262 | BUG_ON(ret); | ||
1263 | return 0; | ||
1264 | } | ||
1265 | |||
1266 | /* | ||
1267 | * in order to insert checksums into the metadata in large chunks, | ||
1268 | * we wait until bio submission time. All the pages in the bio are | ||
1269 | * checksummed and sums are attached onto the ordered extent record. | ||
1270 | * | ||
1271 | * This hook runs after the csums have been attached; it maps the bio | ||
1272 | * to the device and submits the actual IO. | ||
1273 | */ | ||
1274 | static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, | ||
1275 | int mirror_num, unsigned long bio_flags) | ||
1276 | { | ||
1277 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1278 | return btrfs_map_bio(root, rw, bio, mirror_num, 1); | ||
1279 | } | ||
1280 | |||
1281 | /* | ||
1282 | * extent_io.c submission hook. This does the right thing for csum calculation | ||
1283 | * on write, or reading the csums from the tree before a read | ||
1284 | */ | ||
1285 | static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | ||
1286 | int mirror_num, unsigned long bio_flags) | ||
1287 | { | ||
1288 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1289 | int ret = 0; | ||
1290 | int skip_sum; | ||
1291 | |||
1292 | skip_sum = btrfs_test_flag(inode, NODATASUM); | ||
1293 | |||
1294 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); | ||
1295 | BUG_ON(ret); | ||
1296 | |||
1297 | if (!(rw & (1 << BIO_RW))) { | ||
1298 | if (bio_flags & EXTENT_BIO_COMPRESSED) { | ||
1299 | return btrfs_submit_compressed_read(inode, bio, | ||
1300 | mirror_num, bio_flags); | ||
1301 | } else if (!skip_sum) | ||
1302 | btrfs_lookup_bio_sums(root, inode, bio, NULL); | ||
1303 | goto mapit; | ||
1304 | } else if (!skip_sum) { | ||
1305 | /* csum items have already been cloned */ | ||
1306 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) | ||
1307 | goto mapit; | ||
1308 | /* we're doing a write, do the async checksumming */ | ||
1309 | return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, | ||
1310 | inode, rw, bio, mirror_num, | ||
1311 | bio_flags, __btrfs_submit_bio_start, | ||
1312 | __btrfs_submit_bio_done); | ||
1313 | } | ||
1314 | |||
1315 | mapit: | ||
1316 | return btrfs_map_bio(root, rw, bio, mirror_num, 0); | ||
1317 | } | ||
1318 | |||
1319 | /* | ||
1320 | * given a list of ordered sums, record them in the inode. This happens | ||
1321 | * at IO completion time based on sums calculated at bio submission time. | ||
1322 | */ | ||
1323 | static noinline int add_pending_csums(struct btrfs_trans_handle *trans, | ||
1324 | struct inode *inode, u64 file_offset, | ||
1325 | struct list_head *list) | ||
1326 | { | ||
1327 | struct list_head *cur; | ||
1328 | struct btrfs_ordered_sum *sum; | ||
1329 | |||
1330 | btrfs_set_trans_block_group(trans, inode); | ||
1331 | list_for_each(cur, list) { | ||
1332 | sum = list_entry(cur, struct btrfs_ordered_sum, list); | ||
1333 | btrfs_csum_file_blocks(trans, | ||
1334 | BTRFS_I(inode)->root->fs_info->csum_root, sum); | ||
1335 | } | ||
1336 | return 0; | ||
1337 | } | ||
1338 | |||
1339 | int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) | ||
1340 | { | ||
1341 | if ((end & (PAGE_CACHE_SIZE - 1)) == 0) | ||
1342 | WARN_ON(1); | ||
1343 | return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, | ||
1344 | GFP_NOFS); | ||
1345 | } | ||
1346 | |||
1347 | /* see btrfs_writepage_start_hook for details on why this is required */ | ||
1348 | struct btrfs_writepage_fixup { | ||
1349 | struct page *page; | ||
1350 | struct btrfs_work work; | ||
1351 | }; | ||
1352 | |||
1353 | static void btrfs_writepage_fixup_worker(struct btrfs_work *work) | ||
1354 | { | ||
1355 | struct btrfs_writepage_fixup *fixup; | ||
1356 | struct btrfs_ordered_extent *ordered; | ||
1357 | struct page *page; | ||
1358 | struct inode *inode; | ||
1359 | u64 page_start; | ||
1360 | u64 page_end; | ||
1361 | |||
1362 | fixup = container_of(work, struct btrfs_writepage_fixup, work); | ||
1363 | page = fixup->page; | ||
1364 | again: | ||
1365 | lock_page(page); | ||
1366 | if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { | ||
1367 | ClearPageChecked(page); | ||
1368 | goto out_page; | ||
1369 | } | ||
1370 | |||
1371 | inode = page->mapping->host; | ||
1372 | page_start = page_offset(page); | ||
1373 | page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; | ||
1374 | |||
1375 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); | ||
1376 | |||
1377 | /* already ordered? We're done */ | ||
1378 | if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, | ||
1379 | EXTENT_ORDERED, 0)) { | ||
1380 | goto out; | ||
1381 | } | ||
1382 | |||
1383 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | ||
1384 | if (ordered) { | ||
1385 | unlock_extent(&BTRFS_I(inode)->io_tree, page_start, | ||
1386 | page_end, GFP_NOFS); | ||
1387 | unlock_page(page); | ||
1388 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
1389 | goto again; | ||
1390 | } | ||
1391 | |||
1392 | btrfs_set_extent_delalloc(inode, page_start, page_end); | ||
1393 | ClearPageChecked(page); | ||
1394 | out: | ||
1395 | unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); | ||
1396 | out_page: | ||
1397 | unlock_page(page); | ||
1398 | page_cache_release(page); | ||
1399 | } | ||
1400 | |||
1401 | /* | ||
1402 | * There are a few paths in the higher layers of the kernel that directly | ||
1403 | * set the page dirty bit without asking the filesystem if it is a | ||
1404 | * good idea. This causes problems because we want to make sure COW | ||
1405 | * properly happens and the data=ordered rules are followed. | ||
1406 | * | ||
1407 | * In our case any range that doesn't have the ORDERED bit set | ||
1408 | * hasn't been properly set up for IO. We kick off an async process | ||
1409 | * to fix it up. The async helper will wait for ordered extents, set | ||
1410 | * the delalloc bit and make it safe to write the page. | ||
1411 | */ | ||
1412 | static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) | ||
1413 | { | ||
1414 | struct inode *inode = page->mapping->host; | ||
1415 | struct btrfs_writepage_fixup *fixup; | ||
1416 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1417 | int ret; | ||
1418 | |||
1419 | ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, | ||
1420 | EXTENT_ORDERED, 0); | ||
1421 | if (ret) | ||
1422 | return 0; | ||
1423 | |||
1424 | if (PageChecked(page)) | ||
1425 | return -EAGAIN; | ||
1426 | |||
1427 | fixup = kzalloc(sizeof(*fixup), GFP_NOFS); | ||
1428 | if (!fixup) | ||
1429 | return -EAGAIN; | ||
1430 | |||
1431 | SetPageChecked(page); | ||
1432 | page_cache_get(page); | ||
1433 | fixup->work.func = btrfs_writepage_fixup_worker; | ||
1434 | fixup->page = page; | ||
1435 | btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); | ||
1436 | return -EAGAIN; | ||
1437 | } | ||
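The -EAGAIN protocol above only works if the writepage path treats it as "hand the page to the fixup worker": the caller is expected to redirty the page, unlock it, and report success so writeback retries later. A userspace model of that handshake (start_hook() and the page flag are stand-ins; the hook's real consumer lives in extent_io.c):

    #include <stdio.h>

    #define EAGAIN 11

    static int start_hook(int page_checked)
    {
            /* PageChecked set: a fixup is already queued for this page */
            return page_checked ? -EAGAIN : 0;
    }

    static int writepage(int page_checked)
    {
            if (start_hook(page_checked) == -EAGAIN) {
                    /* fixup worker owns the page now: redirty, unlock,
                     * and report success so writeback retries later */
                    printf("deferred to fixup worker\n");
                    return 0;
            }
            printf("page written\n");
            return 0;
    }

    int main(void)
    {
            writepage(0);
            writepage(1);
            return 0;
    }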
1438 | |||
1439 | static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, | ||
1440 | struct inode *inode, u64 file_pos, | ||
1441 | u64 disk_bytenr, u64 disk_num_bytes, | ||
1442 | u64 num_bytes, u64 ram_bytes, | ||
1443 | u8 compression, u8 encryption, | ||
1444 | u16 other_encoding, int extent_type) | ||
1445 | { | ||
1446 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1447 | struct btrfs_file_extent_item *fi; | ||
1448 | struct btrfs_path *path; | ||
1449 | struct extent_buffer *leaf; | ||
1450 | struct btrfs_key ins; | ||
1451 | u64 hint; | ||
1452 | int ret; | ||
1453 | |||
1454 | path = btrfs_alloc_path(); | ||
1455 | BUG_ON(!path); | ||
1456 | |||
1457 | ret = btrfs_drop_extents(trans, root, inode, file_pos, | ||
1458 | file_pos + num_bytes, file_pos, &hint); | ||
1459 | BUG_ON(ret); | ||
1460 | |||
1461 | ins.objectid = inode->i_ino; | ||
1462 | ins.offset = file_pos; | ||
1463 | ins.type = BTRFS_EXTENT_DATA_KEY; | ||
1464 | ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); | ||
1465 | BUG_ON(ret); | ||
1466 | leaf = path->nodes[0]; | ||
1467 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
1468 | struct btrfs_file_extent_item); | ||
1469 | btrfs_set_file_extent_generation(leaf, fi, trans->transid); | ||
1470 | btrfs_set_file_extent_type(leaf, fi, extent_type); | ||
1471 | btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); | ||
1472 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); | ||
1473 | btrfs_set_file_extent_offset(leaf, fi, 0); | ||
1474 | btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); | ||
1475 | btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); | ||
1476 | btrfs_set_file_extent_compression(leaf, fi, compression); | ||
1477 | btrfs_set_file_extent_encryption(leaf, fi, encryption); | ||
1478 | btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); | ||
1479 | btrfs_mark_buffer_dirty(leaf); | ||
1480 | |||
1481 | inode_add_bytes(inode, num_bytes); | ||
1482 | btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); | ||
1483 | |||
1484 | ins.objectid = disk_bytenr; | ||
1485 | ins.offset = disk_num_bytes; | ||
1486 | ins.type = BTRFS_EXTENT_ITEM_KEY; | ||
1487 | ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, | ||
1488 | root->root_key.objectid, | ||
1489 | trans->transid, inode->i_ino, &ins); | ||
1490 | BUG_ON(ret); | ||
1491 | |||
1492 | btrfs_free_path(path); | ||
1493 | return 0; | ||
1494 | } | ||
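insert_reserved_file_extent() builds two keys: the file extent item is indexed by (inode number, BTRFS_EXTENT_DATA_KEY, file offset) in the fs tree, and the backing allocation by (disk bytenr, BTRFS_EXTENT_ITEM_KEY, disk length) in the extent tree. Laid out as a sketch, with struct btrfs_key being the usual objectid/type/offset triple and the values mirroring the two 'ins' keys above:

    struct btrfs_key file_key = {
            .objectid = inode->i_ino,           /* which file */
            .type     = BTRFS_EXTENT_DATA_KEY,
            .offset   = file_pos,               /* byte offset in the file */
    };

    struct btrfs_key extent_key = {
            .objectid = disk_bytenr,            /* extent start on disk */
            .type     = BTRFS_EXTENT_ITEM_KEY,
            .offset   = disk_num_bytes,         /* extent length */
    };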
1495 | |||
1496 | /* as ordered data IO finishes, this gets called so we can finish | ||
1497 | * an ordered extent if the range of bytes in the file it covers is | ||
1498 | * fully written. | ||
1499 | */ | ||
1500 | static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) | ||
1501 | { | ||
1502 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1503 | struct btrfs_trans_handle *trans; | ||
1504 | struct btrfs_ordered_extent *ordered_extent; | ||
1505 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
1506 | int compressed = 0; | ||
1507 | int ret; | ||
1508 | |||
1509 | ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); | ||
1510 | if (!ret) | ||
1511 | return 0; | ||
1512 | |||
1513 | trans = btrfs_join_transaction(root, 1); | ||
1514 | |||
1515 | ordered_extent = btrfs_lookup_ordered_extent(inode, start); | ||
1516 | BUG_ON(!ordered_extent); | ||
1517 | if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) | ||
1518 | goto nocow; | ||
1519 | |||
1520 | lock_extent(io_tree, ordered_extent->file_offset, | ||
1521 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
1522 | GFP_NOFS); | ||
1523 | |||
1524 | if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) | ||
1525 | compressed = 1; | ||
1526 | if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { | ||
1527 | BUG_ON(compressed); | ||
1528 | ret = btrfs_mark_extent_written(trans, root, inode, | ||
1529 | ordered_extent->file_offset, | ||
1530 | ordered_extent->file_offset + | ||
1531 | ordered_extent->len); | ||
1532 | BUG_ON(ret); | ||
1533 | } else { | ||
1534 | ret = insert_reserved_file_extent(trans, inode, | ||
1535 | ordered_extent->file_offset, | ||
1536 | ordered_extent->start, | ||
1537 | ordered_extent->disk_len, | ||
1538 | ordered_extent->len, | ||
1539 | ordered_extent->len, | ||
1540 | compressed, 0, 0, | ||
1541 | BTRFS_FILE_EXTENT_REG); | ||
1542 | BUG_ON(ret); | ||
1543 | } | ||
1544 | unlock_extent(io_tree, ordered_extent->file_offset, | ||
1545 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
1546 | GFP_NOFS); | ||
1547 | nocow: | ||
1548 | add_pending_csums(trans, inode, ordered_extent->file_offset, | ||
1549 | &ordered_extent->list); | ||
1550 | |||
1551 | mutex_lock(&BTRFS_I(inode)->extent_mutex); | ||
1552 | btrfs_ordered_update_i_size(inode, ordered_extent); | ||
1553 | btrfs_update_inode(trans, root, inode); | ||
1554 | btrfs_remove_ordered_extent(inode, ordered_extent); | ||
1555 | mutex_unlock(&BTRFS_I(inode)->extent_mutex); | ||
1556 | |||
1557 | /* once for us */ | ||
1558 | btrfs_put_ordered_extent(ordered_extent); | ||
1559 | /* once for the tree */ | ||
1560 | btrfs_put_ordered_extent(ordered_extent); | ||
1561 | |||
1562 | btrfs_end_transaction(trans, root); | ||
1563 | return 0; | ||
1564 | } | ||
1565 | |||
1566 | static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | ||
1567 | struct extent_state *state, int uptodate) | ||
1568 | { | ||
1569 | return btrfs_finish_ordered_io(page->mapping->host, start, end); | ||
1570 | } | ||
1571 | |||
1572 | /* | ||
1573 | * When IO fails, either with EIO or a csum verification failure, we | ||
1574 | * try other mirrors that might have a good copy of the data. This | ||
1575 | * io_failure_record is used to record state as we go through all the | ||
1576 | * mirrors. If another mirror has good data, the page is set up to date | ||
1577 | * and things continue. If a good mirror can't be found, the original | ||
1578 | * bio end_io callback is called to indicate things have failed. | ||
1579 | */ | ||
1580 | struct io_failure_record { | ||
1581 | struct page *page; | ||
1582 | u64 start; | ||
1583 | u64 len; | ||
1584 | u64 logical; | ||
1585 | unsigned long bio_flags; | ||
1586 | int last_mirror; | ||
1587 | }; | ||
1588 | |||
1589 | static int btrfs_io_failed_hook(struct bio *failed_bio, | ||
1590 | struct page *page, u64 start, u64 end, | ||
1591 | struct extent_state *state) | ||
1592 | { | ||
1593 | struct io_failure_record *failrec = NULL; | ||
1594 | u64 private; | ||
1595 | struct extent_map *em; | ||
1596 | struct inode *inode = page->mapping->host; | ||
1597 | struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; | ||
1598 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
1599 | struct bio *bio; | ||
1600 | int num_copies; | ||
1601 | int ret; | ||
1602 | int rw; | ||
1603 | u64 logical; | ||
1604 | |||
1605 | ret = get_state_private(failure_tree, start, &private); | ||
1606 | if (ret) { | ||
1607 | failrec = kmalloc(sizeof(*failrec), GFP_NOFS); | ||
1608 | if (!failrec) | ||
1609 | return -ENOMEM; | ||
1610 | failrec->start = start; | ||
1611 | failrec->len = end - start + 1; | ||
1612 | failrec->last_mirror = 0; | ||
1613 | failrec->bio_flags = 0; | ||
1614 | |||
1615 | spin_lock(&em_tree->lock); | ||
1616 | em = lookup_extent_mapping(em_tree, start, failrec->len); | ||
1617 | if (em && (em->start > start || em->start + em->len < start)) { | ||
1618 | free_extent_map(em); | ||
1619 | em = NULL; | ||
1620 | } | ||
1621 | spin_unlock(&em_tree->lock); | ||
1622 | |||
1623 | if (!em || IS_ERR(em)) { | ||
1624 | kfree(failrec); | ||
1625 | return -EIO; | ||
1626 | } | ||
1627 | logical = start - em->start; | ||
1628 | logical = em->block_start + logical; | ||
1629 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { | ||
1630 | logical = em->block_start; | ||
1631 | failrec->bio_flags = EXTENT_BIO_COMPRESSED; | ||
1632 | } | ||
1633 | failrec->logical = logical; | ||
1634 | free_extent_map(em); | ||
1635 | set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | | ||
1636 | EXTENT_DIRTY, GFP_NOFS); | ||
1637 | set_state_private(failure_tree, start, | ||
1638 | (u64)(unsigned long)failrec); | ||
1639 | } else { | ||
1640 | failrec = (struct io_failure_record *)(unsigned long)private; | ||
1641 | } | ||
1642 | num_copies = btrfs_num_copies( | ||
1643 | &BTRFS_I(inode)->root->fs_info->mapping_tree, | ||
1644 | failrec->logical, failrec->len); | ||
1645 | failrec->last_mirror++; | ||
1646 | if (!state) { | ||
1647 | spin_lock(&BTRFS_I(inode)->io_tree.lock); | ||
1648 | state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, | ||
1649 | failrec->start, | ||
1650 | EXTENT_LOCKED); | ||
1651 | if (state && state->start != failrec->start) | ||
1652 | state = NULL; | ||
1653 | spin_unlock(&BTRFS_I(inode)->io_tree.lock); | ||
1654 | } | ||
1655 | if (!state || failrec->last_mirror > num_copies) { | ||
1656 | set_state_private(failure_tree, failrec->start, 0); | ||
1657 | clear_extent_bits(failure_tree, failrec->start, | ||
1658 | failrec->start + failrec->len - 1, | ||
1659 | EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); | ||
1660 | kfree(failrec); | ||
1661 | return -EIO; | ||
1662 | } | ||
1663 | bio = bio_alloc(GFP_NOFS, 1); | ||
1664 | bio->bi_private = state; | ||
1665 | bio->bi_end_io = failed_bio->bi_end_io; | ||
1666 | bio->bi_sector = failrec->logical >> 9; | ||
1667 | bio->bi_bdev = failed_bio->bi_bdev; | ||
1668 | bio->bi_size = 0; | ||
1669 | |||
1670 | bio_add_page(bio, page, failrec->len, start - page_offset(page)); | ||
1671 | if (failed_bio->bi_rw & (1 << BIO_RW)) | ||
1672 | rw = WRITE; | ||
1673 | else | ||
1674 | rw = READ; | ||
1675 | |||
1676 | BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, | ||
1677 | failrec->last_mirror, | ||
1678 | failrec->bio_flags); | ||
1679 | return 0; | ||
1680 | } | ||
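btrfs_io_failed_hook() bumps failrec->last_mirror on every failure and gives up once the count exceeds btrfs_num_copies(), so with two copies each mirror is tried exactly once. A userspace model of that retry policy (read_mirror() is hypothetical; only the give-up rule is the point):

    #include <stdio.h>

    #define EIO 5

    /* hypothetical: pretend only mirror 2 holds a good copy */
    static int read_mirror(int mirror)
    {
            return mirror == 2 ? 0 : -EIO;
    }

    static int read_with_retries(int num_copies)
    {
            int last_mirror = 0;

            /* mirrors are numbered from 1; each gets one attempt */
            while (++last_mirror <= num_copies) {
                    if (read_mirror(last_mirror) == 0) {
                            printf("mirror %d ok\n", last_mirror);
                            return 0;
                    }
            }
            return -EIO;    /* every copy failed */
    }

    int main(void)
    {
            return read_with_retries(2) ? 1 : 0;
    }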
1681 | |||
1682 | /* | ||
1683 | * each time an IO finishes, we do a fast check in the IO failure tree | ||
1684 | * to see if we need to process or clean up an io_failure_record | ||
1685 | */ | ||
1686 | static int btrfs_clean_io_failures(struct inode *inode, u64 start) | ||
1687 | { | ||
1688 | u64 private; | ||
1689 | u64 private_failure; | ||
1690 | struct io_failure_record *failure; | ||
1691 | int ret; | ||
1692 | |||
1693 | private = 0; | ||
1694 | if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, | ||
1695 | (u64)-1, 1, EXTENT_DIRTY)) { | ||
1696 | ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, | ||
1697 | start, &private_failure); | ||
1698 | if (ret == 0) { | ||
1699 | failure = (struct io_failure_record *)(unsigned long) | ||
1700 | private_failure; | ||
1701 | set_state_private(&BTRFS_I(inode)->io_failure_tree, | ||
1702 | failure->start, 0); | ||
1703 | clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, | ||
1704 | failure->start, | ||
1705 | failure->start + failure->len - 1, | ||
1706 | EXTENT_DIRTY | EXTENT_LOCKED, | ||
1707 | GFP_NOFS); | ||
1708 | kfree(failure); | ||
1709 | } | ||
1710 | } | ||
1711 | return 0; | ||
1712 | } | ||
1713 | |||
1714 | /* | ||
1715 | * when reads are done, we need to check csums to verify the data is | ||
1716 | * correct. If there's a match, we allow the bio to finish. If not, we go | ||
1717 | * through the io_failure_record routines to find good copies | ||
1718 | */ | ||
1719 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | ||
1720 | struct extent_state *state) | ||
1721 | { | ||
1722 | size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); | ||
1723 | struct inode *inode = page->mapping->host; | ||
1724 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
1725 | char *kaddr; | ||
1726 | u64 private = ~(u32)0; | ||
1727 | int ret; | ||
1728 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1729 | u32 csum = ~(u32)0; | ||
1730 | |||
1731 | if (PageChecked(page)) { | ||
1732 | ClearPageChecked(page); | ||
1733 | goto good; | ||
1734 | } | ||
1735 | if (btrfs_test_flag(inode, NODATASUM)) | ||
1736 | return 0; | ||
1737 | |||
1738 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && | ||
1739 | test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { | ||
1740 | clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, | ||
1741 | GFP_NOFS); | ||
1742 | return 0; | ||
1743 | } | ||
1744 | |||
1745 | if (state && state->start == start) { | ||
1746 | private = state->private; | ||
1747 | ret = 0; | ||
1748 | } else { | ||
1749 | ret = get_state_private(io_tree, start, &private); | ||
1750 | } | ||
1751 | kaddr = kmap_atomic(page, KM_USER0); | ||
1752 | if (ret) | ||
1753 | goto zeroit; | ||
1754 | |||
1755 | csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); | ||
1756 | btrfs_csum_final(csum, (char *)&csum); | ||
1757 | if (csum != private) | ||
1758 | goto zeroit; | ||
1759 | |||
1760 | kunmap_atomic(kaddr, KM_USER0); | ||
1761 | good: | ||
1762 | /* if the io failure tree for this inode is non-empty, | ||
1763 | * check to see if we've recovered from a failed IO | ||
1764 | */ | ||
1765 | btrfs_clean_io_failures(inode, start); | ||
1766 | return 0; | ||
1767 | |||
1768 | zeroit: | ||
1769 | printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " | ||
1770 | "private %llu\n", page->mapping->host->i_ino, | ||
1771 | (unsigned long long)start, csum, | ||
1772 | (unsigned long long)private); | ||
1773 | memset(kaddr + offset, 1, end - start + 1); | ||
1774 | flush_dcache_page(page); | ||
1775 | kunmap_atomic(kaddr, KM_USER0); | ||
1776 | if (private == 0) | ||
1777 | return 0; | ||
1778 | return -EIO; | ||
1779 | } | ||
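btrfs_csum_data()/btrfs_csum_final() here compute a crc32c over the page contents, seeded with ~0 (the initial value of `csum` above) and inverted at the end, then compare the result against the value stashed in the extent state private field. A userspace sketch of just the verify step; crc32c() is an assumed external helper with a (seed, data, len) shape, not a specific library API:

    #include <stddef.h>
    #include <stdint.h>

    uint32_t crc32c(uint32_t seed, const void *data, size_t len); /* assumed */

    static int verify_block(const void *data, size_t len, uint32_t stored)
    {
            uint32_t csum = crc32c(~(uint32_t)0, data, len);

            csum = ~csum;                   /* the btrfs_csum_final() step */
            return csum == stored ? 0 : -5; /* -EIO on mismatch */
    }

On a mismatch the kernel path above additionally memsets the page before failing, so stale or wrong data is never exposed to the reader.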
1780 | |||
1781 | /* | ||
1782 | * This creates an orphan entry for the given inode in case something goes | ||
1783 | * wrong in the middle of an unlink/truncate. | ||
1784 | */ | ||
1785 | int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) | ||
1786 | { | ||
1787 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1788 | int ret = 0; | ||
1789 | |||
1790 | spin_lock(&root->list_lock); | ||
1791 | |||
1792 | /* already on the orphan list, we're good */ | ||
1793 | if (!list_empty(&BTRFS_I(inode)->i_orphan)) { | ||
1794 | spin_unlock(&root->list_lock); | ||
1795 | return 0; | ||
1796 | } | ||
1797 | |||
1798 | list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); | ||
1799 | |||
1800 | spin_unlock(&root->list_lock); | ||
1801 | |||
1802 | /* | ||
1803 | * insert an orphan item to track this unlinked/truncated file | ||
1804 | */ | ||
1805 | ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); | ||
1806 | |||
1807 | return ret; | ||
1808 | } | ||
1809 | |||
1810 | /* | ||
1811 | * We have done the truncate/delete so we can go ahead and remove the orphan | ||
1812 | * item for this particular inode. | ||
1813 | */ | ||
1814 | int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) | ||
1815 | { | ||
1816 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1817 | int ret = 0; | ||
1818 | |||
1819 | spin_lock(&root->list_lock); | ||
1820 | |||
1821 | if (list_empty(&BTRFS_I(inode)->i_orphan)) { | ||
1822 | spin_unlock(&root->list_lock); | ||
1823 | return 0; | ||
1824 | } | ||
1825 | |||
1826 | list_del_init(&BTRFS_I(inode)->i_orphan); | ||
1827 | if (!trans) { | ||
1828 | spin_unlock(&root->list_lock); | ||
1829 | return 0; | ||
1830 | } | ||
1831 | |||
1832 | spin_unlock(&root->list_lock); | ||
1833 | |||
1834 | ret = btrfs_del_orphan_item(trans, root, inode->i_ino); | ||
1835 | |||
1836 | return ret; | ||
1837 | } | ||
1838 | |||
1839 | /* | ||
1840 | * this cleans up any orphans that may be left on the list from the last use | ||
1841 | * of this root. | ||
1842 | */ | ||
1843 | void btrfs_orphan_cleanup(struct btrfs_root *root) | ||
1844 | { | ||
1845 | struct btrfs_path *path; | ||
1846 | struct extent_buffer *leaf; | ||
1847 | struct btrfs_item *item; | ||
1848 | struct btrfs_key key, found_key; | ||
1849 | struct btrfs_trans_handle *trans; | ||
1850 | struct inode *inode; | ||
1851 | int ret = 0, nr_unlink = 0, nr_truncate = 0; | ||
1852 | |||
1853 | path = btrfs_alloc_path(); | ||
1854 | if (!path) | ||
1855 | return; | ||
1856 | path->reada = -1; | ||
1857 | |||
1858 | key.objectid = BTRFS_ORPHAN_OBJECTID; | ||
1859 | btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); | ||
1860 | key.offset = (u64)-1; | ||
1861 | |||
1862 | |||
1863 | while (1) { | ||
1864 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
1865 | if (ret < 0) { | ||
1866 | printk(KERN_ERR "Error searching slot for orphan: %d" | ||
1867 | "\n", ret); | ||
1868 | break; | ||
1869 | } | ||
1870 | |||
1871 | /* | ||
1872 | * ret == 0 means we found what we were searching for, which | ||
1873 | * is weird, but possible, so only screw with the path if we didn't | ||
1874 | * find the key and see if we have stuff that matches | ||
1875 | */ | ||
1876 | if (ret > 0) { | ||
1877 | if (path->slots[0] == 0) | ||
1878 | break; | ||
1879 | path->slots[0]--; | ||
1880 | } | ||
1881 | |||
1882 | /* pull out the item */ | ||
1883 | leaf = path->nodes[0]; | ||
1884 | item = btrfs_item_nr(leaf, path->slots[0]); | ||
1885 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
1886 | |||
1887 | /* make sure the item matches what we want */ | ||
1888 | if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) | ||
1889 | break; | ||
1890 | if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) | ||
1891 | break; | ||
1892 | |||
1893 | /* release the path since we're done with it */ | ||
1894 | btrfs_release_path(root, path); | ||
1895 | |||
1896 | /* | ||
1897 | * this is where we basically do a btrfs_lookup, without the | ||
1898 | * crossing root thing. we store the inode number in the | ||
1899 | * offset of the orphan item. | ||
1900 | */ | ||
1901 | inode = btrfs_iget_locked(root->fs_info->sb, | ||
1902 | found_key.offset, root); | ||
1903 | if (!inode) | ||
1904 | break; | ||
1905 | |||
1906 | if (inode->i_state & I_NEW) { | ||
1907 | BTRFS_I(inode)->root = root; | ||
1908 | |||
1909 | /* have to set the location manually */ | ||
1910 | BTRFS_I(inode)->location.objectid = inode->i_ino; | ||
1911 | BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; | ||
1912 | BTRFS_I(inode)->location.offset = 0; | ||
1913 | |||
1914 | btrfs_read_locked_inode(inode); | ||
1915 | unlock_new_inode(inode); | ||
1916 | } | ||
1917 | |||
1918 | /* | ||
1919 | * add this inode to the orphan list so btrfs_orphan_del does | ||
1920 | * the proper thing when we hit it | ||
1921 | */ | ||
1922 | spin_lock(&root->list_lock); | ||
1923 | list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); | ||
1924 | spin_unlock(&root->list_lock); | ||
1925 | |||
1926 | /* | ||
1927 | * if this is a bad inode, it means we actually succeeded in | ||
1928 | * removing the inode, but not the orphan record, which means | ||
1929 | * we need to manually delete the orphan since iput will just | ||
1930 | * do a destroy_inode | ||
1931 | */ | ||
1932 | if (is_bad_inode(inode)) { | ||
1933 | trans = btrfs_start_transaction(root, 1); | ||
1934 | btrfs_orphan_del(trans, inode); | ||
1935 | btrfs_end_transaction(trans, root); | ||
1936 | iput(inode); | ||
1937 | continue; | ||
1938 | } | ||
1939 | |||
1940 | /* if we have links, this was a truncate, let's do that */ | ||
1941 | if (inode->i_nlink) { | ||
1942 | nr_truncate++; | ||
1943 | btrfs_truncate(inode); | ||
1944 | } else { | ||
1945 | nr_unlink++; | ||
1946 | } | ||
1947 | |||
1948 | /* this will do delete_inode and everything for us */ | ||
1949 | iput(inode); | ||
1950 | } | ||
1951 | |||
1952 | if (nr_unlink) | ||
1953 | printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); | ||
1954 | if (nr_truncate) | ||
1955 | printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); | ||
1956 | |||
1957 | btrfs_free_path(path); | ||
1958 | } | ||
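The scan above works because every orphan item lives under a single well-known objectid with the orphaned inode's number stored in the key offset, so searching from offset (u64)-1 and walking backwards enumerates them all. The key, as a sketch (inode_num is a placeholder for the orphaned inode's number):

    struct btrfs_key orphan_key = {
            .objectid = BTRFS_ORPHAN_OBJECTID,  /* shared by all orphans */
            .type     = BTRFS_ORPHAN_ITEM_KEY,
            .offset   = inode_num,              /* the orphaned inode */
    };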
1959 | |||
1960 | /* | ||
1961 | * read an inode from the btree into the in-memory inode | ||
1962 | */ | ||
1963 | void btrfs_read_locked_inode(struct inode *inode) | ||
1964 | { | ||
1965 | struct btrfs_path *path; | ||
1966 | struct extent_buffer *leaf; | ||
1967 | struct btrfs_inode_item *inode_item; | ||
1968 | struct btrfs_timespec *tspec; | ||
1969 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1970 | struct btrfs_key location; | ||
1971 | u64 alloc_group_block; | ||
1972 | u32 rdev; | ||
1973 | int ret; | ||
1974 | |||
1975 | path = btrfs_alloc_path(); | ||
1976 | BUG_ON(!path); | ||
1977 | memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); | ||
1978 | |||
1979 | ret = btrfs_lookup_inode(NULL, root, path, &location, 0); | ||
1980 | if (ret) | ||
1981 | goto make_bad; | ||
1982 | |||
1983 | leaf = path->nodes[0]; | ||
1984 | inode_item = btrfs_item_ptr(leaf, path->slots[0], | ||
1985 | struct btrfs_inode_item); | ||
1986 | |||
1987 | inode->i_mode = btrfs_inode_mode(leaf, inode_item); | ||
1988 | inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); | ||
1989 | inode->i_uid = btrfs_inode_uid(leaf, inode_item); | ||
1990 | inode->i_gid = btrfs_inode_gid(leaf, inode_item); | ||
1991 | btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); | ||
1992 | |||
1993 | tspec = btrfs_inode_atime(inode_item); | ||
1994 | inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); | ||
1995 | inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); | ||
1996 | |||
1997 | tspec = btrfs_inode_mtime(inode_item); | ||
1998 | inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); | ||
1999 | inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); | ||
2000 | |||
2001 | tspec = btrfs_inode_ctime(inode_item); | ||
2002 | inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); | ||
2003 | inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); | ||
2004 | |||
2005 | inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); | ||
2006 | BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); | ||
2007 | BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); | ||
2008 | inode->i_generation = BTRFS_I(inode)->generation; | ||
2009 | inode->i_rdev = 0; | ||
2010 | rdev = btrfs_inode_rdev(leaf, inode_item); | ||
2011 | |||
2012 | BTRFS_I(inode)->index_cnt = (u64)-1; | ||
2013 | BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); | ||
2014 | |||
2015 | alloc_group_block = btrfs_inode_block_group(leaf, inode_item); | ||
2016 | BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, | ||
2017 | alloc_group_block, 0); | ||
2018 | btrfs_free_path(path); | ||
2019 | inode_item = NULL; | ||
2020 | |||
2021 | switch (inode->i_mode & S_IFMT) { | ||
2022 | case S_IFREG: | ||
2023 | inode->i_mapping->a_ops = &btrfs_aops; | ||
2024 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
2025 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | ||
2026 | inode->i_fop = &btrfs_file_operations; | ||
2027 | inode->i_op = &btrfs_file_inode_operations; | ||
2028 | break; | ||
2029 | case S_IFDIR: | ||
2030 | inode->i_fop = &btrfs_dir_file_operations; | ||
2031 | if (root == root->fs_info->tree_root) | ||
2032 | inode->i_op = &btrfs_dir_ro_inode_operations; | ||
2033 | else | ||
2034 | inode->i_op = &btrfs_dir_inode_operations; | ||
2035 | break; | ||
2036 | case S_IFLNK: | ||
2037 | inode->i_op = &btrfs_symlink_inode_operations; | ||
2038 | inode->i_mapping->a_ops = &btrfs_symlink_aops; | ||
2039 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
2040 | break; | ||
2041 | default: | ||
2042 | init_special_inode(inode, inode->i_mode, rdev); | ||
2043 | break; | ||
2044 | } | ||
2045 | return; | ||
2046 | |||
2047 | make_bad: | ||
2048 | btrfs_free_path(path); | ||
2049 | make_bad_inode(inode); | ||
2050 | } | ||
2051 | |||
2052 | /* | ||
2053 | * given a leaf and an inode, copy the inode fields into the leaf | ||
2054 | */ | ||
2055 | static void fill_inode_item(struct btrfs_trans_handle *trans, | ||
2056 | struct extent_buffer *leaf, | ||
2057 | struct btrfs_inode_item *item, | ||
2058 | struct inode *inode) | ||
2059 | { | ||
2060 | btrfs_set_inode_uid(leaf, item, inode->i_uid); | ||
2061 | btrfs_set_inode_gid(leaf, item, inode->i_gid); | ||
2062 | btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); | ||
2063 | btrfs_set_inode_mode(leaf, item, inode->i_mode); | ||
2064 | btrfs_set_inode_nlink(leaf, item, inode->i_nlink); | ||
2065 | |||
2066 | btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), | ||
2067 | inode->i_atime.tv_sec); | ||
2068 | btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), | ||
2069 | inode->i_atime.tv_nsec); | ||
2070 | |||
2071 | btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), | ||
2072 | inode->i_mtime.tv_sec); | ||
2073 | btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), | ||
2074 | inode->i_mtime.tv_nsec); | ||
2075 | |||
2076 | btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), | ||
2077 | inode->i_ctime.tv_sec); | ||
2078 | btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), | ||
2079 | inode->i_ctime.tv_nsec); | ||
2080 | |||
2081 | btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); | ||
2082 | btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); | ||
2083 | btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); | ||
2084 | btrfs_set_inode_transid(leaf, item, trans->transid); | ||
2085 | btrfs_set_inode_rdev(leaf, item, inode->i_rdev); | ||
2086 | btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); | ||
2087 | btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); | ||
2088 | } | ||
2089 | |||
2090 | /* | ||
2091 | * copy everything in the in-memory inode into the btree. | ||
2092 | */ | ||
2093 | noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, | ||
2094 | struct btrfs_root *root, struct inode *inode) | ||
2095 | { | ||
2096 | struct btrfs_inode_item *inode_item; | ||
2097 | struct btrfs_path *path; | ||
2098 | struct extent_buffer *leaf; | ||
2099 | int ret; | ||
2100 | |||
2101 | path = btrfs_alloc_path(); | ||
2102 | BUG_ON(!path); | ||
2103 | ret = btrfs_lookup_inode(trans, root, path, | ||
2104 | &BTRFS_I(inode)->location, 1); | ||
2105 | if (ret) { | ||
2106 | if (ret > 0) | ||
2107 | ret = -ENOENT; | ||
2108 | goto failed; | ||
2109 | } | ||
2110 | |||
2111 | leaf = path->nodes[0]; | ||
2112 | inode_item = btrfs_item_ptr(leaf, path->slots[0], | ||
2113 | struct btrfs_inode_item); | ||
2114 | |||
2115 | fill_inode_item(trans, leaf, inode_item, inode); | ||
2116 | btrfs_mark_buffer_dirty(leaf); | ||
2117 | btrfs_set_inode_last_trans(trans, inode); | ||
2118 | ret = 0; | ||
2119 | failed: | ||
2120 | btrfs_free_path(path); | ||
2121 | return ret; | ||
2122 | } | ||
2123 | |||
2124 | |||
2125 | /* | ||
2126 | * unlink helper that gets used here in inode.c and in the tree logging | ||
2127 | * recovery code. It removes a link in a directory with a given name, and | ||
2128 | * also drops the back refs in the inode to the directory | ||
2129 | */ | ||
2130 | int btrfs_unlink_inode(struct btrfs_trans_handle *trans, | ||
2131 | struct btrfs_root *root, | ||
2132 | struct inode *dir, struct inode *inode, | ||
2133 | const char *name, int name_len) | ||
2134 | { | ||
2135 | struct btrfs_path *path; | ||
2136 | int ret = 0; | ||
2137 | struct extent_buffer *leaf; | ||
2138 | struct btrfs_dir_item *di; | ||
2139 | struct btrfs_key key; | ||
2140 | u64 index; | ||
2141 | |||
2142 | path = btrfs_alloc_path(); | ||
2143 | if (!path) { | ||
2144 | ret = -ENOMEM; | ||
2145 | goto err; | ||
2146 | } | ||
2147 | |||
2148 | di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, | ||
2149 | name, name_len, -1); | ||
2150 | if (IS_ERR(di)) { | ||
2151 | ret = PTR_ERR(di); | ||
2152 | goto err; | ||
2153 | } | ||
2154 | if (!di) { | ||
2155 | ret = -ENOENT; | ||
2156 | goto err; | ||
2157 | } | ||
2158 | leaf = path->nodes[0]; | ||
2159 | btrfs_dir_item_key_to_cpu(leaf, di, &key); | ||
2160 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | ||
2161 | if (ret) | ||
2162 | goto err; | ||
2163 | btrfs_release_path(root, path); | ||
2164 | |||
2165 | ret = btrfs_del_inode_ref(trans, root, name, name_len, | ||
2166 | inode->i_ino, | ||
2167 | dir->i_ino, &index); | ||
2168 | if (ret) { | ||
2169 | printk(KERN_INFO "btrfs failed to delete reference to %.*s, " | ||
2170 | "inode %lu parent %lu\n", name_len, name, | ||
2171 | inode->i_ino, dir->i_ino); | ||
2172 | goto err; | ||
2173 | } | ||
2174 | |||
2175 | di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, | ||
2176 | index, name, name_len, -1); | ||
2177 | if (IS_ERR(di)) { | ||
2178 | ret = PTR_ERR(di); | ||
2179 | goto err; | ||
2180 | } | ||
2181 | if (!di) { | ||
2182 | ret = -ENOENT; | ||
2183 | goto err; | ||
2184 | } | ||
2185 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | ||
2186 | btrfs_release_path(root, path); | ||
2187 | |||
2188 | ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, | ||
2189 | inode, dir->i_ino); | ||
2190 | BUG_ON(ret != 0 && ret != -ENOENT); | ||
2191 | if (ret != -ENOENT) | ||
2192 | BTRFS_I(dir)->log_dirty_trans = trans->transid; | ||
2193 | |||
2194 | ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, | ||
2195 | dir, index); | ||
2196 | BUG_ON(ret); | ||
2197 | err: | ||
2198 | btrfs_free_path(path); | ||
2199 | if (ret) | ||
2200 | goto out; | ||
2201 | |||
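| /* a directory's i_size in btrfs is not a byte count of its contents: | ||
| * it is the sum of 2 * name_len over all entries, because every name | ||
| * is stored twice, once in a DIR_ITEM (keyed by name hash) and once | ||
| * in a DIR_INDEX item (keyed by sequence number). Dropping the name | ||
| * "foo" therefore shrinks i_size by 6. | ||
| */ | ||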
2202 | btrfs_i_size_write(dir, dir->i_size - name_len * 2); | ||
2203 | inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; | ||
2204 | btrfs_update_inode(trans, root, dir); | ||
2205 | btrfs_drop_nlink(inode); | ||
2206 | ret = btrfs_update_inode(trans, root, inode); | ||
2207 | dir->i_sb->s_dirt = 1; | ||
2208 | out: | ||
2209 | return ret; | ||
2210 | } | ||
2211 | |||
2212 | static int btrfs_unlink(struct inode *dir, struct dentry *dentry) | ||
2213 | { | ||
2214 | struct btrfs_root *root; | ||
2215 | struct btrfs_trans_handle *trans; | ||
2216 | struct inode *inode = dentry->d_inode; | ||
2217 | int ret; | ||
2218 | unsigned long nr = 0; | ||
2219 | |||
2220 | root = BTRFS_I(dir)->root; | ||
2221 | |||
2222 | ret = btrfs_check_free_space(root, 1, 1); | ||
2223 | if (ret) | ||
2224 | goto fail; | ||
2225 | |||
2226 | trans = btrfs_start_transaction(root, 1); | ||
2227 | |||
2228 | btrfs_set_trans_block_group(trans, dir); | ||
2229 | ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, | ||
2230 | dentry->d_name.name, dentry->d_name.len); | ||
2231 | |||
2232 | if (inode->i_nlink == 0) | ||
2233 | ret = btrfs_orphan_add(trans, inode); | ||
2234 | |||
2235 | nr = trans->blocks_used; | ||
2236 | |||
2237 | btrfs_end_transaction_throttle(trans, root); | ||
2238 | fail: | ||
2239 | btrfs_btree_balance_dirty(root, nr); | ||
2240 | return ret; | ||
2241 | } | ||
2242 | |||
2243 | static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) | ||
2244 | { | ||
2245 | struct inode *inode = dentry->d_inode; | ||
2246 | int err = 0; | ||
2247 | int ret; | ||
2248 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
2249 | struct btrfs_trans_handle *trans; | ||
2250 | unsigned long nr = 0; | ||
2251 | |||
2252 | /* | ||
2253 | * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir | ||
2254 | * the root of a subvolume or snapshot | ||
2255 | */ | ||
2256 | if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || | ||
2257 | inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { | ||
2258 | return -ENOTEMPTY; | ||
2259 | } | ||
2260 | |||
2261 | ret = btrfs_check_free_space(root, 1, 1); | ||
2262 | if (ret) | ||
2263 | goto fail; | ||
2264 | |||
2265 | trans = btrfs_start_transaction(root, 1); | ||
2266 | btrfs_set_trans_block_group(trans, dir); | ||
2267 | |||
2268 | err = btrfs_orphan_add(trans, inode); | ||
2269 | if (err) | ||
2270 | goto fail_trans; | ||
2271 | |||
2272 | /* now the directory is empty */ | ||
2273 | err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, | ||
2274 | dentry->d_name.name, dentry->d_name.len); | ||
2275 | if (!err) | ||
2276 | btrfs_i_size_write(inode, 0); | ||
2277 | |||
2278 | fail_trans: | ||
2279 | nr = trans->blocks_used; | ||
2280 | ret = btrfs_end_transaction_throttle(trans, root); | ||
2281 | fail: | ||
2282 | btrfs_btree_balance_dirty(root, nr); | ||
2283 | |||
2284 | if (ret && !err) | ||
2285 | err = ret; | ||
2286 | return err; | ||
2287 | } | ||
2288 | |||
2289 | #if 0 | ||
2290 | /* | ||
2291 | * when truncating bytes in a file, it is possible to avoid reading | ||
2292 | * the leaves that contain only checksum items. This can be the | ||
2293 | * majority of the IO required to delete a large file, but it must | ||
2294 | * be done carefully. | ||
2295 | * | ||
2296 | * The keys in the level just above the leaves are checked to make sure | ||
2297 | * the lowest key in a given leaf is a csum key, and starts at an offset | ||
2298 | * after the new size. | ||
2299 | * | ||
2300 | * Then the key for the next leaf is checked to make sure it also has | ||
2301 | * a checksum item for the same file. If it does, we know our target leaf | ||
2302 | * contains only checksum items, and it can be safely freed without reading | ||
2303 | * it. | ||
2304 | * | ||
2305 | * This is just an optimization targeted at large files. It may do | ||
2306 | * nothing. It will return 0 unless things went badly. | ||
2307 | */ | ||
2308 | static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, | ||
2309 | struct btrfs_root *root, | ||
2310 | struct btrfs_path *path, | ||
2311 | struct inode *inode, u64 new_size) | ||
2312 | { | ||
2313 | struct btrfs_key key; | ||
2314 | int ret; | ||
2315 | int nritems; | ||
2316 | struct btrfs_key found_key; | ||
2317 | struct btrfs_key other_key; | ||
2318 | struct btrfs_leaf_ref *ref; | ||
2319 | u64 leaf_gen; | ||
2320 | u64 leaf_start; | ||
2321 | |||
2322 | path->lowest_level = 1; | ||
2323 | key.objectid = inode->i_ino; | ||
2324 | key.type = BTRFS_CSUM_ITEM_KEY; | ||
2325 | key.offset = new_size; | ||
2326 | again: | ||
2327 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
2328 | if (ret < 0) | ||
2329 | goto out; | ||
2330 | |||
2331 | if (path->nodes[1] == NULL) { | ||
2332 | ret = 0; | ||
2333 | goto out; | ||
2334 | } | ||
2335 | ret = 0; | ||
2336 | btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); | ||
2337 | nritems = btrfs_header_nritems(path->nodes[1]); | ||
2338 | |||
2339 | if (!nritems) | ||
2340 | goto out; | ||
2341 | |||
2342 | if (path->slots[1] >= nritems) | ||
2343 | goto next_node; | ||
2344 | |||
2345 | /* did we find a key greater than anything we want to delete? */ | ||
2346 | if (found_key.objectid > inode->i_ino || | ||
2347 | (found_key.objectid == inode->i_ino && found_key.type > key.type)) | ||
2348 | goto out; | ||
2349 | |||
2350 | /* we check the next key in the node to make sure the leaf contains | ||
2351 | * only checksum items. This comparison doesn't work if our | ||
2352 | * leaf is the last one in the node | ||
2353 | */ | ||
2354 | if (path->slots[1] + 1 >= nritems) { | ||
2355 | next_node: | ||
2356 | /* search forward from the last key in the node, this | ||
2357 | * will bring us into the next node in the tree | ||
2358 | */ | ||
2359 | btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); | ||
2360 | |||
2361 | /* unlikely, but we inc below, so check to be safe */ | ||
2362 | if (found_key.offset == (u64)-1) | ||
2363 | goto out; | ||
2364 | |||
2365 | /* search_forward needs a path with locks held, do the | ||
2366 | * search again for the original key. It is possible | ||
2367 | * this will race with a balance and return a path that | ||
2368 | * we could modify, but this drop is just an optimization | ||
2369 | * and is allowed to miss some leaves. | ||
2370 | */ | ||
2371 | btrfs_release_path(root, path); | ||
2372 | found_key.offset++; | ||
2373 | |||
2374 | /* setup a max key for search_forward */ | ||
2375 | other_key.offset = (u64)-1; | ||
2376 | other_key.type = key.type; | ||
2377 | other_key.objectid = key.objectid; | ||
2378 | |||
2379 | path->keep_locks = 1; | ||
2380 | ret = btrfs_search_forward(root, &found_key, &other_key, | ||
2381 | path, 0, 0); | ||
2382 | path->keep_locks = 0; | ||
2383 | if (ret || found_key.objectid != key.objectid || | ||
2384 | found_key.type != key.type) { | ||
2385 | ret = 0; | ||
2386 | goto out; | ||
2387 | } | ||
2388 | |||
2389 | key.offset = found_key.offset; | ||
2390 | btrfs_release_path(root, path); | ||
2391 | cond_resched(); | ||
2392 | goto again; | ||
2393 | } | ||
2394 | |||
2395 | /* we know there's one more slot after us in the tree, | ||
2396 | * read that key so we can verify it is also a checksum item | ||
2397 | */ | ||
2398 | btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); | ||
2399 | |||
2400 | if (found_key.objectid < inode->i_ino) | ||
2401 | goto next_key; | ||
2402 | |||
2403 | if (found_key.type != key.type || found_key.offset < new_size) | ||
2404 | goto next_key; | ||
2405 | |||
2406 | /* | ||
2407 | * if the key for the next leaf isn't a csum key from this objectid, | ||
2408 | * we can't be sure there aren't good items inside this leaf. | ||
2409 | * Bail out | ||
2410 | */ | ||
2411 | if (other_key.objectid != inode->i_ino || other_key.type != key.type) | ||
2412 | goto out; | ||
2413 | |||
2414 | leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); | ||
2415 | leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); | ||
2416 | /* | ||
2417 | * it is safe to delete this leaf, it contains only | ||
2418 | * csum items from this inode at an offset >= new_size | ||
2419 | */ | ||
2420 | ret = btrfs_del_leaf(trans, root, path, leaf_start); | ||
2421 | BUG_ON(ret); | ||
2422 | |||
2423 | if (root->ref_cows && leaf_gen < trans->transid) { | ||
2424 | ref = btrfs_alloc_leaf_ref(root, 0); | ||
2425 | if (ref) { | ||
2426 | ref->root_gen = root->root_key.offset; | ||
2427 | ref->bytenr = leaf_start; | ||
2428 | ref->owner = 0; | ||
2429 | ref->generation = leaf_gen; | ||
2430 | ref->nritems = 0; | ||
2431 | |||
2432 | ret = btrfs_add_leaf_ref(root, ref, 0); | ||
2433 | WARN_ON(ret); | ||
2434 | btrfs_free_leaf_ref(root, ref); | ||
2435 | } else { | ||
2436 | WARN_ON(1); | ||
2437 | } | ||
2438 | } | ||
2439 | next_key: | ||
2440 | btrfs_release_path(root, path); | ||
2441 | |||
2442 | if (other_key.objectid == inode->i_ino && | ||
2443 | other_key.type == key.type && other_key.offset > key.offset) { | ||
2444 | key.offset = other_key.offset; | ||
2445 | cond_resched(); | ||
2446 | goto again; | ||
2447 | } | ||
2448 | ret = 0; | ||
2449 | out: | ||
2450 | /* fixup any changes we've made to the path */ | ||
2451 | path->lowest_level = 0; | ||
2452 | path->keep_locks = 0; | ||
2453 | btrfs_release_path(root, path); | ||
2454 | return ret; | ||
2455 | } | ||
2456 | |||
2457 | #endif | ||
2458 | |||
2459 | /* | ||
2460 | * this can truncate away extent items, csum items and directory items. | ||
2461 | * It starts at a high offset and removes keys until it can't find | ||
2462 | * any higher than new_size | ||
2463 | * | ||
2464 | * csum items that cross the new i_size are truncated to the new size | ||
2465 | * as well. | ||
2466 | * | ||
2467 | * min_type is the minimum key type to truncate down to. If set to 0, this | ||
2468 | * will kill all the items on this inode, including the INODE_ITEM_KEY. | ||
2469 | */ | ||
2470 | noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | ||
2471 | struct btrfs_root *root, | ||
2472 | struct inode *inode, | ||
2473 | u64 new_size, u32 min_type) | ||
2474 | { | ||
2475 | int ret; | ||
2476 | struct btrfs_path *path; | ||
2477 | struct btrfs_key key; | ||
2478 | struct btrfs_key found_key; | ||
2479 | u32 found_type; | ||
2480 | struct extent_buffer *leaf; | ||
2481 | struct btrfs_file_extent_item *fi; | ||
2482 | u64 extent_start = 0; | ||
2483 | u64 extent_num_bytes = 0; | ||
2484 | u64 item_end = 0; | ||
2485 | u64 root_gen = 0; | ||
2486 | u64 root_owner = 0; | ||
2487 | int found_extent; | ||
2488 | int del_item; | ||
2489 | int pending_del_nr = 0; | ||
2490 | int pending_del_slot = 0; | ||
2491 | int extent_type = -1; | ||
2492 | int encoding; | ||
2493 | u64 mask = root->sectorsize - 1; | ||
2494 | |||
2495 | if (root->ref_cows) | ||
2496 | btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); | ||
2497 | path = btrfs_alloc_path(); | ||
2498 | BUG_ON(!path); | ||
2499 | path->reada = -1; | ||
2500 | |||
2501 | /* FIXME, add redo link to tree so we don't leak on crash */ | ||
2502 | key.objectid = inode->i_ino; | ||
2503 | key.offset = (u64)-1; | ||
2504 | key.type = (u8)-1; | ||
2505 | |||
2506 | btrfs_init_path(path); | ||
2507 | |||
2508 | search_again: | ||
2509 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
2510 | if (ret < 0) | ||
2511 | goto error; | ||
2512 | |||
2513 | if (ret > 0) { | ||
2514 | /* there are no items in the tree for us to truncate, we're | ||
2515 | * done | ||
2516 | */ | ||
2517 | if (path->slots[0] == 0) { | ||
2518 | ret = 0; | ||
2519 | goto error; | ||
2520 | } | ||
2521 | path->slots[0]--; | ||
2522 | } | ||
2523 | |||
2524 | while (1) { | ||
2525 | fi = NULL; | ||
2526 | leaf = path->nodes[0]; | ||
2527 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
2528 | found_type = btrfs_key_type(&found_key); | ||
2529 | encoding = 0; | ||
2530 | |||
2531 | if (found_key.objectid != inode->i_ino) | ||
2532 | break; | ||
2533 | |||
2534 | if (found_type < min_type) | ||
2535 | break; | ||
2536 | |||
2537 | item_end = found_key.offset; | ||
2538 | if (found_type == BTRFS_EXTENT_DATA_KEY) { | ||
2539 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
2540 | struct btrfs_file_extent_item); | ||
2541 | extent_type = btrfs_file_extent_type(leaf, fi); | ||
2542 | encoding = btrfs_file_extent_compression(leaf, fi); | ||
2543 | encoding |= btrfs_file_extent_encryption(leaf, fi); | ||
2544 | encoding |= btrfs_file_extent_other_encoding(leaf, fi); | ||
2545 | |||
2546 | if (extent_type != BTRFS_FILE_EXTENT_INLINE) { | ||
2547 | item_end += | ||
2548 | btrfs_file_extent_num_bytes(leaf, fi); | ||
2549 | } else { /* BTRFS_FILE_EXTENT_INLINE */ | ||
2550 | item_end += btrfs_file_extent_inline_len(leaf, | ||
2551 | fi); | ||
2552 | } | ||
2553 | item_end--; | ||
2554 | } | ||
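| /* this item ends entirely below new_size, so nothing in it needs | ||
| * truncating. Step down to the next lower key type and search again, | ||
| * which lets the backwards scan skip whole classes of items at once. | ||
| */ | ||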
2555 | if (item_end < new_size) { | ||
2556 | if (found_type == BTRFS_DIR_ITEM_KEY) | ||
2557 | found_type = BTRFS_INODE_ITEM_KEY; | ||
2558 | else if (found_type == BTRFS_EXTENT_ITEM_KEY) | ||
2559 | found_type = BTRFS_EXTENT_DATA_KEY; | ||
2560 | else if (found_type == BTRFS_EXTENT_DATA_KEY) | ||
2561 | found_type = BTRFS_XATTR_ITEM_KEY; | ||
2562 | else if (found_type == BTRFS_XATTR_ITEM_KEY) | ||
2563 | found_type = BTRFS_INODE_REF_KEY; | ||
2564 | else if (found_type) | ||
2565 | found_type--; | ||
2566 | else | ||
2567 | break; | ||
2568 | btrfs_set_key_type(&key, found_type); | ||
2569 | goto next; | ||
2570 | } | ||
2571 | if (found_key.offset >= new_size) | ||
2572 | del_item = 1; | ||
2573 | else | ||
2574 | del_item = 0; | ||
2575 | found_extent = 0; | ||
2576 | |||
2577 | /* FIXME, shrink the extent if the ref count is only 1 */ | ||
2578 | if (found_type != BTRFS_EXTENT_DATA_KEY) | ||
2579 | goto delete; | ||
2580 | |||
2581 | if (extent_type != BTRFS_FILE_EXTENT_INLINE) { | ||
2582 | u64 num_dec; | ||
2583 | extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
2584 | if (!del_item && !encoding) { | ||
2585 | u64 orig_num_bytes = | ||
2586 | btrfs_file_extent_num_bytes(leaf, fi); | ||
2587 | extent_num_bytes = new_size - | ||
2588 | found_key.offset + root->sectorsize - 1; | ||
2589 | extent_num_bytes = extent_num_bytes & | ||
2590 | ~((u64)root->sectorsize - 1); | ||
2591 | btrfs_set_file_extent_num_bytes(leaf, fi, | ||
2592 | extent_num_bytes); | ||
2593 | num_dec = (orig_num_bytes - | ||
2594 | extent_num_bytes); | ||
2595 | if (root->ref_cows && extent_start != 0) | ||
2596 | inode_sub_bytes(inode, num_dec); | ||
2597 | btrfs_mark_buffer_dirty(leaf); | ||
2598 | } else { | ||
2599 | extent_num_bytes = | ||
2600 | btrfs_file_extent_disk_num_bytes(leaf, | ||
2601 | fi); | ||
2602 | /* FIXME blocksize != 4096 */ | ||
2603 | num_dec = btrfs_file_extent_num_bytes(leaf, fi); | ||
2604 | if (extent_start != 0) { | ||
2605 | found_extent = 1; | ||
2606 | if (root->ref_cows) | ||
2607 | inode_sub_bytes(inode, num_dec); | ||
2608 | } | ||
2609 | root_gen = btrfs_header_generation(leaf); | ||
2610 | root_owner = btrfs_header_owner(leaf); | ||
2611 | } | ||
2612 | } else { /* BTRFS_FILE_EXTENT_INLINE */ | ||
2613 | /* | ||
2614 | * we can't truncate inline items that have had | ||
2615 | * special encodings | ||
2616 | */ | ||
2617 | if (!del_item && | ||
2618 | btrfs_file_extent_compression(leaf, fi) == 0 && | ||
2619 | btrfs_file_extent_encryption(leaf, fi) == 0 && | ||
2620 | btrfs_file_extent_other_encoding(leaf, fi) == 0) { | ||
2621 | u32 size = new_size - found_key.offset; | ||
2622 | |||
2623 | if (root->ref_cows) { | ||
2624 | inode_sub_bytes(inode, item_end + 1 - | ||
2625 | new_size); | ||
2626 | } | ||
2627 | size = | ||
2628 | btrfs_file_extent_calc_inline_size(size); | ||
2629 | ret = btrfs_truncate_item(trans, root, path, | ||
2630 | size, 1); | ||
2631 | BUG_ON(ret); | ||
2632 | } else if (root->ref_cows) { | ||
2633 | inode_sub_bytes(inode, item_end + 1 - | ||
2634 | found_key.offset); | ||
2635 | } | ||
2636 | } | ||
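| /* deletions are batched: runs of adjacent leaf slots accumulate in | ||
| * pending_del_slot/pending_del_nr and are removed with a single | ||
| * btrfs_del_items() call, instead of rebalancing the leaf once per | ||
| * deleted item. | ||
| */ | ||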
2637 | delete: | ||
2638 | if (del_item) { | ||
2639 | if (!pending_del_nr) { | ||
2640 | /* no pending yet, add ourselves */ | ||
2641 | pending_del_slot = path->slots[0]; | ||
2642 | pending_del_nr = 1; | ||
2643 | } else if (pending_del_nr && | ||
2644 | path->slots[0] + 1 == pending_del_slot) { | ||
2645 | /* hop on the pending chunk */ | ||
2646 | pending_del_nr++; | ||
2647 | pending_del_slot = path->slots[0]; | ||
2648 | } else { | ||
2649 | BUG(); | ||
2650 | } | ||
2651 | } else { | ||
2652 | break; | ||
2653 | } | ||
2654 | if (found_extent) { | ||
2655 | ret = btrfs_free_extent(trans, root, extent_start, | ||
2656 | extent_num_bytes, | ||
2657 | leaf->start, root_owner, | ||
2658 | root_gen, inode->i_ino, 0); | ||
2659 | BUG_ON(ret); | ||
2660 | } | ||
2661 | next: | ||
2662 | if (path->slots[0] == 0) { | ||
2663 | if (pending_del_nr) | ||
2664 | goto del_pending; | ||
2665 | btrfs_release_path(root, path); | ||
2666 | goto search_again; | ||
2667 | } | ||
2668 | |||
2669 | path->slots[0]--; | ||
2670 | if (pending_del_nr && | ||
2671 | path->slots[0] + 1 != pending_del_slot) { | ||
2672 | struct btrfs_key debug; | ||
2673 | del_pending: | ||
2674 | btrfs_item_key_to_cpu(path->nodes[0], &debug, | ||
2675 | pending_del_slot); | ||
2676 | ret = btrfs_del_items(trans, root, path, | ||
2677 | pending_del_slot, | ||
2678 | pending_del_nr); | ||
2679 | BUG_ON(ret); | ||
2680 | pending_del_nr = 0; | ||
2681 | btrfs_release_path(root, path); | ||
2682 | goto search_again; | ||
2683 | } | ||
2684 | } | ||
2685 | ret = 0; | ||
2686 | error: | ||
2687 | if (pending_del_nr) { | ||
2688 | ret = btrfs_del_items(trans, root, path, pending_del_slot, | ||
2689 | pending_del_nr); | ||
2690 | } | ||
2691 | btrfs_free_path(path); | ||
2692 | inode->i_sb->s_dirt = 1; | ||
2693 | return ret; | ||
2694 | } | ||
2695 | |||
2696 | /* | ||
2697 | * taken from block_truncate_page, but does cow as it zeros out | ||
2698 | * any bytes left in the last page in the file. | ||
2699 | */ | ||
2700 | static int btrfs_truncate_page(struct address_space *mapping, loff_t from) | ||
2701 | { | ||
2702 | struct inode *inode = mapping->host; | ||
2703 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
2704 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
2705 | struct btrfs_ordered_extent *ordered; | ||
2706 | char *kaddr; | ||
2707 | u32 blocksize = root->sectorsize; | ||
2708 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | ||
2709 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | ||
2710 | struct page *page; | ||
2711 | int ret = 0; | ||
2712 | u64 page_start; | ||
2713 | u64 page_end; | ||
2714 | |||
2715 | if ((offset & (blocksize - 1)) == 0) | ||
2716 | goto out; | ||
2717 | |||
2718 | ret = -ENOMEM; | ||
2719 | again: | ||
2720 | page = grab_cache_page(mapping, index); | ||
2721 | if (!page) | ||
2722 | goto out; | ||
2723 | |||
2724 | page_start = page_offset(page); | ||
2725 | page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
2726 | |||
2727 | if (!PageUptodate(page)) { | ||
2728 | ret = btrfs_readpage(NULL, page); | ||
2729 | lock_page(page); | ||
2730 | if (page->mapping != mapping) { | ||
2731 | unlock_page(page); | ||
2732 | page_cache_release(page); | ||
2733 | goto again; | ||
2734 | } | ||
2735 | if (!PageUptodate(page)) { | ||
2736 | ret = -EIO; | ||
2737 | goto out_unlock; | ||
2738 | } | ||
2739 | } | ||
2740 | wait_on_page_writeback(page); | ||
2741 | |||
2742 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
2743 | set_page_extent_mapped(page); | ||
2744 | |||
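| /* if a data=ordered extent is still in flight over this page, drop | ||
| * all of our locks, wait for it to finish and retry; zeroing the | ||
| * tail now would race with the pending ordered writeback. | ||
| */ | ||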
2745 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | ||
2746 | if (ordered) { | ||
2747 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
2748 | unlock_page(page); | ||
2749 | page_cache_release(page); | ||
2750 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
2751 | btrfs_put_ordered_extent(ordered); | ||
2752 | goto again; | ||
2753 | } | ||
2754 | |||
2755 | btrfs_set_extent_delalloc(inode, page_start, page_end); | ||
2756 | ret = 0; | ||
2757 | if (offset != PAGE_CACHE_SIZE) { | ||
2758 | kaddr = kmap(page); | ||
2759 | memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); | ||
2760 | flush_dcache_page(page); | ||
2761 | kunmap(page); | ||
2762 | } | ||
2763 | ClearPageChecked(page); | ||
2764 | set_page_dirty(page); | ||
2765 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
2766 | |||
2767 | out_unlock: | ||
2768 | unlock_page(page); | ||
2769 | page_cache_release(page); | ||
2770 | out: | ||
2771 | return ret; | ||
2772 | } | ||
2773 | |||
2774 | int btrfs_cont_expand(struct inode *inode, loff_t size) | ||
2775 | { | ||
2776 | struct btrfs_trans_handle *trans; | ||
2777 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
2778 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
2779 | struct extent_map *em; | ||
2780 | u64 mask = root->sectorsize - 1; | ||
2781 | u64 hole_start = (inode->i_size + mask) & ~mask; | ||
2782 | u64 block_end = (size + mask) & ~mask; | ||
2783 | u64 last_byte; | ||
2784 | u64 cur_offset; | ||
2785 | u64 hole_size; | ||
2786 | int err; | ||
2787 | |||
2788 | if (size <= hole_start) | ||
2789 | return 0; | ||
2790 | |||
2791 | err = btrfs_check_free_space(root, 1, 0); | ||
2792 | if (err) | ||
2793 | return err; | ||
2794 | |||
2795 | btrfs_truncate_page(inode->i_mapping, inode->i_size); | ||
2796 | |||
2797 | while (1) { | ||
2798 | struct btrfs_ordered_extent *ordered; | ||
2799 | btrfs_wait_ordered_range(inode, hole_start, | ||
2800 | block_end - hole_start); | ||
2801 | lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); | ||
2802 | ordered = btrfs_lookup_ordered_extent(inode, hole_start); | ||
2803 | if (!ordered) | ||
2804 | break; | ||
2805 | unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); | ||
2806 | btrfs_put_ordered_extent(ordered); | ||
2807 | } | ||
2808 | |||
2809 | trans = btrfs_start_transaction(root, 1); | ||
2810 | btrfs_set_trans_block_group(trans, inode); | ||
2811 | |||
2812 | cur_offset = hole_start; | ||
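| /* walk the range between the old and the new EOF. Wherever the | ||
| * extent map reports a vacancy, insert an explicit hole: a file | ||
| * extent item whose disk_bytenr is 0, so later lookups see zeros | ||
| * instead of a missing range. | ||
| */ | ||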
2813 | while (1) { | ||
2814 | em = btrfs_get_extent(inode, NULL, 0, cur_offset, | ||
2815 | block_end - cur_offset, 0); | ||
2816 | BUG_ON(IS_ERR(em) || !em); | ||
2817 | last_byte = min(extent_map_end(em), block_end); | ||
2818 | last_byte = (last_byte + mask) & ~mask; | ||
2819 | if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { | ||
2820 | u64 hint_byte = 0; | ||
2821 | hole_size = last_byte - cur_offset; | ||
2822 | err = btrfs_drop_extents(trans, root, inode, | ||
2823 | cur_offset, | ||
2824 | cur_offset + hole_size, | ||
2825 | cur_offset, &hint_byte); | ||
2826 | if (err) | ||
2827 | break; | ||
2828 | err = btrfs_insert_file_extent(trans, root, | ||
2829 | inode->i_ino, cur_offset, 0, | ||
2830 | 0, hole_size, 0, hole_size, | ||
2831 | 0, 0, 0); | ||
2832 | btrfs_drop_extent_cache(inode, hole_start, | ||
2833 | last_byte - 1, 0); | ||
2834 | } | ||
2835 | free_extent_map(em); | ||
2836 | cur_offset = last_byte; | ||
2837 | if (err || cur_offset >= block_end) | ||
2838 | break; | ||
2839 | } | ||
2840 | |||
2841 | btrfs_end_transaction(trans, root); | ||
2842 | unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); | ||
2843 | return err; | ||
2844 | } | ||
2845 | |||
2846 | static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) | ||
2847 | { | ||
2848 | struct inode *inode = dentry->d_inode; | ||
2849 | int err; | ||
2850 | |||
2851 | err = inode_change_ok(inode, attr); | ||
2852 | if (err) | ||
2853 | return err; | ||
2854 | |||
2855 | if (S_ISREG(inode->i_mode) && | ||
2856 | attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { | ||
2857 | err = btrfs_cont_expand(inode, attr->ia_size); | ||
2858 | if (err) | ||
2859 | return err; | ||
2860 | } | ||
2861 | |||
2862 | err = inode_setattr(inode, attr); | ||
2863 | |||
2864 | if (!err && (attr->ia_valid & ATTR_MODE)) | ||
2865 | err = btrfs_acl_chmod(inode); | ||
2866 | return err; | ||
2867 | } | ||
2868 | |||
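| /* called when the last reference to an unlinked inode goes away. | ||
| * The orphan item added at unlink time is what makes the truncate | ||
| * below crash safe: if we die part way through, mount-time orphan | ||
| * cleanup finishes the job. | ||
| */ | ||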
2869 | void btrfs_delete_inode(struct inode *inode) | ||
2870 | { | ||
2871 | struct btrfs_trans_handle *trans; | ||
2872 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
2873 | unsigned long nr; | ||
2874 | int ret; | ||
2875 | |||
2876 | truncate_inode_pages(&inode->i_data, 0); | ||
2877 | if (is_bad_inode(inode)) { | ||
2878 | btrfs_orphan_del(NULL, inode); | ||
2879 | goto no_delete; | ||
2880 | } | ||
2881 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | ||
2882 | |||
2883 | btrfs_i_size_write(inode, 0); | ||
2884 | trans = btrfs_join_transaction(root, 1); | ||
2885 | |||
2886 | btrfs_set_trans_block_group(trans, inode); | ||
2887 | ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); | ||
2888 | if (ret) { | ||
2889 | btrfs_orphan_del(NULL, inode); | ||
2890 | goto no_delete_lock; | ||
2891 | } | ||
2892 | |||
2893 | btrfs_orphan_del(trans, inode); | ||
2894 | |||
2895 | nr = trans->blocks_used; | ||
2896 | clear_inode(inode); | ||
2897 | |||
2898 | btrfs_end_transaction(trans, root); | ||
2899 | btrfs_btree_balance_dirty(root, nr); | ||
2900 | return; | ||
2901 | |||
2902 | no_delete_lock: | ||
2903 | nr = trans->blocks_used; | ||
2904 | btrfs_end_transaction(trans, root); | ||
2905 | btrfs_btree_balance_dirty(root, nr); | ||
2906 | no_delete: | ||
2907 | clear_inode(inode); | ||
2908 | } | ||
2909 | |||
2910 | /* | ||
2911 | * this returns the key found in the dir entry in the location pointer. | ||
2912 | * If no dir entries were found, location->objectid is 0. | ||
2913 | */ | ||
2914 | static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, | ||
2915 | struct btrfs_key *location) | ||
2916 | { | ||
2917 | const char *name = dentry->d_name.name; | ||
2918 | int namelen = dentry->d_name.len; | ||
2919 | struct btrfs_dir_item *di; | ||
2920 | struct btrfs_path *path; | ||
2921 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
2922 | int ret = 0; | ||
2923 | |||
2924 | path = btrfs_alloc_path(); | ||
2925 | BUG_ON(!path); | ||
2926 | |||
2927 | di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, | ||
2928 | namelen, 0); | ||
2929 | if (IS_ERR(di)) | ||
2930 | ret = PTR_ERR(di); | ||
2931 | |||
2932 | if (!di || IS_ERR(di)) | ||
2933 | goto out_err; | ||
2934 | |||
2935 | btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); | ||
2936 | out: | ||
2937 | btrfs_free_path(path); | ||
2938 | return ret; | ||
2939 | out_err: | ||
2940 | location->objectid = 0; | ||
2941 | goto out; | ||
2942 | } | ||
2943 | |||
2944 | /* | ||
2945 | * when we hit a tree root in a directory, the btrfs part of the inode | ||
2946 | * needs to be changed to reflect the root directory of the tree root. This | ||
2947 | * is kind of like crossing a mount point. | ||
2948 | */ | ||
2949 | static int fixup_tree_root_location(struct btrfs_root *root, | ||
2950 | struct btrfs_key *location, | ||
2951 | struct btrfs_root **sub_root, | ||
2952 | struct dentry *dentry) | ||
2953 | { | ||
2954 | struct btrfs_root_item *ri; | ||
2955 | |||
2956 | if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) | ||
2957 | return 0; | ||
2958 | if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) | ||
2959 | return 0; | ||
2960 | |||
2961 | *sub_root = btrfs_read_fs_root(root->fs_info, location, | ||
2962 | dentry->d_name.name, | ||
2963 | dentry->d_name.len); | ||
2964 | if (IS_ERR(*sub_root)) | ||
2965 | return PTR_ERR(*sub_root); | ||
2966 | |||
2967 | ri = &(*sub_root)->root_item; | ||
2968 | location->objectid = btrfs_root_dirid(ri); | ||
2969 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); | ||
2970 | location->offset = 0; | ||
2971 | |||
2972 | return 0; | ||
2973 | } | ||
2974 | |||
2975 | static noinline void init_btrfs_i(struct inode *inode) | ||
2976 | { | ||
2977 | struct btrfs_inode *bi = BTRFS_I(inode); | ||
2978 | |||
2979 | bi->i_acl = NULL; | ||
2980 | bi->i_default_acl = NULL; | ||
2981 | |||
2982 | bi->generation = 0; | ||
2983 | bi->sequence = 0; | ||
2984 | bi->last_trans = 0; | ||
2985 | bi->logged_trans = 0; | ||
2986 | bi->delalloc_bytes = 0; | ||
2987 | bi->disk_i_size = 0; | ||
2988 | bi->flags = 0; | ||
2989 | bi->index_cnt = (u64)-1; | ||
2990 | bi->log_dirty_trans = 0; | ||
2991 | extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); | ||
2992 | extent_io_tree_init(&BTRFS_I(inode)->io_tree, | ||
2993 | inode->i_mapping, GFP_NOFS); | ||
2994 | extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, | ||
2995 | inode->i_mapping, GFP_NOFS); | ||
2996 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); | ||
2997 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); | ||
2998 | mutex_init(&BTRFS_I(inode)->extent_mutex); | ||
2999 | mutex_init(&BTRFS_I(inode)->log_mutex); | ||
3000 | } | ||
3001 | |||
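| /* inode numbers (objectids) are only unique within one subvolume | ||
| * root, so the inode cache hashes on the objectid but | ||
| * btrfs_find_actor below also compares the root; two snapshots can | ||
| * each contain an inode with the same number. | ||
| */ | ||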
3002 | static int btrfs_init_locked_inode(struct inode *inode, void *p) | ||
3003 | { | ||
3004 | struct btrfs_iget_args *args = p; | ||
3005 | inode->i_ino = args->ino; | ||
3006 | init_btrfs_i(inode); | ||
3007 | BTRFS_I(inode)->root = args->root; | ||
3008 | return 0; | ||
3009 | } | ||
3010 | |||
3011 | static int btrfs_find_actor(struct inode *inode, void *opaque) | ||
3012 | { | ||
3013 | struct btrfs_iget_args *args = opaque; | ||
3014 | return args->ino == inode->i_ino && | ||
3015 | args->root == BTRFS_I(inode)->root; | ||
3016 | } | ||
3017 | |||
3018 | struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, | ||
3019 | struct btrfs_root *root, int wait) | ||
3020 | { | ||
3021 | struct inode *inode; | ||
3022 | struct btrfs_iget_args args; | ||
3023 | args.ino = objectid; | ||
3024 | args.root = root; | ||
3025 | |||
3026 | if (wait) { | ||
3027 | inode = ilookup5(s, objectid, btrfs_find_actor, | ||
3028 | (void *)&args); | ||
3029 | } else { | ||
3030 | inode = ilookup5_nowait(s, objectid, btrfs_find_actor, | ||
3031 | (void *)&args); | ||
3032 | } | ||
3033 | return inode; | ||
3034 | } | ||
3035 | |||
3036 | struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | ||
3037 | struct btrfs_root *root) | ||
3038 | { | ||
3039 | struct inode *inode; | ||
3040 | struct btrfs_iget_args args; | ||
3041 | args.ino = objectid; | ||
3042 | args.root = root; | ||
3043 | |||
3044 | inode = iget5_locked(s, objectid, btrfs_find_actor, | ||
3045 | btrfs_init_locked_inode, | ||
3046 | (void *)&args); | ||
3047 | return inode; | ||
3048 | } | ||
3049 | |||
3050 | /* Get an inode object given its location and corresponding root. | ||
3051 | * Returns in *is_new whether the inode had to be read from disk | ||
3052 | */ | ||
3053 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | ||
3054 | struct btrfs_root *root, int *is_new) | ||
3055 | { | ||
3056 | struct inode *inode; | ||
3057 | |||
3058 | inode = btrfs_iget_locked(s, location->objectid, root); | ||
3059 | if (!inode) | ||
3060 | return ERR_PTR(-EACCES); | ||
3061 | |||
3062 | if (inode->i_state & I_NEW) { | ||
3063 | BTRFS_I(inode)->root = root; | ||
3064 | memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); | ||
3065 | btrfs_read_locked_inode(inode); | ||
3066 | unlock_new_inode(inode); | ||
3067 | if (is_new) | ||
3068 | *is_new = 1; | ||
3069 | } else { | ||
3070 | if (is_new) | ||
3071 | *is_new = 0; | ||
3072 | } | ||
3073 | |||
3074 | return inode; | ||
3075 | } | ||
3076 | |||
3077 | struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | ||
3078 | { | ||
3079 | struct inode *inode; | ||
3080 | struct btrfs_inode *bi = BTRFS_I(dir); | ||
3081 | struct btrfs_root *root = bi->root; | ||
3082 | struct btrfs_root *sub_root = root; | ||
3083 | struct btrfs_key location; | ||
3084 | int ret, new; | ||
3085 | |||
3086 | if (dentry->d_name.len > BTRFS_NAME_LEN) | ||
3087 | return ERR_PTR(-ENAMETOOLONG); | ||
3088 | |||
3089 | ret = btrfs_inode_by_name(dir, dentry, &location); | ||
3090 | |||
3091 | if (ret < 0) | ||
3092 | return ERR_PTR(ret); | ||
3093 | |||
3094 | inode = NULL; | ||
3095 | if (location.objectid) { | ||
3096 | ret = fixup_tree_root_location(root, &location, &sub_root, | ||
3097 | dentry); | ||
3098 | if (ret < 0) | ||
3099 | return ERR_PTR(ret); | ||
3100 | if (ret > 0) | ||
3101 | return ERR_PTR(-ENOENT); | ||
3102 | inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); | ||
3103 | if (IS_ERR(inode)) | ||
3104 | return ERR_CAST(inode); | ||
3105 | } | ||
3106 | return inode; | ||
3107 | } | ||
3108 | |||
3109 | static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, | ||
3110 | struct nameidata *nd) | ||
3111 | { | ||
3112 | struct inode *inode; | ||
3113 | |||
3114 | if (dentry->d_name.len > BTRFS_NAME_LEN) | ||
3115 | return ERR_PTR(-ENAMETOOLONG); | ||
3116 | |||
3117 | inode = btrfs_lookup_dentry(dir, dentry); | ||
3118 | if (IS_ERR(inode)) | ||
3119 | return ERR_CAST(inode); | ||
3120 | |||
3121 | return d_splice_alias(inode, dentry); | ||
3122 | } | ||
3123 | |||
3124 | static unsigned char btrfs_filetype_table[] = { | ||
3125 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK | ||
3126 | }; | ||
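| /* indexed by the on-disk BTRFS_FT_* type byte stored in each dir | ||
| * item, translating it to the DT_* values that filldir expects | ||
| */ | ||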
3127 | |||
3128 | static int btrfs_real_readdir(struct file *filp, void *dirent, | ||
3129 | filldir_t filldir) | ||
3130 | { | ||
3131 | struct inode *inode = filp->f_dentry->d_inode; | ||
3132 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3133 | struct btrfs_item *item; | ||
3134 | struct btrfs_dir_item *di; | ||
3135 | struct btrfs_key key; | ||
3136 | struct btrfs_key found_key; | ||
3137 | struct btrfs_path *path; | ||
3138 | int ret; | ||
3139 | u32 nritems; | ||
3140 | struct extent_buffer *leaf; | ||
3141 | int slot; | ||
3142 | int advance; | ||
3143 | unsigned char d_type; | ||
3144 | int over = 0; | ||
3145 | u32 di_cur; | ||
3146 | u32 di_total; | ||
3147 | u32 di_len; | ||
3148 | int key_type = BTRFS_DIR_INDEX_KEY; | ||
3149 | char tmp_name[32]; | ||
3150 | char *name_ptr; | ||
3151 | int name_len; | ||
3152 | |||
3153 | /* FIXME, use a real flag for deciding about the key type */ | ||
3154 | if (root->fs_info->tree_root == root) | ||
3155 | key_type = BTRFS_DIR_ITEM_KEY; | ||
3156 | |||
3157 | /* special case for "." */ | ||
3158 | if (filp->f_pos == 0) { | ||
3159 | over = filldir(dirent, ".", 1, | ||
3160 | 1, inode->i_ino, | ||
3161 | DT_DIR); | ||
3162 | if (over) | ||
3163 | return 0; | ||
3164 | filp->f_pos = 1; | ||
3165 | } | ||
3166 | /* special case for .., just use the back ref */ | ||
3167 | if (filp->f_pos == 1) { | ||
3168 | u64 pino = parent_ino(filp->f_path.dentry); | ||
3169 | over = filldir(dirent, "..", 2, | ||
3170 | 2, pino, DT_DIR); | ||
3171 | if (over) | ||
3172 | return 0; | ||
3173 | filp->f_pos = 2; | ||
3174 | } | ||
3175 | path = btrfs_alloc_path(); | ||
| if (!path) | ||
| return -ENOMEM; | ||
3176 | path->reada = 2; | ||
3177 | |||
3178 | btrfs_set_key_type(&key, key_type); | ||
3179 | key.offset = filp->f_pos; | ||
3180 | key.objectid = inode->i_ino; | ||
3181 | |||
3182 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
3183 | if (ret < 0) | ||
3184 | goto err; | ||
3185 | advance = 0; | ||
3186 | |||
3187 | while (1) { | ||
3188 | leaf = path->nodes[0]; | ||
3189 | nritems = btrfs_header_nritems(leaf); | ||
3190 | slot = path->slots[0]; | ||
3191 | if (advance || slot >= nritems) { | ||
3192 | if (slot >= nritems - 1) { | ||
3193 | ret = btrfs_next_leaf(root, path); | ||
3194 | if (ret) | ||
3195 | break; | ||
3196 | leaf = path->nodes[0]; | ||
3197 | nritems = btrfs_header_nritems(leaf); | ||
3198 | slot = path->slots[0]; | ||
3199 | } else { | ||
3200 | slot++; | ||
3201 | path->slots[0]++; | ||
3202 | } | ||
3203 | } | ||
3204 | |||
3205 | advance = 1; | ||
3206 | item = btrfs_item_nr(leaf, slot); | ||
3207 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
3208 | |||
3209 | if (found_key.objectid != key.objectid) | ||
3210 | break; | ||
3211 | if (btrfs_key_type(&found_key) != key_type) | ||
3212 | break; | ||
3213 | if (found_key.offset < filp->f_pos) | ||
3214 | continue; | ||
3215 | |||
3216 | filp->f_pos = found_key.offset; | ||
3217 | |||
3218 | di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); | ||
3219 | di_cur = 0; | ||
3220 | di_total = btrfs_item_size(leaf, item); | ||
3221 | |||
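| /* a single DIR_ITEM can pack several entries whose names hash to | ||
| * the same value, so walk every sub-item inside it; a DIR_INDEX | ||
| * item always holds exactly one entry | ||
| */ | ||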
3222 | while (di_cur < di_total) { | ||
3223 | struct btrfs_key location; | ||
3224 | |||
3225 | name_len = btrfs_dir_name_len(leaf, di); | ||
3226 | if (name_len <= sizeof(tmp_name)) { | ||
3227 | name_ptr = tmp_name; | ||
3228 | } else { | ||
3229 | name_ptr = kmalloc(name_len, GFP_NOFS); | ||
3230 | if (!name_ptr) { | ||
3231 | ret = -ENOMEM; | ||
3232 | goto err; | ||
3233 | } | ||
3234 | } | ||
3235 | read_extent_buffer(leaf, name_ptr, | ||
3236 | (unsigned long)(di + 1), name_len); | ||
3237 | |||
3238 | d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; | ||
3239 | btrfs_dir_item_key_to_cpu(leaf, di, &location); | ||
3240 | |||
3241 | /* is this a reference to our own snapshot? If so | ||
3242 | * skip it | ||
3243 | */ | ||
3244 | if (location.type == BTRFS_ROOT_ITEM_KEY && | ||
3245 | location.objectid == root->root_key.objectid) { | ||
3246 | over = 0; | ||
3247 | goto skip; | ||
3248 | } | ||
3249 | over = filldir(dirent, name_ptr, name_len, | ||
3250 | found_key.offset, location.objectid, | ||
3251 | d_type); | ||
3252 | |||
3253 | skip: | ||
3254 | if (name_ptr != tmp_name) | ||
3255 | kfree(name_ptr); | ||
3256 | |||
3257 | if (over) | ||
3258 | goto nopos; | ||
3259 | di_len = btrfs_dir_name_len(leaf, di) + | ||
3260 | btrfs_dir_data_len(leaf, di) + sizeof(*di); | ||
3261 | di_cur += di_len; | ||
3262 | di = (struct btrfs_dir_item *)((char *)di + di_len); | ||
3263 | } | ||
3264 | } | ||
3265 | |||
3266 | /* Reached end of directory/root. Bump pos past the last item. */ | ||
3267 | if (key_type == BTRFS_DIR_INDEX_KEY) | ||
3268 | filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); | ||
3269 | else | ||
3270 | filp->f_pos++; | ||
3271 | nopos: | ||
3272 | ret = 0; | ||
3273 | err: | ||
3274 | btrfs_free_path(path); | ||
3275 | return ret; | ||
3276 | } | ||
3277 | |||
3278 | int btrfs_write_inode(struct inode *inode, int wait) | ||
3279 | { | ||
3280 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3281 | struct btrfs_trans_handle *trans; | ||
3282 | int ret = 0; | ||
3283 | |||
3284 | if (root->fs_info->btree_inode == inode) | ||
3285 | return 0; | ||
3286 | |||
3287 | if (wait) { | ||
3288 | trans = btrfs_join_transaction(root, 1); | ||
3289 | btrfs_set_trans_block_group(trans, inode); | ||
3290 | ret = btrfs_commit_transaction(trans, root); | ||
3291 | } | ||
3292 | return ret; | ||
3293 | } | ||
3294 | |||
3295 | /* | ||
3296 | * This is somewhat expensive, updating the tree every time the | ||
3297 | * inode changes. But, it is most likely to find the inode in cache. | ||
3298 | * FIXME, needs more benchmarking... there are no reasons other than performance | ||
3299 | * to keep or drop this code. | ||
3300 | */ | ||
3301 | void btrfs_dirty_inode(struct inode *inode) | ||
3302 | { | ||
3303 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3304 | struct btrfs_trans_handle *trans; | ||
3305 | |||
3306 | trans = btrfs_join_transaction(root, 1); | ||
3307 | btrfs_set_trans_block_group(trans, inode); | ||
3308 | btrfs_update_inode(trans, root, inode); | ||
3309 | btrfs_end_transaction(trans, root); | ||
3310 | } | ||
3311 | |||
3312 | /* | ||
3313 | * find the highest existing sequence number in a directory | ||
3314 | * and then set the in-memory index_cnt variable to reflect | ||
3315 | * free sequence numbers | ||
3316 | */ | ||
3317 | static int btrfs_set_inode_index_count(struct inode *inode) | ||
3318 | { | ||
3319 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3320 | struct btrfs_key key, found_key; | ||
3321 | struct btrfs_path *path; | ||
3322 | struct extent_buffer *leaf; | ||
3323 | int ret; | ||
3324 | |||
3325 | key.objectid = inode->i_ino; | ||
3326 | btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); | ||
3327 | key.offset = (u64)-1; | ||
3328 | |||
3329 | path = btrfs_alloc_path(); | ||
3330 | if (!path) | ||
3331 | return -ENOMEM; | ||
3332 | |||
3333 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
3334 | if (ret < 0) | ||
3335 | goto out; | ||
3336 | /* FIXME: we should be able to handle this */ | ||
3337 | if (ret == 0) | ||
3338 | goto out; | ||
3339 | ret = 0; | ||
3340 | |||
3341 | /* | ||
3342 | * MAGIC NUMBER EXPLANATION: | ||
3343 | * since we search a directory based on f_pos, and '.' and '..' have | ||
3344 | * f_pos of 0 and 1 respectively, every other entry has to start at | ||
3345 | * index 2 | ||
3346 | */ | ||
3347 | if (path->slots[0] == 0) { | ||
3348 | BTRFS_I(inode)->index_cnt = 2; | ||
3349 | goto out; | ||
3350 | } | ||
3351 | |||
3352 | path->slots[0]--; | ||
3353 | |||
3354 | leaf = path->nodes[0]; | ||
3355 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
3356 | |||
3357 | if (found_key.objectid != inode->i_ino || | ||
3358 | btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { | ||
3359 | BTRFS_I(inode)->index_cnt = 2; | ||
3360 | goto out; | ||
3361 | } | ||
3362 | |||
3363 | BTRFS_I(inode)->index_cnt = found_key.offset + 1; | ||
3364 | out: | ||
3365 | btrfs_free_path(path); | ||
3366 | return ret; | ||
3367 | } | ||
3368 | |||
3369 | /* | ||
3370 | * helper to find a free sequence number in a given directory. The current | ||
3371 | * code is very simple; later versions will do smarter things in the btree | ||
3372 | */ | ||
3373 | int btrfs_set_inode_index(struct inode *dir, u64 *index) | ||
3374 | { | ||
3375 | int ret = 0; | ||
3376 | |||
3377 | if (BTRFS_I(dir)->index_cnt == (u64)-1) { | ||
3378 | ret = btrfs_set_inode_index_count(dir); | ||
3379 | if (ret) | ||
3380 | return ret; | ||
3381 | } | ||
3382 | |||
3383 | *index = BTRFS_I(dir)->index_cnt; | ||
3384 | BTRFS_I(dir)->index_cnt++; | ||
3385 | |||
3386 | return ret; | ||
3387 | } | ||
3388 | |||
3389 | static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | ||
3390 | struct btrfs_root *root, | ||
3391 | struct inode *dir, | ||
3392 | const char *name, int name_len, | ||
3393 | u64 ref_objectid, u64 objectid, | ||
3394 | u64 alloc_hint, int mode, u64 *index) | ||
3395 | { | ||
3396 | struct inode *inode; | ||
3397 | struct btrfs_inode_item *inode_item; | ||
3398 | struct btrfs_key *location; | ||
3399 | struct btrfs_path *path; | ||
3400 | struct btrfs_inode_ref *ref; | ||
3401 | struct btrfs_key key[2]; | ||
3402 | u32 sizes[2]; | ||
3403 | unsigned long ptr; | ||
3404 | int ret; | ||
3405 | int owner; | ||
3406 | |||
3407 | path = btrfs_alloc_path(); | ||
3408 | BUG_ON(!path); | ||
3409 | |||
3410 | inode = new_inode(root->fs_info->sb); | ||
3411 | if (!inode) { | ||
| btrfs_free_path(path); | ||
3412 | return ERR_PTR(-ENOMEM); | ||
| } | ||
3413 |||
3414 | if (dir) { | ||
3415 | ret = btrfs_set_inode_index(dir, index); | ||
3416 | if (ret) { | ||
| btrfs_free_path(path); | ||
| iput(inode); | ||
3417 | return ERR_PTR(ret); | ||
| } | ||
3418 | } | ||
3419 | /* | ||
3420 | * index_cnt is ignored for everything but a dir, | ||
3421 | * btrfs_set_inode_index_count has an explanation for the magic | ||
3422 | * number | ||
3423 | */ | ||
3424 | init_btrfs_i(inode); | ||
3425 | BTRFS_I(inode)->index_cnt = 2; | ||
3426 | BTRFS_I(inode)->root = root; | ||
3427 | BTRFS_I(inode)->generation = trans->transid; | ||
3428 | |||
3429 | if (mode & S_IFDIR) | ||
3430 | owner = 0; | ||
3431 | else | ||
3432 | owner = 1; | ||
3433 | BTRFS_I(inode)->block_group = | ||
3434 | btrfs_find_block_group(root, 0, alloc_hint, owner); | ||
3435 | if ((mode & S_IFREG)) { | ||
3436 | if (btrfs_test_opt(root, NODATASUM)) | ||
3437 | btrfs_set_flag(inode, NODATASUM); | ||
3438 | if (btrfs_test_opt(root, NODATACOW)) | ||
3439 | btrfs_set_flag(inode, NODATACOW); | ||
3440 | } | ||
3441 | |||
3442 | key[0].objectid = objectid; | ||
3443 | btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); | ||
3444 | key[0].offset = 0; | ||
3445 | |||
3446 | key[1].objectid = objectid; | ||
3447 | btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); | ||
3448 | key[1].offset = ref_objectid; | ||
3449 | |||
3450 | sizes[0] = sizeof(struct btrfs_inode_item); | ||
3451 | sizes[1] = name_len + sizeof(*ref); | ||
3452 | |||
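| /* insert the INODE_ITEM and its first INODE_REF back reference in | ||
| * one btree operation; the two keys share an objectid and sort next | ||
| * to each other, so they land in the same leaf | ||
| */ | ||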
3453 | ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); | ||
3454 | if (ret != 0) | ||
3455 | goto fail; | ||
3456 | |||
3457 | if (objectid > root->highest_inode) | ||
3458 | root->highest_inode = objectid; | ||
3459 | |||
3460 | inode->i_uid = current_fsuid(); | ||
3461 | inode->i_gid = current_fsgid(); | ||
3462 | inode->i_mode = mode; | ||
3463 | inode->i_ino = objectid; | ||
3464 | inode_set_bytes(inode, 0); | ||
3465 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | ||
3466 | inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
3467 | struct btrfs_inode_item); | ||
3468 | fill_inode_item(trans, path->nodes[0], inode_item, inode); | ||
3469 | |||
3470 | ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, | ||
3471 | struct btrfs_inode_ref); | ||
3472 | btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); | ||
3473 | btrfs_set_inode_ref_index(path->nodes[0], ref, *index); | ||
3474 | ptr = (unsigned long)(ref + 1); | ||
3475 | write_extent_buffer(path->nodes[0], name, ptr, name_len); | ||
3476 | |||
3477 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
3478 | btrfs_free_path(path); | ||
3479 | |||
3480 | location = &BTRFS_I(inode)->location; | ||
3481 | location->objectid = objectid; | ||
3482 | location->offset = 0; | ||
3483 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); | ||
3484 | |||
3485 | insert_inode_hash(inode); | ||
3486 | return inode; | ||
3487 | fail: | ||
3488 | if (dir) | ||
3489 | BTRFS_I(dir)->index_cnt--; | ||
3490 | btrfs_free_path(path); | ||
3491 | return ERR_PTR(ret); | ||
3492 | } | ||
3493 | |||
3494 | static inline u8 btrfs_inode_type(struct inode *inode) | ||
3495 | { | ||
3496 | return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; | ||
3497 | } | ||
3498 | |||
3499 | /* | ||
3500 | * utility function to add 'inode' into 'parent_inode' with | ||
3501 | * a given name and a given sequence number. | ||
3502 | * if 'add_backref' is true, also insert a backref from the | ||
3503 | * inode to the parent directory. | ||
3504 | */ | ||
3505 | int btrfs_add_link(struct btrfs_trans_handle *trans, | ||
3506 | struct inode *parent_inode, struct inode *inode, | ||
3507 | const char *name, int name_len, int add_backref, u64 index) | ||
3508 | { | ||
3509 | int ret; | ||
3510 | struct btrfs_key key; | ||
3511 | struct btrfs_root *root = BTRFS_I(parent_inode)->root; | ||
3512 | |||
3513 | key.objectid = inode->i_ino; | ||
3514 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | ||
3515 | key.offset = 0; | ||
3516 | |||
3517 | ret = btrfs_insert_dir_item(trans, root, name, name_len, | ||
3518 | parent_inode->i_ino, | ||
3519 | &key, btrfs_inode_type(inode), | ||
3520 | index); | ||
3521 | if (ret == 0) { | ||
3522 | if (add_backref) { | ||
3523 | ret = btrfs_insert_inode_ref(trans, root, | ||
3524 | name, name_len, | ||
3525 | inode->i_ino, | ||
3526 | parent_inode->i_ino, | ||
3527 | index); | ||
3528 | } | ||
3529 | btrfs_i_size_write(parent_inode, parent_inode->i_size + | ||
3530 | name_len * 2); | ||
3531 | parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; | ||
3532 | ret = btrfs_update_inode(trans, root, parent_inode); | ||
3533 | } | ||
3534 | return ret; | ||
3535 | } | ||
3536 | |||
3537 | static int btrfs_add_nondir(struct btrfs_trans_handle *trans, | ||
3538 | struct dentry *dentry, struct inode *inode, | ||
3539 | int backref, u64 index) | ||
3540 | { | ||
3541 | int err = btrfs_add_link(trans, dentry->d_parent->d_inode, | ||
3542 | inode, dentry->d_name.name, | ||
3543 | dentry->d_name.len, backref, index); | ||
3544 | if (!err) { | ||
3545 | d_instantiate(dentry, inode); | ||
3546 | return 0; | ||
3547 | } | ||
3548 | if (err > 0) | ||
3549 | err = -EEXIST; | ||
3550 | return err; | ||
3551 | } | ||
3552 | |||
3553 | static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | ||
3554 | int mode, dev_t rdev) | ||
3555 | { | ||
3556 | struct btrfs_trans_handle *trans; | ||
3557 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
3558 | struct inode *inode = NULL; | ||
3559 | int err; | ||
3560 | int drop_inode = 0; | ||
3561 | u64 objectid; | ||
3562 | unsigned long nr = 0; | ||
3563 | u64 index = 0; | ||
3564 | |||
3565 | if (!new_valid_dev(rdev)) | ||
3566 | return -EINVAL; | ||
3567 | |||
3568 | err = btrfs_check_free_space(root, 1, 0); | ||
3569 | if (err) | ||
3570 | goto fail; | ||
3571 | |||
3572 | trans = btrfs_start_transaction(root, 1); | ||
3573 | btrfs_set_trans_block_group(trans, dir); | ||
3574 | |||
3575 | err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); | ||
3576 | if (err) { | ||
3577 | err = -ENOSPC; | ||
3578 | goto out_unlock; | ||
3579 | } | ||
3580 | |||
3581 | inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, | ||
3582 | dentry->d_name.len, | ||
3583 | dentry->d_parent->d_inode->i_ino, objectid, | ||
3584 | BTRFS_I(dir)->block_group, mode, &index); | ||
3585 | err = PTR_ERR(inode); | ||
3586 | if (IS_ERR(inode)) | ||
3587 | goto out_unlock; | ||
3588 | |||
3589 | err = btrfs_init_acl(inode, dir); | ||
3590 | if (err) { | ||
3591 | drop_inode = 1; | ||
3592 | goto out_unlock; | ||
3593 | } | ||
3594 | |||
3595 | btrfs_set_trans_block_group(trans, inode); | ||
3596 | err = btrfs_add_nondir(trans, dentry, inode, 0, index); | ||
3597 | if (err) | ||
3598 | drop_inode = 1; | ||
3599 | else { | ||
3600 | inode->i_op = &btrfs_special_inode_operations; | ||
3601 | init_special_inode(inode, inode->i_mode, rdev); | ||
3602 | btrfs_update_inode(trans, root, inode); | ||
3603 | } | ||
3604 | dir->i_sb->s_dirt = 1; | ||
3605 | btrfs_update_inode_block_group(trans, inode); | ||
3606 | btrfs_update_inode_block_group(trans, dir); | ||
3607 | out_unlock: | ||
3608 | nr = trans->blocks_used; | ||
3609 | btrfs_end_transaction_throttle(trans, root); | ||
3610 | fail: | ||
3611 | if (drop_inode) { | ||
3612 | inode_dec_link_count(inode); | ||
3613 | iput(inode); | ||
3614 | } | ||
3615 | btrfs_btree_balance_dirty(root, nr); | ||
3616 | return err; | ||
3617 | } | ||
3618 | |||
3619 | static int btrfs_create(struct inode *dir, struct dentry *dentry, | ||
3620 | int mode, struct nameidata *nd) | ||
3621 | { | ||
3622 | struct btrfs_trans_handle *trans; | ||
3623 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
3624 | struct inode *inode = NULL; | ||
3625 | int err; | ||
3626 | int drop_inode = 0; | ||
3627 | unsigned long nr = 0; | ||
3628 | u64 objectid; | ||
3629 | u64 index = 0; | ||
3630 | |||
3631 | err = btrfs_check_free_space(root, 1, 0); | ||
3632 | if (err) | ||
3633 | goto fail; | ||
3634 | trans = btrfs_start_transaction(root, 1); | ||
3635 | btrfs_set_trans_block_group(trans, dir); | ||
3636 | |||
3637 | err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); | ||
3638 | if (err) { | ||
3639 | err = -ENOSPC; | ||
3640 | goto out_unlock; | ||
3641 | } | ||
3642 | |||
3643 | inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, | ||
3644 | dentry->d_name.len, | ||
3645 | dentry->d_parent->d_inode->i_ino, | ||
3646 | objectid, BTRFS_I(dir)->block_group, mode, | ||
3647 | &index); | ||
3648 | err = PTR_ERR(inode); | ||
3649 | if (IS_ERR(inode)) | ||
3650 | goto out_unlock; | ||
3651 | |||
3652 | err = btrfs_init_acl(inode, dir); | ||
3653 | if (err) { | ||
3654 | drop_inode = 1; | ||
3655 | goto out_unlock; | ||
3656 | } | ||
3657 | |||
3658 | btrfs_set_trans_block_group(trans, inode); | ||
3659 | err = btrfs_add_nondir(trans, dentry, inode, 0, index); | ||
3660 | if (err) | ||
3661 | drop_inode = 1; | ||
3662 | else { | ||
3663 | inode->i_mapping->a_ops = &btrfs_aops; | ||
3664 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
3665 | inode->i_fop = &btrfs_file_operations; | ||
3666 | inode->i_op = &btrfs_file_inode_operations; | ||
3667 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | ||
3668 | } | ||
3669 | dir->i_sb->s_dirt = 1; | ||
3670 | btrfs_update_inode_block_group(trans, inode); | ||
3671 | btrfs_update_inode_block_group(trans, dir); | ||
3672 | out_unlock: | ||
3673 | nr = trans->blocks_used; | ||
3674 | btrfs_end_transaction_throttle(trans, root); | ||
3675 | fail: | ||
3676 | if (drop_inode) { | ||
3677 | inode_dec_link_count(inode); | ||
3678 | iput(inode); | ||
3679 | } | ||
3680 | btrfs_btree_balance_dirty(root, nr); | ||
3681 | return err; | ||
3682 | } | ||
3683 | |||
3684 | static int btrfs_link(struct dentry *old_dentry, struct inode *dir, | ||
3685 | struct dentry *dentry) | ||
3686 | { | ||
3687 | struct btrfs_trans_handle *trans; | ||
3688 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
3689 | struct inode *inode = old_dentry->d_inode; | ||
3690 | u64 index; | ||
3691 | unsigned long nr = 0; | ||
3692 | int err; | ||
3693 | int drop_inode = 0; | ||
3694 | |||
3695 | if (inode->i_nlink == 0) | ||
3696 | return -ENOENT; | ||
3697 | |||
3698 | btrfs_inc_nlink(inode); | ||
3699 | err = btrfs_check_free_space(root, 1, 0); | ||
3700 | if (err) | ||
3701 | goto fail; | ||
3702 | err = btrfs_set_inode_index(dir, &index); | ||
3703 | if (err) | ||
3704 | goto fail; | ||
3705 | |||
3706 | trans = btrfs_start_transaction(root, 1); | ||
3707 | |||
3708 | btrfs_set_trans_block_group(trans, dir); | ||
3709 | atomic_inc(&inode->i_count); | ||
3710 | |||
3711 | err = btrfs_add_nondir(trans, dentry, inode, 1, index); | ||
3712 | |||
3713 | if (err) | ||
3714 | drop_inode = 1; | ||
3715 | |||
3716 | dir->i_sb->s_dirt = 1; | ||
3717 | btrfs_update_inode_block_group(trans, dir); | ||
3718 | err = btrfs_update_inode(trans, root, inode); | ||
3719 | |||
3720 | if (err) | ||
3721 | drop_inode = 1; | ||
3722 | |||
3723 | nr = trans->blocks_used; | ||
3724 | btrfs_end_transaction_throttle(trans, root); | ||
3725 | fail: | ||
3726 | if (drop_inode) { | ||
3727 | inode_dec_link_count(inode); | ||
3728 | iput(inode); | ||
3729 | } | ||
3730 | btrfs_btree_balance_dirty(root, nr); | ||
3731 | return err; | ||
3732 | } | ||
3733 | |||
3734 | static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
3735 | { | ||
3736 | struct inode *inode = NULL; | ||
3737 | struct btrfs_trans_handle *trans; | ||
3738 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
3739 | int err = 0; | ||
3740 | int drop_on_err = 0; | ||
3741 | u64 objectid = 0; | ||
3742 | u64 index = 0; | ||
3743 | unsigned long nr = 1; | ||
3744 | |||
3745 | err = btrfs_check_free_space(root, 1, 0); | ||
3746 | if (err) | ||
3747 | goto out_unlock; | ||
3748 | |||
3749 | trans = btrfs_start_transaction(root, 1); | ||
3750 | if (IS_ERR(trans)) { | ||
3751 | err = PTR_ERR(trans); | ||
3752 | goto out_unlock; | ||
3753 | } | ||
3754 |||
3755 | btrfs_set_trans_block_group(trans, dir); | ||
3756 | |||
3757 | err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); | ||
3758 | if (err) { | ||
3759 | err = -ENOSPC; | ||
3760 | goto out_unlock; | ||
3761 | } | ||
3762 | |||
3763 | inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, | ||
3764 | dentry->d_name.len, | ||
3765 | dentry->d_parent->d_inode->i_ino, objectid, | ||
3766 | BTRFS_I(dir)->block_group, S_IFDIR | mode, | ||
3767 | &index); | ||
3768 | if (IS_ERR(inode)) { | ||
3769 | err = PTR_ERR(inode); | ||
3770 | goto out_fail; | ||
3771 | } | ||
3772 | |||
3773 | drop_on_err = 1; | ||
3774 | |||
3775 | err = btrfs_init_acl(inode, dir); | ||
3776 | if (err) | ||
3777 | goto out_fail; | ||
3778 | |||
3779 | inode->i_op = &btrfs_dir_inode_operations; | ||
3780 | inode->i_fop = &btrfs_dir_file_operations; | ||
3781 | btrfs_set_trans_block_group(trans, inode); | ||
3782 | |||
3783 | btrfs_i_size_write(inode, 0); | ||
3784 | err = btrfs_update_inode(trans, root, inode); | ||
3785 | if (err) | ||
3786 | goto out_fail; | ||
3787 | |||
3788 | err = btrfs_add_link(trans, dentry->d_parent->d_inode, | ||
3789 | inode, dentry->d_name.name, | ||
3790 | dentry->d_name.len, 0, index); | ||
3791 | if (err) | ||
3792 | goto out_fail; | ||
3793 | |||
3794 | d_instantiate(dentry, inode); | ||
3795 | drop_on_err = 0; | ||
3796 | dir->i_sb->s_dirt = 1; | ||
3797 | btrfs_update_inode_block_group(trans, inode); | ||
3798 | btrfs_update_inode_block_group(trans, dir); | ||
3799 | |||
3800 | out_fail: | ||
3801 | nr = trans->blocks_used; | ||
3802 | btrfs_end_transaction_throttle(trans, root); | ||
3803 | |||
3804 | out_unlock: | ||
3805 | if (drop_on_err) | ||
3806 | iput(inode); | ||
3807 | btrfs_btree_balance_dirty(root, nr); | ||
3808 | return err; | ||
3809 | } | ||
3810 | |||
3811 | /* helper for btrfs_get_extent. Given an existing extent in the tree, | ||
3812 | * and an extent that you want to insert, deal with overlap and insert | ||
3813 | * the new extent into the tree. | ||
3814 | */ | ||
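| /* | ||
| * Illustrative numbers: if 'existing' already covers [0, 4k) and the | ||
| * new 'em' describes [0, 16k), the caller passes map_start = 4k and | ||
| * map_len = 12k. em is trimmed to [4k, 16k) and, for a real | ||
| * (non-compressed, on-disk) extent, block_start is advanced by the | ||
| * same 4k so the file-offset-to-disk mapping stays correct. | ||
| */ | ||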
3815 | static int merge_extent_mapping(struct extent_map_tree *em_tree, | ||
3816 | struct extent_map *existing, | ||
3817 | struct extent_map *em, | ||
3818 | u64 map_start, u64 map_len) | ||
3819 | { | ||
3820 | u64 start_diff; | ||
3821 | |||
3822 | BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); | ||
3823 | start_diff = map_start - em->start; | ||
3824 | em->start = map_start; | ||
3825 | em->len = map_len; | ||
3826 | if (em->block_start < EXTENT_MAP_LAST_BYTE && | ||
3827 | !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { | ||
3828 | em->block_start += start_diff; | ||
3829 | em->block_len -= start_diff; | ||
3830 | } | ||
3831 | return add_extent_mapping(em_tree, em); | ||
3832 | } | ||
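
The clipping above narrows an extent map to the window that was actually requested: the logical start moves forward by start_diff, and for uncompressed extents the disk address moves with it. A minimal user-space sketch of that arithmetic follows; "toy_em" is an illustrative stand-in for struct extent_map, not the kernel type.

/* build: cc -o clip clip.c */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_em {
	uint64_t start;		/* logical file offset */
	uint64_t len;
	uint64_t block_start;	/* disk byte address */
	uint64_t block_len;
	int compressed;
};

static void clip_to_window(struct toy_em *em, uint64_t map_start,
			   uint64_t map_len)
{
	uint64_t start_diff;

	assert(map_start >= em->start && map_start < em->start + em->len);
	start_diff = map_start - em->start;
	em->start = map_start;
	em->len = map_len;
	/*
	 * Compressed extents are always addressed from their first disk
	 * byte, so only uncompressed extents shift block_start forward.
	 */
	if (!em->compressed) {
		em->block_start += start_diff;
		em->block_len -= start_diff;
	}
}

int main(void)
{
	struct toy_em em = {
		.start = 0, .len = 16384,
		.block_start = 1048576, .block_len = 16384,
	};

	clip_to_window(&em, 4096, 4096);
	/* prints: logical 4096 disk 1052672 */
	printf("logical %llu disk %llu\n",
	       (unsigned long long)em.start,
	       (unsigned long long)em.block_start);
	return 0;
}
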
3833 | |||
3834 | static noinline int uncompress_inline(struct btrfs_path *path, | ||
3835 | struct inode *inode, struct page *page, | ||
3836 | size_t pg_offset, u64 extent_offset, | ||
3837 | struct btrfs_file_extent_item *item) | ||
3838 | { | ||
3839 | int ret; | ||
3840 | struct extent_buffer *leaf = path->nodes[0]; | ||
3841 | char *tmp; | ||
3842 | size_t max_size; | ||
3843 | unsigned long inline_size; | ||
3844 | unsigned long ptr; | ||
3845 | |||
3846 | WARN_ON(pg_offset != 0); | ||
3847 | max_size = btrfs_file_extent_ram_bytes(leaf, item); | ||
3848 | inline_size = btrfs_file_extent_inline_item_len(leaf, | ||
3849 | btrfs_item_nr(leaf, path->slots[0])); | ||
3850 | tmp = kmalloc(inline_size, GFP_NOFS); | ||
3851 | ptr = btrfs_file_extent_inline_start(item); | ||
3852 | |||
3853 | read_extent_buffer(leaf, tmp, ptr, inline_size); | ||
3854 | |||
3855 | max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); | ||
3856 | ret = btrfs_zlib_decompress(tmp, page, extent_offset, | ||
3857 | inline_size, max_size); | ||
3858 | if (ret) { | ||
3859 | char *kaddr = kmap_atomic(page, KM_USER0); | ||
3860 | unsigned long copy_size = min_t(u64, | ||
3861 | PAGE_CACHE_SIZE - pg_offset, | ||
3862 | max_size - extent_offset); | ||
3863 | memset(kaddr + pg_offset, 0, copy_size); | ||
3864 | kunmap_atomic(kaddr, KM_USER0); | ||
3865 | } | ||
3866 | kfree(tmp); | ||
3867 | return 0; | ||
3868 | } | ||
3869 | |||
3870 | /* | ||
3871 | * a bit scary, this does extent mapping from logical file offset to the disk. | ||
3872 | * the ugly parts come from merging extents from the disk with the in-ram | ||
3873 | * representation. This gets more complex because of the data=ordered code, | ||
3874 | * where the in-ram extents might be locked pending data=ordered completion. | ||
3875 | * | ||
3876 | * This also copies inline extents directly into the page. | ||
3877 | */ | ||
3878 | |||
3879 | struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, | ||
3880 | size_t pg_offset, u64 start, u64 len, | ||
3881 | int create) | ||
3882 | { | ||
3883 | int ret; | ||
3884 | int err = 0; | ||
3885 | u64 bytenr; | ||
3886 | u64 extent_start = 0; | ||
3887 | u64 extent_end = 0; | ||
3888 | u64 objectid = inode->i_ino; | ||
3889 | u32 found_type; | ||
3890 | struct btrfs_path *path = NULL; | ||
3891 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3892 | struct btrfs_file_extent_item *item; | ||
3893 | struct extent_buffer *leaf; | ||
3894 | struct btrfs_key found_key; | ||
3895 | struct extent_map *em = NULL; | ||
3896 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
3897 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
3898 | struct btrfs_trans_handle *trans = NULL; | ||
3899 | int compressed; | ||
3900 | |||
3901 | again: | ||
3902 | spin_lock(&em_tree->lock); | ||
3903 | em = lookup_extent_mapping(em_tree, start, len); | ||
3904 | if (em) | ||
3905 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
3906 | spin_unlock(&em_tree->lock); | ||
3907 | |||
3908 | if (em) { | ||
3909 | if (em->start > start || em->start + em->len <= start) | ||
3910 | free_extent_map(em); | ||
3911 | else if (em->block_start == EXTENT_MAP_INLINE && page) | ||
3912 | free_extent_map(em); | ||
3913 | else | ||
3914 | goto out; | ||
3915 | } | ||
3916 | em = alloc_extent_map(GFP_NOFS); | ||
3917 | if (!em) { | ||
3918 | err = -ENOMEM; | ||
3919 | goto out; | ||
3920 | } | ||
3921 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
3922 | em->start = EXTENT_MAP_HOLE; | ||
3923 | em->orig_start = EXTENT_MAP_HOLE; | ||
3924 | em->len = (u64)-1; | ||
3925 | em->block_len = (u64)-1; | ||
3926 | |||
3927 | if (!path) { | ||
3928 | path = btrfs_alloc_path(); | ||
3929 | BUG_ON(!path); | ||
3930 | } | ||
3931 | |||
3932 | ret = btrfs_lookup_file_extent(trans, root, path, | ||
3933 | objectid, start, trans != NULL); | ||
3934 | if (ret < 0) { | ||
3935 | err = ret; | ||
3936 | goto out; | ||
3937 | } | ||
3938 | |||
3939 | if (ret != 0) { | ||
3940 | if (path->slots[0] == 0) | ||
3941 | goto not_found; | ||
3942 | path->slots[0]--; | ||
3943 | } | ||
3944 | |||
3945 | leaf = path->nodes[0]; | ||
3946 | item = btrfs_item_ptr(leaf, path->slots[0], | ||
3947 | struct btrfs_file_extent_item); | ||
3948 | /* are we inside the extent that was found? */ | ||
3949 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
3950 | found_type = btrfs_key_type(&found_key); | ||
3951 | if (found_key.objectid != objectid || | ||
3952 | found_type != BTRFS_EXTENT_DATA_KEY) { | ||
3953 | goto not_found; | ||
3954 | } | ||
3955 | |||
3956 | found_type = btrfs_file_extent_type(leaf, item); | ||
3957 | extent_start = found_key.offset; | ||
3958 | compressed = btrfs_file_extent_compression(leaf, item); | ||
3959 | if (found_type == BTRFS_FILE_EXTENT_REG || | ||
3960 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
3961 | extent_end = extent_start + | ||
3962 | btrfs_file_extent_num_bytes(leaf, item); | ||
3963 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | ||
3964 | size_t size; | ||
3965 | size = btrfs_file_extent_inline_len(leaf, item); | ||
3966 | extent_end = (extent_start + size + root->sectorsize - 1) & | ||
3967 | ~((u64)root->sectorsize - 1); | ||
3968 | } | ||
3969 | |||
3970 | if (start >= extent_end) { | ||
3971 | path->slots[0]++; | ||
3972 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
3973 | ret = btrfs_next_leaf(root, path); | ||
3974 | if (ret < 0) { | ||
3975 | err = ret; | ||
3976 | goto out; | ||
3977 | } | ||
3978 | if (ret > 0) | ||
3979 | goto not_found; | ||
3980 | leaf = path->nodes[0]; | ||
3981 | } | ||
3982 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
3983 | if (found_key.objectid != objectid || | ||
3984 | found_key.type != BTRFS_EXTENT_DATA_KEY) | ||
3985 | goto not_found; | ||
3986 | if (start + len <= found_key.offset) | ||
3987 | goto not_found; | ||
3988 | em->start = start; | ||
3989 | em->len = found_key.offset - start; | ||
3990 | goto not_found_em; | ||
3991 | } | ||
3992 | |||
3993 | if (found_type == BTRFS_FILE_EXTENT_REG || | ||
3994 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
3995 | em->start = extent_start; | ||
3996 | em->len = extent_end - extent_start; | ||
3997 | em->orig_start = extent_start - | ||
3998 | btrfs_file_extent_offset(leaf, item); | ||
3999 | bytenr = btrfs_file_extent_disk_bytenr(leaf, item); | ||
4000 | if (bytenr == 0) { | ||
4001 | em->block_start = EXTENT_MAP_HOLE; | ||
4002 | goto insert; | ||
4003 | } | ||
4004 | if (compressed) { | ||
4005 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | ||
4006 | em->block_start = bytenr; | ||
4007 | em->block_len = btrfs_file_extent_disk_num_bytes(leaf, | ||
4008 | item); | ||
4009 | } else { | ||
4010 | bytenr += btrfs_file_extent_offset(leaf, item); | ||
4011 | em->block_start = bytenr; | ||
4012 | em->block_len = em->len; | ||
4013 | if (found_type == BTRFS_FILE_EXTENT_PREALLOC) | ||
4014 | set_bit(EXTENT_FLAG_PREALLOC, &em->flags); | ||
4015 | } | ||
4016 | goto insert; | ||
4017 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | ||
4018 | unsigned long ptr; | ||
4019 | char *map; | ||
4020 | size_t size; | ||
4021 | size_t extent_offset; | ||
4022 | size_t copy_size; | ||
4023 | |||
4024 | em->block_start = EXTENT_MAP_INLINE; | ||
4025 | if (!page || create) { | ||
4026 | em->start = extent_start; | ||
4027 | em->len = extent_end - extent_start; | ||
4028 | goto out; | ||
4029 | } | ||
4030 | |||
4031 | size = btrfs_file_extent_inline_len(leaf, item); | ||
4032 | extent_offset = page_offset(page) + pg_offset - extent_start; | ||
4033 | copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, | ||
4034 | size - extent_offset); | ||
4035 | em->start = extent_start + extent_offset; | ||
4036 | em->len = (copy_size + root->sectorsize - 1) & | ||
4037 | ~((u64)root->sectorsize - 1); | ||
4038 | em->orig_start = EXTENT_MAP_INLINE; | ||
4039 | if (compressed) | ||
4040 | set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | ||
4041 | ptr = btrfs_file_extent_inline_start(item) + extent_offset; | ||
4042 | if (create == 0 && !PageUptodate(page)) { | ||
4043 | if (btrfs_file_extent_compression(leaf, item) == | ||
4044 | BTRFS_COMPRESS_ZLIB) { | ||
4045 | ret = uncompress_inline(path, inode, page, | ||
4046 | pg_offset, | ||
4047 | extent_offset, item); | ||
4048 | BUG_ON(ret); | ||
4049 | } else { | ||
4050 | map = kmap(page); | ||
4051 | read_extent_buffer(leaf, map + pg_offset, ptr, | ||
4052 | copy_size); | ||
4053 | kunmap(page); | ||
4054 | } | ||
4055 | flush_dcache_page(page); | ||
4056 | } else if (create && PageUptodate(page)) { | ||
4057 | if (!trans) { | ||
4058 | kunmap(page); | ||
4059 | free_extent_map(em); | ||
4060 | em = NULL; | ||
4061 | btrfs_release_path(root, path); | ||
4062 | trans = btrfs_join_transaction(root, 1); | ||
4063 | goto again; | ||
4064 | } | ||
4065 | map = kmap(page); | ||
4066 | write_extent_buffer(leaf, map + pg_offset, ptr, | ||
4067 | copy_size); | ||
4068 | kunmap(page); | ||
4069 | btrfs_mark_buffer_dirty(leaf); | ||
4070 | } | ||
4071 | set_extent_uptodate(io_tree, em->start, | ||
4072 | extent_map_end(em) - 1, GFP_NOFS); | ||
4073 | goto insert; | ||
4074 | } else { | ||
4075 | printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); | ||
4076 | WARN_ON(1); | ||
4077 | } | ||
4078 | not_found: | ||
4079 | em->start = start; | ||
4080 | em->len = len; | ||
4081 | not_found_em: | ||
4082 | em->block_start = EXTENT_MAP_HOLE; | ||
4083 | set_bit(EXTENT_FLAG_VACANCY, &em->flags); | ||
4084 | insert: | ||
4085 | btrfs_release_path(root, path); | ||
4086 | if (em->start > start || extent_map_end(em) <= start) { | ||
4087 | printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " | ||
4088 | "[%llu %llu]\n", (unsigned long long)em->start, | ||
4089 | (unsigned long long)em->len, | ||
4090 | (unsigned long long)start, | ||
4091 | (unsigned long long)len); | ||
4092 | err = -EIO; | ||
4093 | goto out; | ||
4094 | } | ||
4095 | |||
4096 | err = 0; | ||
4097 | spin_lock(&em_tree->lock); | ||
4098 | ret = add_extent_mapping(em_tree, em); | ||
4099 | /* it is possible that someone inserted the extent into the tree | ||
4100 | * while we had the lock dropped. It is also possible that | ||
4101 | * an overlapping map exists in the tree | ||
4102 | */ | ||
4103 | if (ret == -EEXIST) { | ||
4104 | struct extent_map *existing; | ||
4105 | |||
4106 | ret = 0; | ||
4107 | |||
4108 | existing = lookup_extent_mapping(em_tree, start, len); | ||
4109 | if (existing && (existing->start > start || | ||
4110 | existing->start + existing->len <= start)) { | ||
4111 | free_extent_map(existing); | ||
4112 | existing = NULL; | ||
4113 | } | ||
4114 | if (!existing) { | ||
4115 | existing = lookup_extent_mapping(em_tree, em->start, | ||
4116 | em->len); | ||
4117 | if (existing) { | ||
4118 | err = merge_extent_mapping(em_tree, existing, | ||
4119 | em, start, | ||
4120 | root->sectorsize); | ||
4121 | free_extent_map(existing); | ||
4122 | if (err) { | ||
4123 | free_extent_map(em); | ||
4124 | em = NULL; | ||
4125 | } | ||
4126 | } else { | ||
4127 | err = -EIO; | ||
4128 | free_extent_map(em); | ||
4129 | em = NULL; | ||
4130 | } | ||
4131 | } else { | ||
4132 | free_extent_map(em); | ||
4133 | em = existing; | ||
4134 | err = 0; | ||
4135 | } | ||
4136 | } | ||
4137 | spin_unlock(&em_tree->lock); | ||
4138 | out: | ||
4139 | if (path) | ||
4140 | btrfs_free_path(path); | ||
4141 | if (trans) { | ||
4142 | ret = btrfs_end_transaction(trans, root); | ||
4143 | if (!err) | ||
4144 | err = ret; | ||
4145 | } | ||
4146 | if (err) { | ||
4147 | free_extent_map(em); | ||
4148 | WARN_ON(1); | ||
4149 | return ERR_PTR(err); | ||
4150 | } | ||
4151 | return em; | ||
4152 | } | ||
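
btrfs_get_extent rounds inline extent ends up to a sector boundary with `(n + sectorsize - 1) & ~(sectorsize - 1)`, which is only valid when the sector size is a power of two. A stand-alone sketch of that round-up, with illustrative values:

#include <stdint.h>
#include <stdio.h>

static uint64_t round_up_pow2(uint64_t n, uint64_t sectorsize)
{
	/* add sectorsize - 1, then clear the low bits */
	return (n + sectorsize - 1) & ~(sectorsize - 1);
}

int main(void)
{
	printf("%llu\n", (unsigned long long)round_up_pow2(5000, 4096)); /* 8192 */
	printf("%llu\n", (unsigned long long)round_up_pow2(8192, 4096)); /* 8192 */
	return 0;
}
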
4153 | |||
4154 | static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, | ||
4155 | const struct iovec *iov, loff_t offset, | ||
4156 | unsigned long nr_segs) | ||
4157 | { | ||
4158 | return -EINVAL; | ||
4159 | } | ||
4160 | |||
4161 | static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) | ||
4162 | { | ||
4163 | return extent_bmap(mapping, iblock, btrfs_get_extent); | ||
4164 | } | ||
4165 | |||
4166 | int btrfs_readpage(struct file *file, struct page *page) | ||
4167 | { | ||
4168 | struct extent_io_tree *tree; | ||
4169 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
4170 | return extent_read_full_page(tree, page, btrfs_get_extent); | ||
4171 | } | ||
4172 | |||
4173 | static int btrfs_writepage(struct page *page, struct writeback_control *wbc) | ||
4174 | { | ||
4175 | struct extent_io_tree *tree; | ||
4176 | |||
4177 | |||
4178 | if (current->flags & PF_MEMALLOC) { | ||
4179 | redirty_page_for_writepage(wbc, page); | ||
4180 | unlock_page(page); | ||
4181 | return 0; | ||
4182 | } | ||
4183 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
4184 | return extent_write_full_page(tree, page, btrfs_get_extent, wbc); | ||
4185 | } | ||
4186 | |||
4187 | int btrfs_writepages(struct address_space *mapping, | ||
4188 | struct writeback_control *wbc) | ||
4189 | { | ||
4190 | struct extent_io_tree *tree; | ||
4191 | |||
4192 | tree = &BTRFS_I(mapping->host)->io_tree; | ||
4193 | return extent_writepages(tree, mapping, btrfs_get_extent, wbc); | ||
4194 | } | ||
4195 | |||
4196 | static int | ||
4197 | btrfs_readpages(struct file *file, struct address_space *mapping, | ||
4198 | struct list_head *pages, unsigned nr_pages) | ||
4199 | { | ||
4200 | struct extent_io_tree *tree; | ||
4201 | tree = &BTRFS_I(mapping->host)->io_tree; | ||
4202 | return extent_readpages(tree, mapping, pages, nr_pages, | ||
4203 | btrfs_get_extent); | ||
4204 | } | ||
4205 | static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) | ||
4206 | { | ||
4207 | struct extent_io_tree *tree; | ||
4208 | struct extent_map_tree *map; | ||
4209 | int ret; | ||
4210 | |||
4211 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
4212 | map = &BTRFS_I(page->mapping->host)->extent_tree; | ||
4213 | ret = try_release_extent_mapping(map, tree, page, gfp_flags); | ||
4214 | if (ret == 1) { | ||
4215 | ClearPagePrivate(page); | ||
4216 | set_page_private(page, 0); | ||
4217 | page_cache_release(page); | ||
4218 | } | ||
4219 | return ret; | ||
4220 | } | ||
4221 | |||
4222 | static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) | ||
4223 | { | ||
4224 | if (PageWriteback(page) || PageDirty(page)) | ||
4225 | return 0; | ||
4226 | return __btrfs_releasepage(page, gfp_flags); | ||
4227 | } | ||
4228 | |||
4229 | static void btrfs_invalidatepage(struct page *page, unsigned long offset) | ||
4230 | { | ||
4231 | struct extent_io_tree *tree; | ||
4232 | struct btrfs_ordered_extent *ordered; | ||
4233 | u64 page_start = page_offset(page); | ||
4234 | u64 page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
4235 | |||
4236 | wait_on_page_writeback(page); | ||
4237 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
4238 | if (offset) { | ||
4239 | btrfs_releasepage(page, GFP_NOFS); | ||
4240 | return; | ||
4241 | } | ||
4242 | |||
4243 | lock_extent(tree, page_start, page_end, GFP_NOFS); | ||
4244 | ordered = btrfs_lookup_ordered_extent(page->mapping->host, | ||
4245 | page_offset(page)); | ||
4246 | if (ordered) { | ||
4247 | /* | ||
4248 | * IO on this page will never be started, so we need | ||
4249 | * to account for any ordered extents now | ||
4250 | */ | ||
4251 | clear_extent_bit(tree, page_start, page_end, | ||
4252 | EXTENT_DIRTY | EXTENT_DELALLOC | | ||
4253 | EXTENT_LOCKED, 1, 0, GFP_NOFS); | ||
4254 | btrfs_finish_ordered_io(page->mapping->host, | ||
4255 | page_start, page_end); | ||
4256 | btrfs_put_ordered_extent(ordered); | ||
4257 | lock_extent(tree, page_start, page_end, GFP_NOFS); | ||
4258 | } | ||
4259 | clear_extent_bit(tree, page_start, page_end, | ||
4260 | EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | | ||
4261 | EXTENT_ORDERED, | ||
4262 | 1, 1, GFP_NOFS); | ||
4263 | __btrfs_releasepage(page, GFP_NOFS); | ||
4264 | |||
4265 | ClearPageChecked(page); | ||
4266 | if (PagePrivate(page)) { | ||
4267 | ClearPagePrivate(page); | ||
4268 | set_page_private(page, 0); | ||
4269 | page_cache_release(page); | ||
4270 | } | ||
4271 | } | ||
4272 | |||
4273 | /* | ||
4274 | * btrfs_page_mkwrite() is not allowed to change the file size as it gets | ||
4275 | * called from a page fault handler when a page is first dirtied. Hence we must | ||
4276 | * be careful to check for EOF conditions here. We set the page up correctly | ||
4277 | * for a written page which means we get ENOSPC checking when writing into | ||
4278 | * holes and correct delalloc and unwritten extent mapping on filesystems that | ||
4279 | * support these features. | ||
4280 | * | ||
4281 | * We are not allowed to take the i_mutex here so we have to play games to | ||
4282 | * protect against truncate races as the page could now be beyond EOF. Because | ||
4283 | * vmtruncate() writes the inode size before removing pages, once we have the | ||
4284 | * page lock we can determine safely if the page is beyond EOF. If it is not | ||
4285 | * beyond EOF, then the page is guaranteed safe against truncation until we | ||
4286 | * unlock the page. | ||
4287 | */ | ||
4288 | int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) | ||
4289 | { | ||
4290 | struct inode *inode = fdentry(vma->vm_file)->d_inode; | ||
4291 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
4292 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
4293 | struct btrfs_ordered_extent *ordered; | ||
4294 | char *kaddr; | ||
4295 | unsigned long zero_start; | ||
4296 | loff_t size; | ||
4297 | int ret; | ||
4298 | u64 page_start; | ||
4299 | u64 page_end; | ||
4300 | |||
4301 | ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); | ||
4302 | if (ret) | ||
4303 | goto out; | ||
4304 | |||
4305 | ret = -EINVAL; | ||
4306 | again: | ||
4307 | lock_page(page); | ||
4308 | size = i_size_read(inode); | ||
4309 | page_start = page_offset(page); | ||
4310 | page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
4311 | |||
4312 | if ((page->mapping != inode->i_mapping) || | ||
4313 | (page_start >= size)) { | ||
4314 | /* page got truncated out from underneath us */ | ||
4315 | goto out_unlock; | ||
4316 | } | ||
4317 | wait_on_page_writeback(page); | ||
4318 | |||
4319 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
4320 | set_page_extent_mapped(page); | ||
4321 | |||
4322 | /* | ||
4323 | * we can't set the delalloc bits if there are pending ordered | ||
4324 | * extents. Drop our locks and wait for them to finish | ||
4325 | */ | ||
4326 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | ||
4327 | if (ordered) { | ||
4328 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
4329 | unlock_page(page); | ||
4330 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
4331 | btrfs_put_ordered_extent(ordered); | ||
4332 | goto again; | ||
4333 | } | ||
4334 | |||
4335 | btrfs_set_extent_delalloc(inode, page_start, page_end); | ||
4336 | ret = 0; | ||
4337 | |||
4338 | /* page is wholly or partially inside EOF */ | ||
4339 | if (page_start + PAGE_CACHE_SIZE > size) | ||
4340 | zero_start = size & ~PAGE_CACHE_MASK; | ||
4341 | else | ||
4342 | zero_start = PAGE_CACHE_SIZE; | ||
4343 | |||
4344 | if (zero_start != PAGE_CACHE_SIZE) { | ||
4345 | kaddr = kmap(page); | ||
4346 | memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); | ||
4347 | flush_dcache_page(page); | ||
4348 | kunmap(page); | ||
4349 | } | ||
4350 | ClearPageChecked(page); | ||
4351 | set_page_dirty(page); | ||
4352 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
4353 | |||
4354 | out_unlock: | ||
4355 | unlock_page(page); | ||
4356 | out: | ||
4357 | return ret; | ||
4358 | } | ||
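
The zero_start computation above decides how much of a page that straddles EOF must be zeroed: `size & ~PAGE_CACHE_MASK` yields the in-page offset of EOF. A user-space sketch, with PAGE_SZ standing in for PAGE_CACHE_SIZE:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096ULL

static uint64_t zero_start_for(uint64_t page_start, uint64_t size)
{
	if (page_start + PAGE_SZ > size)
		return size & (PAGE_SZ - 1);	/* size & ~PAGE_CACHE_MASK */
	return PAGE_SZ;				/* page wholly inside EOF */
}

int main(void)
{
	/* file size 10000: the page at 8192 keeps 1808 bytes, rest zeroed */
	printf("%llu\n", (unsigned long long)zero_start_for(8192, 10000)); /* 1808 */
	printf("%llu\n", (unsigned long long)zero_start_for(0, 10000));    /* 4096 */
	return 0;
}
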
4359 | |||
4360 | static void btrfs_truncate(struct inode *inode) | ||
4361 | { | ||
4362 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
4363 | int ret; | ||
4364 | struct btrfs_trans_handle *trans; | ||
4365 | unsigned long nr; | ||
4366 | u64 mask = root->sectorsize - 1; | ||
4367 | |||
4368 | if (!S_ISREG(inode->i_mode)) | ||
4369 | return; | ||
4370 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
4371 | return; | ||
4372 | |||
4373 | btrfs_truncate_page(inode->i_mapping, inode->i_size); | ||
4374 | btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); | ||
4375 | |||
4376 | trans = btrfs_start_transaction(root, 1); | ||
4377 | btrfs_set_trans_block_group(trans, inode); | ||
4378 | btrfs_i_size_write(inode, inode->i_size); | ||
4379 | |||
4380 | ret = btrfs_orphan_add(trans, inode); | ||
4381 | if (ret) | ||
4382 | goto out; | ||
4383 | /* FIXME, add redo link to tree so we don't leak on crash */ | ||
4384 | ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, | ||
4385 | BTRFS_EXTENT_DATA_KEY); | ||
4386 | btrfs_update_inode(trans, root, inode); | ||
4387 | |||
4388 | ret = btrfs_orphan_del(trans, inode); | ||
4389 | BUG_ON(ret); | ||
4390 | |||
4391 | out: | ||
4392 | nr = trans->blocks_used; | ||
4393 | ret = btrfs_end_transaction_throttle(trans, root); | ||
4394 | BUG_ON(ret); | ||
4395 | btrfs_btree_balance_dirty(root, nr); | ||
4396 | } | ||
4397 | |||
4398 | /* | ||
4399 | * create a new subvolume directory/inode (helper for the ioctl). | ||
4400 | */ | ||
4401 | int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, | ||
4402 | struct btrfs_root *new_root, struct dentry *dentry, | ||
4403 | u64 new_dirid, u64 alloc_hint) | ||
4404 | { | ||
4405 | struct inode *inode; | ||
4406 | int error; | ||
4407 | u64 index = 0; | ||
4408 | |||
4409 | inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, | ||
4410 | new_dirid, alloc_hint, S_IFDIR | 0700, &index); | ||
4411 | if (IS_ERR(inode)) | ||
4412 | return PTR_ERR(inode); | ||
4413 | inode->i_op = &btrfs_dir_inode_operations; | ||
4414 | inode->i_fop = &btrfs_dir_file_operations; | ||
4415 | |||
4416 | inode->i_nlink = 1; | ||
4417 | btrfs_i_size_write(inode, 0); | ||
4418 | |||
4419 | error = btrfs_update_inode(trans, new_root, inode); | ||
4420 | if (error) | ||
4421 | return error; | ||
4422 | |||
4423 | d_instantiate(dentry, inode); | ||
4424 | return 0; | ||
4425 | } | ||
4426 | |||
4427 | /* helper function for file defrag and space balancing. This | ||
4428 | * forces readahead on a given range of bytes in an inode | ||
4429 | */ | ||
4430 | unsigned long btrfs_force_ra(struct address_space *mapping, | ||
4431 | struct file_ra_state *ra, struct file *file, | ||
4432 | pgoff_t offset, pgoff_t last_index) | ||
4433 | { | ||
4434 | pgoff_t req_size = last_index - offset + 1; | ||
4435 | |||
4436 | page_cache_sync_readahead(mapping, ra, file, offset, req_size); | ||
4437 | return offset + req_size; | ||
4438 | } | ||
4439 | |||
4440 | struct inode *btrfs_alloc_inode(struct super_block *sb) | ||
4441 | { | ||
4442 | struct btrfs_inode *ei; | ||
4443 | |||
4444 | ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); | ||
4445 | if (!ei) | ||
4446 | return NULL; | ||
4447 | ei->last_trans = 0; | ||
4448 | ei->logged_trans = 0; | ||
4449 | btrfs_ordered_inode_tree_init(&ei->ordered_tree); | ||
4450 | ei->i_acl = BTRFS_ACL_NOT_CACHED; | ||
4451 | ei->i_default_acl = BTRFS_ACL_NOT_CACHED; | ||
4452 | INIT_LIST_HEAD(&ei->i_orphan); | ||
4453 | return &ei->vfs_inode; | ||
4454 | } | ||
4455 | |||
4456 | void btrfs_destroy_inode(struct inode *inode) | ||
4457 | { | ||
4458 | struct btrfs_ordered_extent *ordered; | ||
4459 | WARN_ON(!list_empty(&inode->i_dentry)); | ||
4460 | WARN_ON(inode->i_data.nrpages); | ||
4461 | |||
4462 | if (BTRFS_I(inode)->i_acl && | ||
4463 | BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED) | ||
4464 | posix_acl_release(BTRFS_I(inode)->i_acl); | ||
4465 | if (BTRFS_I(inode)->i_default_acl && | ||
4466 | BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) | ||
4467 | posix_acl_release(BTRFS_I(inode)->i_default_acl); | ||
4468 | |||
4469 | spin_lock(&BTRFS_I(inode)->root->list_lock); | ||
4470 | if (!list_empty(&BTRFS_I(inode)->i_orphan)) { | ||
4471 | printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" | ||
4472 | " list\n", inode->i_ino); | ||
4473 | dump_stack(); | ||
4474 | } | ||
4475 | spin_unlock(&BTRFS_I(inode)->root->list_lock); | ||
4476 | |||
4477 | while (1) { | ||
4478 | ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); | ||
4479 | if (!ordered) | ||
4480 | break; | ||
4481 | else { | ||
4482 | printk(KERN_ERR "btrfs found ordered " | ||
4483 | "extent %llu %llu on inode cleanup\n", | ||
4484 | (unsigned long long)ordered->file_offset, | ||
4485 | (unsigned long long)ordered->len); | ||
4486 | btrfs_remove_ordered_extent(inode, ordered); | ||
4487 | btrfs_put_ordered_extent(ordered); | ||
4488 | btrfs_put_ordered_extent(ordered); | ||
4489 | } | ||
4490 | } | ||
4491 | btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); | ||
4492 | kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); | ||
4493 | } | ||
4494 | |||
4495 | static void init_once(void *foo) | ||
4496 | { | ||
4497 | struct btrfs_inode *ei = (struct btrfs_inode *) foo; | ||
4498 | |||
4499 | inode_init_once(&ei->vfs_inode); | ||
4500 | } | ||
4501 | |||
4502 | void btrfs_destroy_cachep(void) | ||
4503 | { | ||
4504 | if (btrfs_inode_cachep) | ||
4505 | kmem_cache_destroy(btrfs_inode_cachep); | ||
4506 | if (btrfs_trans_handle_cachep) | ||
4507 | kmem_cache_destroy(btrfs_trans_handle_cachep); | ||
4508 | if (btrfs_transaction_cachep) | ||
4509 | kmem_cache_destroy(btrfs_transaction_cachep); | ||
4510 | if (btrfs_bit_radix_cachep) | ||
4511 | kmem_cache_destroy(btrfs_bit_radix_cachep); | ||
4512 | if (btrfs_path_cachep) | ||
4513 | kmem_cache_destroy(btrfs_path_cachep); | ||
4514 | } | ||
4515 | |||
4516 | struct kmem_cache *btrfs_cache_create(const char *name, size_t size, | ||
4517 | unsigned long extra_flags, | ||
4518 | void (*ctor)(void *)) | ||
4519 | { | ||
4520 | return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | | ||
4521 | SLAB_MEM_SPREAD | extra_flags), ctor); | ||
4522 | } | ||
4523 | |||
4524 | int btrfs_init_cachep(void) | ||
4525 | { | ||
4526 | btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", | ||
4527 | sizeof(struct btrfs_inode), | ||
4528 | 0, init_once); | ||
4529 | if (!btrfs_inode_cachep) | ||
4530 | goto fail; | ||
4531 | btrfs_trans_handle_cachep = | ||
4532 | btrfs_cache_create("btrfs_trans_handle_cache", | ||
4533 | sizeof(struct btrfs_trans_handle), | ||
4534 | 0, NULL); | ||
4535 | if (!btrfs_trans_handle_cachep) | ||
4536 | goto fail; | ||
4537 | btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", | ||
4538 | sizeof(struct btrfs_transaction), | ||
4539 | 0, NULL); | ||
4540 | if (!btrfs_transaction_cachep) | ||
4541 | goto fail; | ||
4542 | btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", | ||
4543 | sizeof(struct btrfs_path), | ||
4544 | 0, NULL); | ||
4545 | if (!btrfs_path_cachep) | ||
4546 | goto fail; | ||
4547 | btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, | ||
4548 | SLAB_DESTROY_BY_RCU, NULL); | ||
4549 | if (!btrfs_bit_radix_cachep) | ||
4550 | goto fail; | ||
4551 | return 0; | ||
4552 | fail: | ||
4553 | btrfs_destroy_cachep(); | ||
4554 | return -ENOMEM; | ||
4555 | } | ||
4556 | |||
4557 | static int btrfs_getattr(struct vfsmount *mnt, | ||
4558 | struct dentry *dentry, struct kstat *stat) | ||
4559 | { | ||
4560 | struct inode *inode = dentry->d_inode; | ||
4561 | generic_fillattr(inode, stat); | ||
4562 | stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; | ||
4563 | stat->blksize = PAGE_CACHE_SIZE; | ||
4564 | stat->blocks = (inode_get_bytes(inode) + | ||
4565 | BTRFS_I(inode)->delalloc_bytes) >> 9; | ||
4566 | return 0; | ||
4567 | } | ||
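
btrfs_getattr reports blocks in 512-byte units (hence the shift by 9), and it adds the not-yet-allocated delalloc bytes so tools like du stay honest while data is still in flight. The arithmetic, with made-up byte counts:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t on_disk = 12288;	/* what inode_get_bytes() would report */
	uint64_t delalloc = 4096;	/* dirty, not yet allocated */

	/* 16384 bytes -> 32 sectors of 512 bytes */
	printf("st_blocks = %llu\n",
	       (unsigned long long)((on_disk + delalloc) >> 9));
	return 0;
}
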
4568 | |||
4569 | static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
4570 | struct inode *new_dir, struct dentry *new_dentry) | ||
4571 | { | ||
4572 | struct btrfs_trans_handle *trans; | ||
4573 | struct btrfs_root *root = BTRFS_I(old_dir)->root; | ||
4574 | struct inode *new_inode = new_dentry->d_inode; | ||
4575 | struct inode *old_inode = old_dentry->d_inode; | ||
4576 | struct timespec ctime = CURRENT_TIME; | ||
4577 | u64 index = 0; | ||
4578 | int ret; | ||
4579 | |||
4580 | /* we're not allowed to rename between subvolumes */ | ||
4581 | if (BTRFS_I(old_inode)->root->root_key.objectid != | ||
4582 | BTRFS_I(new_dir)->root->root_key.objectid) | ||
4583 | return -EXDEV; | ||
4584 | |||
4585 | if (S_ISDIR(old_inode->i_mode) && new_inode && | ||
4586 | new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { | ||
4587 | return -ENOTEMPTY; | ||
4588 | } | ||
4589 | |||
4590 | /* to rename a snapshot or subvolume, we need to juggle the | ||
4591 | * backrefs. This isn't coded yet | ||
4592 | */ | ||
4593 | if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) | ||
4594 | return -EXDEV; | ||
4595 | |||
4596 | ret = btrfs_check_free_space(root, 1, 0); | ||
4597 | if (ret) | ||
4598 | goto out_unlock; | ||
4599 | |||
4600 | trans = btrfs_start_transaction(root, 1); | ||
4601 | |||
4602 | btrfs_set_trans_block_group(trans, new_dir); | ||
4603 | |||
4604 | btrfs_inc_nlink(old_dentry->d_inode); | ||
4605 | old_dir->i_ctime = old_dir->i_mtime = ctime; | ||
4606 | new_dir->i_ctime = new_dir->i_mtime = ctime; | ||
4607 | old_inode->i_ctime = ctime; | ||
4608 | |||
4609 | ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, | ||
4610 | old_dentry->d_name.name, | ||
4611 | old_dentry->d_name.len); | ||
4612 | if (ret) | ||
4613 | goto out_fail; | ||
4614 | |||
4615 | if (new_inode) { | ||
4616 | new_inode->i_ctime = CURRENT_TIME; | ||
4617 | ret = btrfs_unlink_inode(trans, root, new_dir, | ||
4618 | new_dentry->d_inode, | ||
4619 | new_dentry->d_name.name, | ||
4620 | new_dentry->d_name.len); | ||
4621 | if (ret) | ||
4622 | goto out_fail; | ||
4623 | if (new_inode->i_nlink == 0) { | ||
4624 | ret = btrfs_orphan_add(trans, new_dentry->d_inode); | ||
4625 | if (ret) | ||
4626 | goto out_fail; | ||
4627 | } | ||
4628 | |||
4629 | } | ||
4630 | ret = btrfs_set_inode_index(new_dir, &index); | ||
4631 | if (ret) | ||
4632 | goto out_fail; | ||
4633 | |||
4634 | ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, | ||
4635 | old_inode, new_dentry->d_name.name, | ||
4636 | new_dentry->d_name.len, 1, index); | ||
4637 | if (ret) | ||
4638 | goto out_fail; | ||
4639 | |||
4640 | out_fail: | ||
4641 | btrfs_end_transaction_throttle(trans, root); | ||
4642 | out_unlock: | ||
4643 | return ret; | ||
4644 | } | ||
4645 | |||
4646 | /* | ||
4647 | * some fairly slow code that needs optimization. This walks the list | ||
4648 | * of all the inodes with pending delalloc and forces them to disk. | ||
4649 | */ | ||
4650 | int btrfs_start_delalloc_inodes(struct btrfs_root *root) | ||
4651 | { | ||
4652 | struct list_head *head = &root->fs_info->delalloc_inodes; | ||
4653 | struct btrfs_inode *binode; | ||
4654 | struct inode *inode; | ||
4655 | |||
4656 | if (root->fs_info->sb->s_flags & MS_RDONLY) | ||
4657 | return -EROFS; | ||
4658 | |||
4659 | spin_lock(&root->fs_info->delalloc_lock); | ||
4660 | while (!list_empty(head)) { | ||
4661 | binode = list_entry(head->next, struct btrfs_inode, | ||
4662 | delalloc_inodes); | ||
4663 | inode = igrab(&binode->vfs_inode); | ||
4664 | if (!inode) | ||
4665 | list_del_init(&binode->delalloc_inodes); | ||
4666 | spin_unlock(&root->fs_info->delalloc_lock); | ||
4667 | if (inode) { | ||
4668 | filemap_flush(inode->i_mapping); | ||
4669 | iput(inode); | ||
4670 | } | ||
4671 | cond_resched(); | ||
4672 | spin_lock(&root->fs_info->delalloc_lock); | ||
4673 | } | ||
4674 | spin_unlock(&root->fs_info->delalloc_lock); | ||
4675 | |||
4676 | /* the filemap_flush will queue IO into the worker threads, but | ||
4677 | * we have to make sure the IO is actually started and that | ||
4678 | * ordered extents get created before we return | ||
4679 | */ | ||
4680 | atomic_inc(&root->fs_info->async_submit_draining); | ||
4681 | while (atomic_read(&root->fs_info->nr_async_submits) || | ||
4682 | atomic_read(&root->fs_info->async_delalloc_pages)) { | ||
4683 | wait_event(root->fs_info->async_submit_wait, | ||
4684 | (atomic_read(&root->fs_info->nr_async_submits) == 0 && | ||
4685 | atomic_read(&root->fs_info->async_delalloc_pages) == 0)); | ||
4686 | } | ||
4687 | atomic_dec(&root->fs_info->async_submit_draining); | ||
4688 | return 0; | ||
4689 | } | ||
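
The walk above uses the usual pop-one-entry-under-the-lock idiom: take the spinlock only long enough to detach the next inode, drop it around filemap_flush, then retake it for the next iteration. A pthread sketch of the same pattern over a toy list; names and the work function are illustrative, not kernel API.

/* build: cc -o drain drain.c -lpthread */
#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void drain_list(void (*work)(struct node *))
{
	pthread_mutex_lock(&list_lock);
	while (head) {
		struct node *n = head;

		head = n->next;
		pthread_mutex_unlock(&list_lock);	/* do the work unlocked */
		work(n);
		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
}

static void flush_one(struct node *n)
{
	printf("flush inode %d\n", n->id);
}

int main(void)
{
	struct node b = { NULL, 2 };
	struct node a = { &b, 1 };

	head = &a;
	drain_list(flush_one);
	return 0;
}
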
4690 | |||
4691 | static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | ||
4692 | const char *symname) | ||
4693 | { | ||
4694 | struct btrfs_trans_handle *trans; | ||
4695 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
4696 | struct btrfs_path *path; | ||
4697 | struct btrfs_key key; | ||
4698 | struct inode *inode = NULL; | ||
4699 | int err; | ||
4700 | int drop_inode = 0; | ||
4701 | u64 objectid; | ||
4702 | u64 index = 0; | ||
4703 | int name_len; | ||
4704 | int datasize; | ||
4705 | unsigned long ptr; | ||
4706 | struct btrfs_file_extent_item *ei; | ||
4707 | struct extent_buffer *leaf; | ||
4708 | unsigned long nr = 0; | ||
4709 | |||
4710 | name_len = strlen(symname) + 1; | ||
4711 | if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) | ||
4712 | return -ENAMETOOLONG; | ||
4713 | |||
4714 | err = btrfs_check_free_space(root, 1, 0); | ||
4715 | if (err) | ||
4716 | goto out_fail; | ||
4717 | |||
4718 | trans = btrfs_start_transaction(root, 1); | ||
4719 | btrfs_set_trans_block_group(trans, dir); | ||
4720 | |||
4721 | err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); | ||
4722 | if (err) { | ||
4723 | err = -ENOSPC; | ||
4724 | goto out_unlock; | ||
4725 | } | ||
4726 | |||
4727 | inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, | ||
4728 | dentry->d_name.len, | ||
4729 | dentry->d_parent->d_inode->i_ino, objectid, | ||
4730 | BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, | ||
4731 | &index); | ||
4732 | err = PTR_ERR(inode); | ||
4733 | if (IS_ERR(inode)) | ||
4734 | goto out_unlock; | ||
4735 | |||
4736 | err = btrfs_init_acl(inode, dir); | ||
4737 | if (err) { | ||
4738 | drop_inode = 1; | ||
4739 | goto out_unlock; | ||
4740 | } | ||
4741 | |||
4742 | btrfs_set_trans_block_group(trans, inode); | ||
4743 | err = btrfs_add_nondir(trans, dentry, inode, 0, index); | ||
4744 | if (err) | ||
4745 | drop_inode = 1; | ||
4746 | else { | ||
4747 | inode->i_mapping->a_ops = &btrfs_aops; | ||
4748 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
4749 | inode->i_fop = &btrfs_file_operations; | ||
4750 | inode->i_op = &btrfs_file_inode_operations; | ||
4751 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | ||
4752 | } | ||
4753 | dir->i_sb->s_dirt = 1; | ||
4754 | btrfs_update_inode_block_group(trans, inode); | ||
4755 | btrfs_update_inode_block_group(trans, dir); | ||
4756 | if (drop_inode) | ||
4757 | goto out_unlock; | ||
4758 | |||
4759 | path = btrfs_alloc_path(); | ||
4760 | BUG_ON(!path); | ||
4761 | key.objectid = inode->i_ino; | ||
4762 | key.offset = 0; | ||
4763 | btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); | ||
4764 | datasize = btrfs_file_extent_calc_inline_size(name_len); | ||
4765 | err = btrfs_insert_empty_item(trans, root, path, &key, | ||
4766 | datasize); | ||
4767 | if (err) { | ||
4768 | drop_inode = 1; | ||
4769 | goto out_unlock; | ||
4770 | } | ||
4771 | leaf = path->nodes[0]; | ||
4772 | ei = btrfs_item_ptr(leaf, path->slots[0], | ||
4773 | struct btrfs_file_extent_item); | ||
4774 | btrfs_set_file_extent_generation(leaf, ei, trans->transid); | ||
4775 | btrfs_set_file_extent_type(leaf, ei, | ||
4776 | BTRFS_FILE_EXTENT_INLINE); | ||
4777 | btrfs_set_file_extent_encryption(leaf, ei, 0); | ||
4778 | btrfs_set_file_extent_compression(leaf, ei, 0); | ||
4779 | btrfs_set_file_extent_other_encoding(leaf, ei, 0); | ||
4780 | btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); | ||
4781 | |||
4782 | ptr = btrfs_file_extent_inline_start(ei); | ||
4783 | write_extent_buffer(leaf, symname, ptr, name_len); | ||
4784 | btrfs_mark_buffer_dirty(leaf); | ||
4785 | btrfs_free_path(path); | ||
4786 | |||
4787 | inode->i_op = &btrfs_symlink_inode_operations; | ||
4788 | inode->i_mapping->a_ops = &btrfs_symlink_aops; | ||
4789 | inode->i_mapping->backing_dev_info = &root->fs_info->bdi; | ||
4790 | inode_set_bytes(inode, name_len); | ||
4791 | btrfs_i_size_write(inode, name_len - 1); | ||
4792 | err = btrfs_update_inode(trans, root, inode); | ||
4793 | if (err) | ||
4794 | drop_inode = 1; | ||
4795 | |||
4796 | out_unlock: | ||
4797 | nr = trans->blocks_used; | ||
4798 | btrfs_end_transaction_throttle(trans, root); | ||
4799 | out_fail: | ||
4800 | if (drop_inode) { | ||
4801 | inode_dec_link_count(inode); | ||
4802 | iput(inode); | ||
4803 | } | ||
4804 | btrfs_btree_balance_dirty(root, nr); | ||
4805 | return err; | ||
4806 | } | ||
4807 | |||
4808 | static int prealloc_file_range(struct inode *inode, u64 start, u64 end, | ||
4809 | u64 alloc_hint, int mode) | ||
4810 | { | ||
4811 | struct btrfs_trans_handle *trans; | ||
4812 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
4813 | struct btrfs_key ins; | ||
4814 | u64 alloc_size; | ||
4815 | u64 cur_offset = start; | ||
4816 | u64 num_bytes = end - start; | ||
4817 | int ret = 0; | ||
4818 | |||
4819 | trans = btrfs_join_transaction(root, 1); | ||
4820 | BUG_ON(!trans); | ||
4821 | btrfs_set_trans_block_group(trans, inode); | ||
4822 | |||
4823 | while (num_bytes > 0) { | ||
4824 | alloc_size = min(num_bytes, root->fs_info->max_extent); | ||
4825 | ret = btrfs_reserve_extent(trans, root, alloc_size, | ||
4826 | root->sectorsize, 0, alloc_hint, | ||
4827 | (u64)-1, &ins, 1); | ||
4828 | if (ret) { | ||
4829 | WARN_ON(1); | ||
4830 | goto out; | ||
4831 | } | ||
4832 | ret = insert_reserved_file_extent(trans, inode, | ||
4833 | cur_offset, ins.objectid, | ||
4834 | ins.offset, ins.offset, | ||
4835 | ins.offset, 0, 0, 0, | ||
4836 | BTRFS_FILE_EXTENT_PREALLOC); | ||
4837 | BUG_ON(ret); | ||
4838 | num_bytes -= ins.offset; | ||
4839 | cur_offset += ins.offset; | ||
4840 | alloc_hint = ins.objectid + ins.offset; | ||
4841 | } | ||
4842 | out: | ||
4843 | if (cur_offset > start) { | ||
4844 | inode->i_ctime = CURRENT_TIME; | ||
4845 | btrfs_set_flag(inode, PREALLOC); | ||
4846 | if (!(mode & FALLOC_FL_KEEP_SIZE) && | ||
4847 | cur_offset > i_size_read(inode)) | ||
4848 | btrfs_i_size_write(inode, cur_offset); | ||
4849 | ret = btrfs_update_inode(trans, root, inode); | ||
4850 | BUG_ON(ret); | ||
4851 | } | ||
4852 | |||
4853 | btrfs_end_transaction(trans, root); | ||
4854 | return ret; | ||
4855 | } | ||
4856 | |||
4857 | static long btrfs_fallocate(struct inode *inode, int mode, | ||
4858 | loff_t offset, loff_t len) | ||
4859 | { | ||
4860 | u64 cur_offset; | ||
4861 | u64 last_byte; | ||
4862 | u64 alloc_start; | ||
4863 | u64 alloc_end; | ||
4864 | u64 alloc_hint = 0; | ||
4865 | u64 mask = BTRFS_I(inode)->root->sectorsize - 1; | ||
4866 | struct extent_map *em; | ||
4867 | int ret; | ||
4868 | |||
4869 | alloc_start = offset & ~mask; | ||
4870 | alloc_end = (offset + len + mask) & ~mask; | ||
4871 | |||
4872 | mutex_lock(&inode->i_mutex); | ||
4873 | if (alloc_start > inode->i_size) { | ||
4874 | ret = btrfs_cont_expand(inode, alloc_start); | ||
4875 | if (ret) | ||
4876 | goto out; | ||
4877 | } | ||
4878 | |||
4879 | while (1) { | ||
4880 | struct btrfs_ordered_extent *ordered; | ||
4881 | lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, | ||
4882 | alloc_end - 1, GFP_NOFS); | ||
4883 | ordered = btrfs_lookup_first_ordered_extent(inode, | ||
4884 | alloc_end - 1); | ||
4885 | if (ordered && | ||
4886 | ordered->file_offset + ordered->len > alloc_start && | ||
4887 | ordered->file_offset < alloc_end) { | ||
4888 | btrfs_put_ordered_extent(ordered); | ||
4889 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
4890 | alloc_start, alloc_end - 1, GFP_NOFS); | ||
4891 | btrfs_wait_ordered_range(inode, alloc_start, | ||
4892 | alloc_end - alloc_start); | ||
4893 | } else { | ||
4894 | if (ordered) | ||
4895 | btrfs_put_ordered_extent(ordered); | ||
4896 | break; | ||
4897 | } | ||
4898 | } | ||
4899 | |||
4900 | cur_offset = alloc_start; | ||
4901 | while (1) { | ||
4902 | em = btrfs_get_extent(inode, NULL, 0, cur_offset, | ||
4903 | alloc_end - cur_offset, 0); | ||
4904 | BUG_ON(IS_ERR(em) || !em); | ||
4905 | last_byte = min(extent_map_end(em), alloc_end); | ||
4906 | last_byte = (last_byte + mask) & ~mask; | ||
4907 | if (em->block_start == EXTENT_MAP_HOLE) { | ||
4908 | ret = prealloc_file_range(inode, cur_offset, | ||
4909 | last_byte, alloc_hint, mode); | ||
4910 | if (ret < 0) { | ||
4911 | free_extent_map(em); | ||
4912 | break; | ||
4913 | } | ||
4914 | } | ||
4915 | if (em->block_start <= EXTENT_MAP_LAST_BYTE) | ||
4916 | alloc_hint = em->block_start; | ||
4917 | free_extent_map(em); | ||
4918 | |||
4919 | cur_offset = last_byte; | ||
4920 | if (cur_offset >= alloc_end) { | ||
4921 | ret = 0; | ||
4922 | break; | ||
4923 | } | ||
4924 | } | ||
4925 | unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, | ||
4926 | GFP_NOFS); | ||
4927 | out: | ||
4928 | mutex_unlock(&inode->i_mutex); | ||
4929 | return ret; | ||
4930 | } | ||
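
btrfs_fallocate first widens the requested byte range to sector alignment: the start is rounded down and the end rounded up with the same mask. A sketch with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sectorsize = 4096, mask = sectorsize - 1;
	uint64_t offset = 5000, len = 3000;
	uint64_t alloc_start = offset & ~mask;			/* 4096 */
	uint64_t alloc_end = (offset + len + mask) & ~mask;	/* 8192 */

	printf("[%llu, %llu)\n", (unsigned long long)alloc_start,
	       (unsigned long long)alloc_end);
	return 0;
}
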
4931 | |||
4932 | static int btrfs_set_page_dirty(struct page *page) | ||
4933 | { | ||
4934 | return __set_page_dirty_nobuffers(page); | ||
4935 | } | ||
4936 | |||
4937 | static int btrfs_permission(struct inode *inode, int mask) | ||
4938 | { | ||
4939 | if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) | ||
4940 | return -EACCES; | ||
4941 | return generic_permission(inode, mask, btrfs_check_acl); | ||
4942 | } | ||
4943 | |||
4944 | static struct inode_operations btrfs_dir_inode_operations = { | ||
4945 | .getattr = btrfs_getattr, | ||
4946 | .lookup = btrfs_lookup, | ||
4947 | .create = btrfs_create, | ||
4948 | .unlink = btrfs_unlink, | ||
4949 | .link = btrfs_link, | ||
4950 | .mkdir = btrfs_mkdir, | ||
4951 | .rmdir = btrfs_rmdir, | ||
4952 | .rename = btrfs_rename, | ||
4953 | .symlink = btrfs_symlink, | ||
4954 | .setattr = btrfs_setattr, | ||
4955 | .mknod = btrfs_mknod, | ||
4956 | .setxattr = btrfs_setxattr, | ||
4957 | .getxattr = btrfs_getxattr, | ||
4958 | .listxattr = btrfs_listxattr, | ||
4959 | .removexattr = btrfs_removexattr, | ||
4960 | .permission = btrfs_permission, | ||
4961 | }; | ||
4962 | static struct inode_operations btrfs_dir_ro_inode_operations = { | ||
4963 | .lookup = btrfs_lookup, | ||
4964 | .permission = btrfs_permission, | ||
4965 | }; | ||
4966 | static struct file_operations btrfs_dir_file_operations = { | ||
4967 | .llseek = generic_file_llseek, | ||
4968 | .read = generic_read_dir, | ||
4969 | .readdir = btrfs_real_readdir, | ||
4970 | .unlocked_ioctl = btrfs_ioctl, | ||
4971 | #ifdef CONFIG_COMPAT | ||
4972 | .compat_ioctl = btrfs_ioctl, | ||
4973 | #endif | ||
4974 | .release = btrfs_release_file, | ||
4975 | .fsync = btrfs_sync_file, | ||
4976 | }; | ||
4977 | |||
4978 | static struct extent_io_ops btrfs_extent_io_ops = { | ||
4979 | .fill_delalloc = run_delalloc_range, | ||
4980 | .submit_bio_hook = btrfs_submit_bio_hook, | ||
4981 | .merge_bio_hook = btrfs_merge_bio_hook, | ||
4982 | .readpage_end_io_hook = btrfs_readpage_end_io_hook, | ||
4983 | .writepage_end_io_hook = btrfs_writepage_end_io_hook, | ||
4984 | .writepage_start_hook = btrfs_writepage_start_hook, | ||
4985 | .readpage_io_failed_hook = btrfs_io_failed_hook, | ||
4986 | .set_bit_hook = btrfs_set_bit_hook, | ||
4987 | .clear_bit_hook = btrfs_clear_bit_hook, | ||
4988 | }; | ||
4989 | |||
4990 | static struct address_space_operations btrfs_aops = { | ||
4991 | .readpage = btrfs_readpage, | ||
4992 | .writepage = btrfs_writepage, | ||
4993 | .writepages = btrfs_writepages, | ||
4994 | .readpages = btrfs_readpages, | ||
4995 | .sync_page = block_sync_page, | ||
4996 | .bmap = btrfs_bmap, | ||
4997 | .direct_IO = btrfs_direct_IO, | ||
4998 | .invalidatepage = btrfs_invalidatepage, | ||
4999 | .releasepage = btrfs_releasepage, | ||
5000 | .set_page_dirty = btrfs_set_page_dirty, | ||
5001 | }; | ||
5002 | |||
5003 | static struct address_space_operations btrfs_symlink_aops = { | ||
5004 | .readpage = btrfs_readpage, | ||
5005 | .writepage = btrfs_writepage, | ||
5006 | .invalidatepage = btrfs_invalidatepage, | ||
5007 | .releasepage = btrfs_releasepage, | ||
5008 | }; | ||
5009 | |||
5010 | static struct inode_operations btrfs_file_inode_operations = { | ||
5011 | .truncate = btrfs_truncate, | ||
5012 | .getattr = btrfs_getattr, | ||
5013 | .setattr = btrfs_setattr, | ||
5014 | .setxattr = btrfs_setxattr, | ||
5015 | .getxattr = btrfs_getxattr, | ||
5016 | .listxattr = btrfs_listxattr, | ||
5017 | .removexattr = btrfs_removexattr, | ||
5018 | .permission = btrfs_permission, | ||
5019 | .fallocate = btrfs_fallocate, | ||
5020 | }; | ||
5021 | static struct inode_operations btrfs_special_inode_operations = { | ||
5022 | .getattr = btrfs_getattr, | ||
5023 | .setattr = btrfs_setattr, | ||
5024 | .permission = btrfs_permission, | ||
5025 | .setxattr = btrfs_setxattr, | ||
5026 | .getxattr = btrfs_getxattr, | ||
5027 | .listxattr = btrfs_listxattr, | ||
5028 | .removexattr = btrfs_removexattr, | ||
5029 | }; | ||
5030 | static struct inode_operations btrfs_symlink_inode_operations = { | ||
5031 | .readlink = generic_readlink, | ||
5032 | .follow_link = page_follow_link_light, | ||
5033 | .put_link = page_put_link, | ||
5034 | .permission = btrfs_permission, | ||
5035 | }; | ||
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 000000000000..c2aa33e3feb5 --- /dev/null +++ b/fs/btrfs/ioctl.c | |||
@@ -0,0 +1,1132 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/bio.h> | ||
21 | #include <linux/buffer_head.h> | ||
22 | #include <linux/file.h> | ||
23 | #include <linux/fs.h> | ||
24 | #include <linux/fsnotify.h> | ||
25 | #include <linux/pagemap.h> | ||
26 | #include <linux/highmem.h> | ||
27 | #include <linux/time.h> | ||
28 | #include <linux/init.h> | ||
29 | #include <linux/string.h> | ||
30 | #include <linux/smp_lock.h> | ||
31 | #include <linux/backing-dev.h> | ||
32 | #include <linux/mount.h> | ||
33 | #include <linux/mpage.h> | ||
34 | #include <linux/namei.h> | ||
35 | #include <linux/swap.h> | ||
36 | #include <linux/writeback.h> | ||
37 | #include <linux/statfs.h> | ||
38 | #include <linux/compat.h> | ||
39 | #include <linux/bit_spinlock.h> | ||
40 | #include <linux/security.h> | ||
41 | #include <linux/version.h> | ||
42 | #include <linux/xattr.h> | ||
43 | #include <linux/vmalloc.h> | ||
44 | #include "compat.h" | ||
45 | #include "ctree.h" | ||
46 | #include "disk-io.h" | ||
47 | #include "transaction.h" | ||
48 | #include "btrfs_inode.h" | ||
49 | #include "ioctl.h" | ||
50 | #include "print-tree.h" | ||
51 | #include "volumes.h" | ||
52 | #include "locking.h" | ||
53 | |||
54 | |||
55 | |||
56 | static noinline int create_subvol(struct btrfs_root *root, | ||
57 | struct dentry *dentry, | ||
58 | char *name, int namelen) | ||
59 | { | ||
60 | struct btrfs_trans_handle *trans; | ||
61 | struct btrfs_key key; | ||
62 | struct btrfs_root_item root_item; | ||
63 | struct btrfs_inode_item *inode_item; | ||
64 | struct extent_buffer *leaf; | ||
65 | struct btrfs_root *new_root = root; | ||
66 | struct inode *dir; | ||
67 | int ret; | ||
68 | int err; | ||
69 | u64 objectid; | ||
70 | u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; | ||
71 | u64 index = 0; | ||
72 | unsigned long nr = 1; | ||
73 | |||
74 | ret = btrfs_check_free_space(root, 1, 0); | ||
75 | if (ret) | ||
76 | goto fail_commit; | ||
77 | |||
78 | trans = btrfs_start_transaction(root, 1); | ||
79 | BUG_ON(!trans); | ||
80 | |||
81 | ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, | ||
82 | 0, &objectid); | ||
83 | if (ret) | ||
84 | goto fail; | ||
85 | |||
86 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | ||
87 | objectid, trans->transid, 0, 0, 0); | ||
88 | if (IS_ERR(leaf)) { | ||
89 | ret = PTR_ERR(leaf); | ||
90 | goto fail; | ||
91 | } | ||
92 | |||
93 | btrfs_set_header_nritems(leaf, 0); | ||
94 | btrfs_set_header_level(leaf, 0); | ||
95 | btrfs_set_header_bytenr(leaf, leaf->start); | ||
96 | btrfs_set_header_generation(leaf, trans->transid); | ||
97 | btrfs_set_header_owner(leaf, objectid); | ||
98 | |||
99 | write_extent_buffer(leaf, root->fs_info->fsid, | ||
100 | (unsigned long)btrfs_header_fsid(leaf), | ||
101 | BTRFS_FSID_SIZE); | ||
102 | btrfs_mark_buffer_dirty(leaf); | ||
103 | |||
104 | inode_item = &root_item.inode; | ||
105 | memset(inode_item, 0, sizeof(*inode_item)); | ||
106 | inode_item->generation = cpu_to_le64(1); | ||
107 | inode_item->size = cpu_to_le64(3); | ||
108 | inode_item->nlink = cpu_to_le32(1); | ||
109 | inode_item->nbytes = cpu_to_le64(root->leafsize); | ||
110 | inode_item->mode = cpu_to_le32(S_IFDIR | 0755); | ||
111 | |||
112 | btrfs_set_root_bytenr(&root_item, leaf->start); | ||
113 | btrfs_set_root_generation(&root_item, trans->transid); | ||
114 | btrfs_set_root_level(&root_item, 0); | ||
115 | btrfs_set_root_refs(&root_item, 1); | ||
116 | btrfs_set_root_used(&root_item, 0); | ||
117 | btrfs_set_root_last_snapshot(&root_item, 0); | ||
118 | |||
119 | memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); | ||
120 | root_item.drop_level = 0; | ||
121 | |||
122 | btrfs_tree_unlock(leaf); | ||
123 | free_extent_buffer(leaf); | ||
124 | leaf = NULL; | ||
125 | |||
126 | btrfs_set_root_dirid(&root_item, new_dirid); | ||
127 | |||
128 | key.objectid = objectid; | ||
129 | key.offset = 1; | ||
130 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
131 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, | ||
132 | &root_item); | ||
133 | if (ret) | ||
134 | goto fail; | ||
135 | |||
136 | /* | ||
137 | * insert the directory item | ||
138 | */ | ||
139 | key.offset = (u64)-1; | ||
140 | dir = dentry->d_parent->d_inode; | ||
141 | ret = btrfs_set_inode_index(dir, &index); | ||
142 | BUG_ON(ret); | ||
143 | |||
144 | ret = btrfs_insert_dir_item(trans, root, | ||
145 | name, namelen, dir->i_ino, &key, | ||
146 | BTRFS_FT_DIR, index); | ||
147 | if (ret) | ||
148 | goto fail; | ||
149 | |||
150 | btrfs_i_size_write(dir, dir->i_size + namelen * 2); | ||
151 | ret = btrfs_update_inode(trans, root, dir); | ||
152 | BUG_ON(ret); | ||
153 | |||
154 | /* add the backref first */ | ||
155 | ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, | ||
156 | objectid, BTRFS_ROOT_BACKREF_KEY, | ||
157 | root->root_key.objectid, | ||
158 | dir->i_ino, index, name, namelen); | ||
159 | |||
160 | BUG_ON(ret); | ||
161 | |||
162 | /* now add the forward ref */ | ||
163 | ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, | ||
164 | root->root_key.objectid, BTRFS_ROOT_REF_KEY, | ||
165 | objectid, | ||
166 | dir->i_ino, index, name, namelen); | ||
167 | |||
168 | BUG_ON(ret); | ||
169 | |||
170 | ret = btrfs_commit_transaction(trans, root); | ||
171 | if (ret) | ||
172 | goto fail_commit; | ||
173 | |||
174 | new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); | ||
175 | BUG_ON(!new_root); | ||
176 | |||
177 | trans = btrfs_start_transaction(new_root, 1); | ||
178 | BUG_ON(!trans); | ||
179 | |||
180 | ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, | ||
181 | BTRFS_I(dir)->block_group); | ||
182 | if (ret) | ||
183 | goto fail; | ||
184 | |||
185 | fail: | ||
186 | nr = trans->blocks_used; | ||
187 | err = btrfs_commit_transaction(trans, new_root); | ||
188 | if (err && !ret) | ||
189 | ret = err; | ||
190 | fail_commit: | ||
191 | btrfs_btree_balance_dirty(root, nr); | ||
192 | return ret; | ||
193 | } | ||
194 | |||
195 | static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | ||
196 | char *name, int namelen) | ||
197 | { | ||
198 | struct btrfs_pending_snapshot *pending_snapshot; | ||
199 | struct btrfs_trans_handle *trans; | ||
200 | int ret = 0; | ||
201 | int err; | ||
202 | unsigned long nr = 0; | ||
203 | |||
204 | if (!root->ref_cows) | ||
205 | return -EINVAL; | ||
206 | |||
207 | ret = btrfs_check_free_space(root, 1, 0); | ||
208 | if (ret) | ||
209 | goto fail_unlock; | ||
210 | |||
211 | pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); | ||
212 | if (!pending_snapshot) { | ||
213 | ret = -ENOMEM; | ||
214 | goto fail_unlock; | ||
215 | } | ||
216 | pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); | ||
217 | if (!pending_snapshot->name) { | ||
218 | ret = -ENOMEM; | ||
219 | kfree(pending_snapshot); | ||
220 | goto fail_unlock; | ||
221 | } | ||
222 | memcpy(pending_snapshot->name, name, namelen); | ||
223 | pending_snapshot->name[namelen] = '\0'; | ||
224 | pending_snapshot->dentry = dentry; | ||
225 | trans = btrfs_start_transaction(root, 1); | ||
226 | BUG_ON(!trans); | ||
227 | pending_snapshot->root = root; | ||
228 | list_add(&pending_snapshot->list, | ||
229 | &trans->transaction->pending_snapshots); | ||
230 | err = btrfs_commit_transaction(trans, root); | ||
231 | |||
232 | fail_unlock: | ||
233 | btrfs_btree_balance_dirty(root, nr); | ||
234 | return ret; | ||
235 | } | ||
236 | |||
237 | /* copy of may_create in fs/namei.c */ | ||
238 | static inline int btrfs_may_create(struct inode *dir, struct dentry *child) | ||
239 | { | ||
240 | if (child->d_inode) | ||
241 | return -EEXIST; | ||
242 | if (IS_DEADDIR(dir)) | ||
243 | return -ENOENT; | ||
244 | return inode_permission(dir, MAY_WRITE | MAY_EXEC); | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * Create a new subvolume below @parent. This is largely modeled after | ||
249 | * sys_mkdirat and vfs_mkdir, but we only do a single component lookup | ||
250 | * inside this filesystem so it's quite a bit simpler. | ||
251 | */ | ||
252 | static noinline int btrfs_mksubvol(struct path *parent, char *name, | ||
253 | int mode, int namelen, | ||
254 | struct btrfs_root *snap_src) | ||
255 | { | ||
256 | struct dentry *dentry; | ||
257 | int error; | ||
258 | |||
259 | mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); | ||
260 | |||
261 | dentry = lookup_one_len(name, parent->dentry, namelen); | ||
262 | error = PTR_ERR(dentry); | ||
263 | if (IS_ERR(dentry)) | ||
264 | goto out_unlock; | ||
265 | |||
266 | error = -EEXIST; | ||
267 | if (dentry->d_inode) | ||
268 | goto out_dput; | ||
269 | |||
270 | if (!IS_POSIXACL(parent->dentry->d_inode)) | ||
271 | mode &= ~current->fs->umask; | ||
272 | |||
273 | error = mnt_want_write(parent->mnt); | ||
274 | if (error) | ||
275 | goto out_dput; | ||
276 | |||
277 | error = btrfs_may_create(parent->dentry->d_inode, dentry); | ||
278 | if (error) | ||
279 | goto out_drop_write; | ||
280 | |||
281 | /* | ||
282 | * Actually perform the low-level subvolume creation after all | ||
284 | * this VFS fuss. | ||
284 | * | ||
285 | * Eventually we want to pass in an inode under which we create this | ||
286 | * subvolume, but for now all are under the filesystem root. | ||
287 | * | ||
288 | * Also we should pass on the mode eventually to allow creating new | ||
289 | * subvolume with specific mode bits. | ||
290 | */ | ||
291 | if (snap_src) { | ||
292 | struct dentry *dir = dentry->d_parent; | ||
293 | struct dentry *test = dir->d_parent; | ||
294 | struct btrfs_path *path = btrfs_alloc_path(); | ||
295 | int ret; | ||
296 | u64 test_oid; | ||
297 | u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; | ||
298 | |||
299 | test_oid = snap_src->root_key.objectid; | ||
300 | |||
301 | ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, | ||
302 | path, parent_oid, test_oid); | ||
303 | if (ret == 0) | ||
304 | goto create; | ||
305 | btrfs_release_path(snap_src->fs_info->tree_root, path); | ||
306 | |||
307 | /* we need to make sure we aren't creating a directory loop | ||
308 | * by taking a snapshot of something that has our current | ||
309 | * subvol in its directory tree. So, this loops through | ||
310 | * the dentries and checks the forward refs for each subvolume | ||
311 | * to see if it references the subvolume where we are | ||
312 | * placing this new snapshot. | ||
313 | */ | ||
314 | while (1) { | ||
315 | if (!test || | ||
316 | dir == snap_src->fs_info->sb->s_root || | ||
317 | test == snap_src->fs_info->sb->s_root || | ||
318 | test->d_inode->i_sb != snap_src->fs_info->sb) { | ||
319 | break; | ||
320 | } | ||
321 | if (S_ISLNK(test->d_inode->i_mode)) { | ||
322 | printk(KERN_INFO "Btrfs symlink in snapshot " | ||
323 | "path, failed\n"); | ||
324 | error = -EMLINK; | ||
325 | btrfs_free_path(path); | ||
326 | goto out_drop_write; | ||
327 | } | ||
328 | test_oid = | ||
329 | BTRFS_I(test->d_inode)->root->root_key.objectid; | ||
330 | ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, | ||
331 | path, test_oid, parent_oid); | ||
332 | if (ret == 0) { | ||
333 | printk(KERN_INFO "Btrfs snapshot creation " | ||
334 | "failed, looping\n"); | ||
335 | error = -EMLINK; | ||
336 | btrfs_free_path(path); | ||
337 | goto out_drop_write; | ||
338 | } | ||
339 | btrfs_release_path(snap_src->fs_info->tree_root, path); | ||
340 | test = test->d_parent; | ||
341 | } | ||
342 | create: | ||
343 | btrfs_free_path(path); | ||
344 | error = create_snapshot(snap_src, dentry, name, namelen); | ||
345 | } else { | ||
346 | error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, | ||
347 | dentry, name, namelen); | ||
348 | } | ||
349 | if (error) | ||
350 | goto out_drop_write; | ||
351 | |||
352 | fsnotify_mkdir(parent->dentry->d_inode, dentry); | ||
353 | out_drop_write: | ||
354 | mnt_drop_write(parent->mnt); | ||
355 | out_dput: | ||
356 | dput(dentry); | ||
357 | out_unlock: | ||
358 | mutex_unlock(&parent->dentry->d_inode->i_mutex); | ||
359 | return error; | ||
360 | } | ||
361 | |||
362 | |||
363 | static int btrfs_defrag_file(struct file *file) | ||
364 | { | ||
365 | struct inode *inode = fdentry(file)->d_inode; | ||
366 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
367 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
368 | struct btrfs_ordered_extent *ordered; | ||
369 | struct page *page; | ||
370 | unsigned long last_index; | ||
371 | unsigned long ra_pages = root->fs_info->bdi.ra_pages; | ||
372 | unsigned long total_read = 0; | ||
373 | u64 page_start; | ||
374 | u64 page_end; | ||
375 | unsigned long i; | ||
376 | int ret; | ||
377 | |||
378 | ret = btrfs_check_free_space(root, inode->i_size, 0); | ||
379 | if (ret) | ||
380 | return -ENOSPC; | ||
381 | |||
382 | mutex_lock(&inode->i_mutex); | ||
383 | last_index = inode->i_size >> PAGE_CACHE_SHIFT; | ||
384 | for (i = 0; i <= last_index; i++) { | ||
385 | if (total_read % ra_pages == 0) { | ||
386 | btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, | ||
387 | min(last_index, i + ra_pages - 1)); | ||
388 | } | ||
389 | total_read++; | ||
390 | again: | ||
391 | page = grab_cache_page(inode->i_mapping, i); | ||
392 | if (!page) | ||
393 | goto out_unlock; | ||
394 | if (!PageUptodate(page)) { | ||
395 | btrfs_readpage(NULL, page); | ||
396 | lock_page(page); | ||
397 | if (!PageUptodate(page)) { | ||
398 | unlock_page(page); | ||
399 | page_cache_release(page); | ||
400 | goto out_unlock; | ||
401 | } | ||
402 | } | ||
403 | |||
404 | wait_on_page_writeback(page); | ||
405 | |||
406 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
407 | page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
408 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
409 | |||
410 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | ||
411 | if (ordered) { | ||
412 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
413 | unlock_page(page); | ||
414 | page_cache_release(page); | ||
415 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
416 | btrfs_put_ordered_extent(ordered); | ||
417 | goto again; | ||
418 | } | ||
419 | set_page_extent_mapped(page); | ||
420 | |||
421 | /* | ||
422 | * this makes sure page_mkwrite is called on the | ||
423 | * page if it is dirtied again later | ||
424 | */ | ||
425 | clear_page_dirty_for_io(page); | ||
426 | |||
427 | btrfs_set_extent_delalloc(inode, page_start, page_end); | ||
428 | |||
429 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
430 | set_page_dirty(page); | ||
431 | unlock_page(page); | ||
432 | page_cache_release(page); | ||
433 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); | ||
434 | } | ||
435 | |||
436 | out_unlock: | ||
437 | mutex_unlock(&inode->i_mutex); | ||
438 | return 0; | ||
439 | } | ||
440 | |||
441 | /* | ||
442 | * Called inside transaction, so use GFP_NOFS | ||
443 | */ | ||
444 | |||
445 | static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) | ||
446 | { | ||
447 | u64 new_size; | ||
448 | u64 old_size; | ||
449 | u64 devid = 1; | ||
450 | struct btrfs_ioctl_vol_args *vol_args; | ||
451 | struct btrfs_trans_handle *trans; | ||
452 | struct btrfs_device *device = NULL; | ||
453 | char *sizestr; | ||
454 | char *devstr = NULL; | ||
455 | int ret = 0; | ||
456 | int namelen; | ||
457 | int mod = 0; | ||
458 | |||
459 | if (root->fs_info->sb->s_flags & MS_RDONLY) | ||
460 | return -EROFS; | ||
461 | |||
462 | if (!capable(CAP_SYS_ADMIN)) | ||
463 | return -EPERM; | ||
464 | |||
465 | vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); | ||
466 | |||
467 | if (!vol_args) | ||
468 | return -ENOMEM; | ||
469 | |||
470 | if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { | ||
471 | ret = -EFAULT; | ||
472 | goto out; | ||
473 | } | ||
474 | |||
475 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | ||
476 | namelen = strlen(vol_args->name); | ||
477 | |||
478 | mutex_lock(&root->fs_info->volume_mutex); | ||
479 | sizestr = vol_args->name; | ||
480 | devstr = strchr(sizestr, ':'); | ||
481 | if (devstr) { | ||
482 | char *end; | ||
483 | sizestr = devstr + 1; | ||
484 | *devstr = '\0'; | ||
485 | devstr = vol_args->name; | ||
486 | devid = simple_strtoull(devstr, &end, 10); | ||
487 | printk(KERN_INFO "resizing devid %llu\n", devid); | ||
488 | } | ||
489 | device = btrfs_find_device(root, devid, NULL, NULL); | ||
490 | if (!device) { | ||
491 | printk(KERN_INFO "resizer unable to find device %llu\n", devid); | ||
492 | ret = -EINVAL; | ||
493 | goto out_unlock; | ||
494 | } | ||
495 | if (!strcmp(sizestr, "max")) | ||
496 | new_size = device->bdev->bd_inode->i_size; | ||
497 | else { | ||
498 | if (sizestr[0] == '-') { | ||
499 | mod = -1; | ||
500 | sizestr++; | ||
501 | } else if (sizestr[0] == '+') { | ||
502 | mod = 1; | ||
503 | sizestr++; | ||
504 | } | ||
505 | new_size = btrfs_parse_size(sizestr); | ||
506 | if (new_size == 0) { | ||
507 | ret = -EINVAL; | ||
508 | goto out_unlock; | ||
509 | } | ||
510 | } | ||
511 | |||
512 | old_size = device->total_bytes; | ||
513 | |||
514 | if (mod < 0) { | ||
515 | if (new_size > old_size) { | ||
516 | ret = -EINVAL; | ||
517 | goto out_unlock; | ||
518 | } | ||
519 | new_size = old_size - new_size; | ||
520 | } else if (mod > 0) { | ||
521 | new_size = old_size + new_size; | ||
522 | } | ||
523 | |||
524 | if (new_size < 256 * 1024 * 1024) { | ||
525 | ret = -EINVAL; | ||
526 | goto out_unlock; | ||
527 | } | ||
528 | if (new_size > device->bdev->bd_inode->i_size) { | ||
529 | ret = -EFBIG; | ||
530 | goto out_unlock; | ||
531 | } | ||
532 | |||
533 | do_div(new_size, root->sectorsize); | ||
534 | new_size *= root->sectorsize; | ||
535 | |||
536 | printk(KERN_INFO "new size for %s is %llu\n", | ||
537 | device->name, (unsigned long long)new_size); | ||
538 | |||
539 | if (new_size > old_size) { | ||
540 | trans = btrfs_start_transaction(root, 1); | ||
541 | ret = btrfs_grow_device(trans, device, new_size); | ||
542 | btrfs_commit_transaction(trans, root); | ||
543 | } else { | ||
544 | ret = btrfs_shrink_device(device, new_size); | ||
545 | } | ||
546 | |||
547 | out_unlock: | ||
548 | mutex_unlock(&root->fs_info->volume_mutex); | ||
549 | out: | ||
550 | kfree(vol_args); | ||
551 | return ret; | ||
552 | } | ||
553 | |||
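/*
 * Editor's sketch (not part of this commit): driving the resize ioctl
 * above from userspace. The mount point, devid and size string are
 * illustrative assumptions; the kernel parses name as "[devid:]size",
 * where size is "max" or an optionally signed value, assuming
 * btrfs_parse_size() accepts suffixes like "g".
 */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* struct btrfs_ioctl_vol_args, BTRFS_IOC_RESIZE */

static int shrink_devid_two(const char *mnt)
{
	struct btrfs_ioctl_vol_args args;
	int fd = open(mnt, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	memset(&args, 0, sizeof(args));
	strcpy(args.name, "2:-4g");	/* shrink device 2 by four gigabytes */
	ret = ioctl(fd, BTRFS_IOC_RESIZE, &args);
	close(fd);
	return ret;
}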
554 | static noinline int btrfs_ioctl_snap_create(struct file *file, | ||
555 | void __user *arg, int subvol) | ||
556 | { | ||
557 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
558 | struct btrfs_ioctl_vol_args *vol_args; | ||
559 | struct btrfs_dir_item *di; | ||
560 | struct btrfs_path *path; | ||
561 | struct file *src_file; | ||
562 | u64 root_dirid; | ||
563 | int namelen; | ||
564 | int ret = 0; | ||
565 | |||
566 | if (root->fs_info->sb->s_flags & MS_RDONLY) | ||
567 | return -EROFS; | ||
568 | |||
569 | vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); | ||
570 | |||
571 | if (!vol_args) | ||
572 | return -ENOMEM; | ||
573 | |||
574 | if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { | ||
575 | ret = -EFAULT; | ||
576 | goto out; | ||
577 | } | ||
578 | |||
579 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | ||
580 | namelen = strlen(vol_args->name); | ||
581 | if (strchr(vol_args->name, '/')) { | ||
582 | ret = -EINVAL; | ||
583 | goto out; | ||
584 | } | ||
585 | |||
586 | path = btrfs_alloc_path(); | ||
587 | if (!path) { | ||
588 | ret = -ENOMEM; | ||
589 | goto out; | ||
590 | } | ||
591 | |||
592 | root_dirid = root->fs_info->sb->s_root->d_inode->i_ino; | ||
593 | di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, | ||
594 | path, root_dirid, | ||
595 | vol_args->name, namelen, 0); | ||
596 | btrfs_free_path(path); | ||
597 | |||
598 | if (di && !IS_ERR(di)) { | ||
599 | ret = -EEXIST; | ||
600 | goto out; | ||
601 | } | ||
602 | |||
603 | if (IS_ERR(di)) { | ||
604 | ret = PTR_ERR(di); | ||
605 | goto out; | ||
606 | } | ||
607 | |||
608 | if (subvol) { | ||
609 | ret = btrfs_mksubvol(&file->f_path, vol_args->name, | ||
610 | file->f_path.dentry->d_inode->i_mode, | ||
611 | namelen, NULL); | ||
612 | } else { | ||
613 | struct inode *src_inode; | ||
614 | src_file = fget(vol_args->fd); | ||
615 | if (!src_file) { | ||
616 | ret = -EINVAL; | ||
617 | goto out; | ||
618 | } | ||
619 | |||
620 | src_inode = src_file->f_path.dentry->d_inode; | ||
621 | if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { | ||
622 | printk(KERN_INFO "btrfs: Snapshot src from " | ||
623 | "another FS\n"); | ||
624 | ret = -EINVAL; | ||
625 | fput(src_file); | ||
626 | goto out; | ||
627 | } | ||
628 | ret = btrfs_mksubvol(&file->f_path, vol_args->name, | ||
629 | file->f_path.dentry->d_inode->i_mode, | ||
630 | namelen, BTRFS_I(src_inode)->root); | ||
631 | fput(src_file); | ||
632 | } | ||
633 | |||
634 | out: | ||
635 | kfree(vol_args); | ||
636 | return ret; | ||
637 | } | ||
638 | |||
639 | static int btrfs_ioctl_defrag(struct file *file) | ||
640 | { | ||
641 | struct inode *inode = fdentry(file)->d_inode; | ||
642 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
643 | int ret; | ||
644 | |||
645 | ret = mnt_want_write(file->f_path.mnt); | ||
646 | if (ret) | ||
647 | return ret; | ||
648 | |||
649 | switch (inode->i_mode & S_IFMT) { | ||
650 | case S_IFDIR: | ||
651 | if (!capable(CAP_SYS_ADMIN)) { | ||
652 | ret = -EPERM; | ||
653 | goto out; | ||
654 | } | ||
655 | btrfs_defrag_root(root, 0); | ||
656 | btrfs_defrag_root(root->fs_info->extent_root, 0); | ||
657 | break; | ||
658 | case S_IFREG: | ||
659 | if (!(file->f_mode & FMODE_WRITE)) { | ||
660 | ret = -EINVAL; | ||
661 | goto out; | ||
662 | } | ||
663 | btrfs_defrag_file(file); | ||
664 | break; | ||
665 | } | ||
666 | out: | ||
667 | mnt_drop_write(file->f_path.mnt); | ||
668 | return ret; | ||
669 | } | ||
670 | |||
671 | static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) | ||
672 | { | ||
673 | struct btrfs_ioctl_vol_args *vol_args; | ||
674 | int ret; | ||
675 | |||
676 | if (!capable(CAP_SYS_ADMIN)) | ||
677 | return -EPERM; | ||
678 | |||
679 | vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); | ||
680 | |||
681 | if (!vol_args) | ||
682 | return -ENOMEM; | ||
683 | |||
684 | if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { | ||
685 | ret = -EFAULT; | ||
686 | goto out; | ||
687 | } | ||
688 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | ||
689 | ret = btrfs_init_new_device(root, vol_args->name); | ||
690 | |||
691 | out: | ||
692 | kfree(vol_args); | ||
693 | return ret; | ||
694 | } | ||
695 | |||
696 | static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) | ||
697 | { | ||
698 | struct btrfs_ioctl_vol_args *vol_args; | ||
699 | int ret; | ||
700 | |||
701 | if (!capable(CAP_SYS_ADMIN)) | ||
702 | return -EPERM; | ||
703 | |||
704 | if (root->fs_info->sb->s_flags & MS_RDONLY) | ||
705 | return -EROFS; | ||
706 | |||
707 | vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); | ||
708 | |||
709 | if (!vol_args) | ||
710 | return -ENOMEM; | ||
711 | |||
712 | if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { | ||
713 | ret = -EFAULT; | ||
714 | goto out; | ||
715 | } | ||
716 | vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; | ||
717 | ret = btrfs_rm_device(root, vol_args->name); | ||
718 | |||
719 | out: | ||
720 | kfree(vol_args); | ||
721 | return ret; | ||
722 | } | ||
723 | |||
724 | static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | ||
725 | u64 off, u64 olen, u64 destoff) | ||
726 | { | ||
727 | struct inode *inode = fdentry(file)->d_inode; | ||
728 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
729 | struct file *src_file; | ||
730 | struct inode *src; | ||
731 | struct btrfs_trans_handle *trans; | ||
732 | struct btrfs_path *path; | ||
733 | struct extent_buffer *leaf; | ||
734 | char *buf; | ||
735 | struct btrfs_key key; | ||
736 | u32 nritems; | ||
737 | int slot; | ||
738 | int ret; | ||
739 | u64 len = olen; | ||
740 | u64 bs = root->fs_info->sb->s_blocksize; | ||
741 | u64 hint_byte; | ||
742 | |||
743 | /* | ||
744 | * TODO: | ||
745 | * - split compressed inline extents. annoying: we need to | ||
746 | * decompress into destination's address_space (the file offset | ||
747 | * may change, so source mapping won't do), then recompress (or | ||
748 | * otherwise reinsert) a subrange. | ||
749 | * - allow ranges within the same file to be cloned (provided | ||
750 | * they don't overlap)? | ||
751 | */ | ||
752 | |||
753 | /* the destination must be opened for writing */ | ||
754 | if (!(file->f_mode & FMODE_WRITE)) | ||
755 | return -EINVAL; | ||
756 | |||
757 | ret = mnt_want_write(file->f_path.mnt); | ||
758 | if (ret) | ||
759 | return ret; | ||
760 | |||
761 | src_file = fget(srcfd); | ||
762 | if (!src_file) { | ||
763 | ret = -EBADF; | ||
764 | goto out_drop_write; | ||
765 | } | ||
766 | src = src_file->f_dentry->d_inode; | ||
767 | |||
768 | ret = -EINVAL; | ||
769 | if (src == inode) | ||
770 | goto out_fput; | ||
771 | |||
772 | ret = -EISDIR; | ||
773 | if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) | ||
774 | goto out_fput; | ||
775 | |||
776 | ret = -EXDEV; | ||
777 | if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) | ||
778 | goto out_fput; | ||
779 | |||
780 | ret = -ENOMEM; | ||
781 | buf = vmalloc(btrfs_level_size(root, 0)); | ||
782 | if (!buf) | ||
783 | goto out_fput; | ||
784 | |||
785 | path = btrfs_alloc_path(); | ||
786 | if (!path) { | ||
787 | vfree(buf); | ||
788 | goto out_fput; | ||
789 | } | ||
790 | path->reada = 2; | ||
791 | |||
792 | if (inode < src) { | ||
793 | mutex_lock(&inode->i_mutex); | ||
794 | mutex_lock(&src->i_mutex); | ||
795 | } else { | ||
796 | mutex_lock(&src->i_mutex); | ||
797 | mutex_lock(&inode->i_mutex); | ||
798 | } | ||
799 | |||
800 | /* determine range to clone */ | ||
801 | ret = -EINVAL; | ||
802 | if (off >= src->i_size || off + len > src->i_size) | ||
803 | goto out_unlock; | ||
804 | if (len == 0) | ||
805 | olen = len = src->i_size - off; | ||
806 | /* if we extend to eof, continue to block boundary */ | ||
807 | if (off + len == src->i_size) | ||
808 | len = ((src->i_size + bs-1) & ~(bs-1)) | ||
809 | - off; | ||
810 | |||
811 | /* verify the end result is block aligned */ | ||
812 | if ((off & (bs-1)) || | ||
813 | ((off + len) & (bs-1))) | ||
814 | goto out_unlock; | ||
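	/*
	 * Worked example (editor's note): with bs == 4096, a 5000 byte
	 * source, off == 0 and olen == 0, len first becomes 5000 and is
	 * then rounded up to 8192 at eof, so the alignment check above
	 * passes.
	 */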
815 | |||
816 | /* do any pending delalloc/csum calc on src, one way or | ||
817 | another, and lock file content */ | ||
818 | while (1) { | ||
819 | struct btrfs_ordered_extent *ordered; | ||
820 | lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); | ||
821 | ordered = btrfs_lookup_first_ordered_extent(inode, off+len); | ||
822 | if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) | ||
823 | break; | ||
824 | unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); | ||
825 | if (ordered) | ||
826 | btrfs_put_ordered_extent(ordered); | ||
827 | btrfs_wait_ordered_range(src, off, off+len); | ||
828 | } | ||
829 | |||
830 | trans = btrfs_start_transaction(root, 1); | ||
831 | BUG_ON(!trans); | ||
832 | |||
833 | /* punch hole in destination first */ | ||
834 | btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); | ||
835 | |||
836 | /* clone data */ | ||
837 | key.objectid = src->i_ino; | ||
838 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
839 | key.offset = 0; | ||
840 | |||
841 | while (1) { | ||
842 | /* | ||
843 | * note the key will change type as we walk through the | ||
844 | * tree. | ||
845 | */ | ||
846 | ret = btrfs_search_slot(trans, root, &key, path, 0, 0); | ||
847 | if (ret < 0) | ||
848 | goto out; | ||
849 | |||
850 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
851 | if (path->slots[0] >= nritems) { | ||
852 | ret = btrfs_next_leaf(root, path); | ||
853 | if (ret < 0) | ||
854 | goto out; | ||
855 | if (ret > 0) | ||
856 | break; | ||
857 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
858 | } | ||
859 | leaf = path->nodes[0]; | ||
860 | slot = path->slots[0]; | ||
861 | |||
862 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
863 | if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || | ||
864 | key.objectid != src->i_ino) | ||
865 | break; | ||
866 | |||
867 | if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { | ||
868 | struct btrfs_file_extent_item *extent; | ||
869 | int type; | ||
870 | u32 size; | ||
871 | struct btrfs_key new_key; | ||
872 | u64 disko = 0, diskl = 0; | ||
873 | u64 datao = 0, datal = 0; | ||
874 | u8 comp; | ||
875 | |||
876 | size = btrfs_item_size_nr(leaf, slot); | ||
877 | read_extent_buffer(leaf, buf, | ||
878 | btrfs_item_ptr_offset(leaf, slot), | ||
879 | size); | ||
880 | |||
881 | extent = btrfs_item_ptr(leaf, slot, | ||
882 | struct btrfs_file_extent_item); | ||
883 | comp = btrfs_file_extent_compression(leaf, extent); | ||
884 | type = btrfs_file_extent_type(leaf, extent); | ||
885 | if (type == BTRFS_FILE_EXTENT_REG) { | ||
886 | disko = btrfs_file_extent_disk_bytenr(leaf, | ||
887 | extent); | ||
888 | diskl = btrfs_file_extent_disk_num_bytes(leaf, | ||
889 | extent); | ||
890 | datao = btrfs_file_extent_offset(leaf, extent); | ||
891 | datal = btrfs_file_extent_num_bytes(leaf, | ||
892 | extent); | ||
893 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { | ||
894 | /* take upper bound, may be compressed */ | ||
895 | datal = btrfs_file_extent_ram_bytes(leaf, | ||
896 | extent); | ||
897 | } | ||
898 | btrfs_release_path(root, path); | ||
899 | |||
900 | if (key.offset + datal < off || | ||
901 | key.offset >= off+len) | ||
902 | goto next; | ||
903 | |||
904 | memcpy(&new_key, &key, sizeof(new_key)); | ||
905 | new_key.objectid = inode->i_ino; | ||
906 | new_key.offset = key.offset + destoff - off; | ||
907 | |||
908 | if (type == BTRFS_FILE_EXTENT_REG) { | ||
909 | ret = btrfs_insert_empty_item(trans, root, path, | ||
910 | &new_key, size); | ||
911 | if (ret) | ||
912 | goto out; | ||
913 | |||
914 | leaf = path->nodes[0]; | ||
915 | slot = path->slots[0]; | ||
916 | write_extent_buffer(leaf, buf, | ||
917 | btrfs_item_ptr_offset(leaf, slot), | ||
918 | size); | ||
919 | |||
920 | extent = btrfs_item_ptr(leaf, slot, | ||
921 | struct btrfs_file_extent_item); | ||
922 | |||
923 | if (off > key.offset) { | ||
924 | datao += off - key.offset; | ||
925 | datal -= off - key.offset; | ||
926 | } | ||
927 | if (key.offset + datao + datal > | ||
928 | off + len) | ||
929 | datal = off + len - key.offset - datao; | ||
930 | /* disko == 0 means it's a hole */ | ||
931 | if (!disko) | ||
932 | datao = 0; | ||
933 | |||
934 | btrfs_set_file_extent_offset(leaf, extent, | ||
935 | datao); | ||
936 | btrfs_set_file_extent_num_bytes(leaf, extent, | ||
937 | datal); | ||
938 | if (disko) { | ||
939 | inode_add_bytes(inode, datal); | ||
940 | ret = btrfs_inc_extent_ref(trans, root, | ||
941 | disko, diskl, leaf->start, | ||
942 | root->root_key.objectid, | ||
943 | trans->transid, | ||
944 | inode->i_ino); | ||
945 | BUG_ON(ret); | ||
946 | } | ||
947 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { | ||
948 | u64 skip = 0; | ||
949 | u64 trim = 0; | ||
950 | if (off > key.offset) { | ||
951 | skip = off - key.offset; | ||
952 | new_key.offset += skip; | ||
953 | } | ||
954 | |||
955 | if (key.offset + datal > off+len) | ||
956 | trim = key.offset + datal - (off+len); | ||
957 | |||
958 | if (comp && (skip || trim)) { | ||
959 | ret = -EINVAL; | ||
960 | goto out; | ||
961 | } | ||
962 | size -= skip + trim; | ||
963 | datal -= skip + trim; | ||
964 | ret = btrfs_insert_empty_item(trans, root, path, | ||
965 | &new_key, size); | ||
966 | if (ret) | ||
967 | goto out; | ||
968 | |||
969 | if (skip) { | ||
970 | u32 start = | ||
971 | btrfs_file_extent_calc_inline_size(0); | ||
972 | memmove(buf+start, buf+start+skip, | ||
973 | datal); | ||
974 | } | ||
975 | |||
976 | leaf = path->nodes[0]; | ||
977 | slot = path->slots[0]; | ||
978 | write_extent_buffer(leaf, buf, | ||
979 | btrfs_item_ptr_offset(leaf, slot), | ||
980 | size); | ||
981 | inode_add_bytes(inode, datal); | ||
982 | } | ||
983 | |||
984 | btrfs_mark_buffer_dirty(leaf); | ||
985 | } | ||
986 | |||
987 | next: | ||
988 | btrfs_release_path(root, path); | ||
989 | key.offset++; | ||
990 | } | ||
991 | ret = 0; | ||
992 | out: | ||
993 | btrfs_release_path(root, path); | ||
994 | if (ret == 0) { | ||
995 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
996 | if (destoff + olen > inode->i_size) | ||
997 | btrfs_i_size_write(inode, destoff + olen); | ||
998 | BTRFS_I(inode)->flags = BTRFS_I(src)->flags; | ||
999 | ret = btrfs_update_inode(trans, root, inode); | ||
1000 | } | ||
1001 | btrfs_end_transaction(trans, root); | ||
1002 | unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); | ||
1003 | if (ret) | ||
1004 | vmtruncate(inode, 0); | ||
1005 | out_unlock: | ||
1006 | mutex_unlock(&src->i_mutex); | ||
1007 | mutex_unlock(&inode->i_mutex); | ||
1008 | vfree(buf); | ||
1009 | btrfs_free_path(path); | ||
1010 | out_fput: | ||
1011 | fput(src_file); | ||
1012 | out_drop_write: | ||
1013 | mnt_drop_write(file->f_path.mnt); | ||
1014 | return ret; | ||
1015 | } | ||
1016 | |||
1017 | static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) | ||
1018 | { | ||
1019 | struct btrfs_ioctl_clone_range_args args; | ||
1020 | |||
1021 | if (copy_from_user(&args, argp, sizeof(args))) | ||
1022 | return -EFAULT; | ||
1023 | return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, | ||
1024 | args.src_length, args.dest_offset); | ||
1025 | } | ||
1026 | |||
1027 | /* | ||
1028 | * there are many ways the trans_start and trans_end ioctls can lead | ||
1029 | * to deadlocks. They should only be used by applications that | ||
1030 | * basically own the machine, and have a very in-depth understanding | ||
1031 | * of all the possible deadlocks and enospc problems. | ||
1032 | */ | ||
1033 | static long btrfs_ioctl_trans_start(struct file *file) | ||
1034 | { | ||
1035 | struct inode *inode = fdentry(file)->d_inode; | ||
1036 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1037 | struct btrfs_trans_handle *trans; | ||
1038 | int ret = 0; | ||
1039 | |||
1040 | if (!capable(CAP_SYS_ADMIN)) | ||
1041 | return -EPERM; | ||
1042 | |||
1043 | if (file->private_data) { | ||
1044 | ret = -EINPROGRESS; | ||
1045 | goto out; | ||
1046 | } | ||
1047 | |||
1048 | ret = mnt_want_write(file->f_path.mnt); | ||
1049 | if (ret) | ||
1050 | goto out; | ||
1051 | |||
1052 | mutex_lock(&root->fs_info->trans_mutex); | ||
1053 | root->fs_info->open_ioctl_trans++; | ||
1054 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1055 | |||
1056 | trans = btrfs_start_ioctl_transaction(root, 0); | ||
1057 | if (trans) | ||
1058 | file->private_data = trans; | ||
1059 | else | ||
1060 | ret = -ENOMEM; | ||
1061 | /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ | ||
1062 | out: | ||
1063 | return ret; | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
1067 | * there are many ways the trans_start and trans_end ioctls can lead | ||
1068 | * to deadlocks. They should only be used by applications that | ||
1069 | * basically own the machine, and have a very in-depth understanding | ||
1070 | * of all the possible deadlocks and enospc problems. | ||
1071 | */ | ||
1072 | long btrfs_ioctl_trans_end(struct file *file) | ||
1073 | { | ||
1074 | struct inode *inode = fdentry(file)->d_inode; | ||
1075 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
1076 | struct btrfs_trans_handle *trans; | ||
1077 | int ret = 0; | ||
1078 | |||
1079 | trans = file->private_data; | ||
1080 | if (!trans) { | ||
1081 | ret = -EINVAL; | ||
1082 | goto out; | ||
1083 | } | ||
1084 | btrfs_end_transaction(trans, root); | ||
1085 | file->private_data = NULL; | ||
1086 | |||
1087 | mutex_lock(&root->fs_info->trans_mutex); | ||
1088 | root->fs_info->open_ioctl_trans--; | ||
1089 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1090 | |||
1091 | mnt_drop_write(file->f_path.mnt); | ||
1092 | |||
1093 | out: | ||
1094 | return ret; | ||
1095 | } | ||
1096 | |||
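/*
 * Editor's sketch (not part of this commit): the expected userspace
 * pairing of the two ioctls above. As the comments warn, an open
 * ioctl transaction can deadlock the filesystem; fd is assumed to be
 * a writable file descriptor on a btrfs file.
 */
#include <sys/ioctl.h>
#include "ioctl.h"	/* BTRFS_IOC_TRANS_START, BTRFS_IOC_TRANS_END */

static int do_transactional_work(int fd)
{
	int ret = ioctl(fd, BTRFS_IOC_TRANS_START, 0);

	if (ret)
		return ret;
	/* ... writes issued here land in a single transaction ... */
	return ioctl(fd, BTRFS_IOC_TRANS_END, 0);
}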
1097 | long btrfs_ioctl(struct file *file, unsigned int | ||
1098 | cmd, unsigned long arg) | ||
1099 | { | ||
1100 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
1101 | void __user *argp = (void __user *)arg; | ||
1102 | |||
1103 | switch (cmd) { | ||
1104 | case BTRFS_IOC_SNAP_CREATE: | ||
1105 | return btrfs_ioctl_snap_create(file, argp, 0); | ||
1106 | case BTRFS_IOC_SUBVOL_CREATE: | ||
1107 | return btrfs_ioctl_snap_create(file, argp, 1); | ||
1108 | case BTRFS_IOC_DEFRAG: | ||
1109 | return btrfs_ioctl_defrag(file); | ||
1110 | case BTRFS_IOC_RESIZE: | ||
1111 | return btrfs_ioctl_resize(root, argp); | ||
1112 | case BTRFS_IOC_ADD_DEV: | ||
1113 | return btrfs_ioctl_add_dev(root, argp); | ||
1114 | case BTRFS_IOC_RM_DEV: | ||
1115 | return btrfs_ioctl_rm_dev(root, argp); | ||
1116 | case BTRFS_IOC_BALANCE: | ||
1117 | return btrfs_balance(root->fs_info->dev_root); | ||
1118 | case BTRFS_IOC_CLONE: | ||
1119 | return btrfs_ioctl_clone(file, arg, 0, 0, 0); | ||
1120 | case BTRFS_IOC_CLONE_RANGE: | ||
1121 | return btrfs_ioctl_clone_range(file, argp); | ||
1122 | case BTRFS_IOC_TRANS_START: | ||
1123 | return btrfs_ioctl_trans_start(file); | ||
1124 | case BTRFS_IOC_TRANS_END: | ||
1125 | return btrfs_ioctl_trans_end(file); | ||
1126 | case BTRFS_IOC_SYNC: | ||
1127 | btrfs_sync_fs(file->f_dentry->d_sb, 1); | ||
1128 | return 0; | ||
1129 | } | ||
1130 | |||
1131 | return -ENOTTY; | ||
1132 | } | ||
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 000000000000..78049ea208db --- /dev/null +++ b/fs/btrfs/ioctl.h | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __IOCTL_ | ||
20 | #define __IOCTL_ | ||
21 | #include <linux/ioctl.h> | ||
22 | |||
23 | #define BTRFS_IOCTL_MAGIC 0x94 | ||
24 | #define BTRFS_VOL_NAME_MAX 255 | ||
25 | #define BTRFS_PATH_NAME_MAX 3072 | ||
26 | |||
27 | struct btrfs_ioctl_vol_args { | ||
28 | __s64 fd; | ||
29 | char name[BTRFS_PATH_NAME_MAX + 1]; | ||
30 | }; | ||
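/*
 * Editor's note: name[] is overloaded per ioctl -- a snapshot or
 * subvolume name for SNAP_CREATE/SUBVOL_CREATE, a device path for
 * ADD_DEV/RM_DEV/SCAN_DEV, and a "[devid:]size" string for RESIZE.
 */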
31 | |||
32 | #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ | ||
33 | struct btrfs_ioctl_vol_args) | ||
34 | #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ | ||
35 | struct btrfs_ioctl_vol_args) | ||
36 | #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ | ||
37 | struct btrfs_ioctl_vol_args) | ||
38 | #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ | ||
39 | struct btrfs_ioctl_vol_args) | ||
40 | /* trans start and trans end are dangerous, and only for | ||
41 | * use by applications that know how to avoid the | ||
42 | * resulting deadlocks | ||
43 | */ | ||
44 | #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) | ||
45 | #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) | ||
46 | #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) | ||
47 | |||
48 | #define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) | ||
49 | #define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ | ||
50 | struct btrfs_ioctl_vol_args) | ||
51 | #define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ | ||
52 | struct btrfs_ioctl_vol_args) | ||
53 | #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ | ||
54 | struct btrfs_ioctl_vol_args) | ||
55 | struct btrfs_ioctl_clone_range_args { | ||
56 | __s64 src_fd; | ||
57 | __u64 src_offset, src_length; | ||
58 | __u64 dest_offset; | ||
59 | }; | ||
60 | |||
61 | #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ | ||
62 | struct btrfs_ioctl_clone_range_args) | ||
63 | |||
64 | #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ | ||
65 | struct btrfs_ioctl_vol_args) | ||
66 | |||
67 | #endif | ||
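/*
 * Editor's sketch (not part of this header): cloning the first MiB of
 * one file into another with BTRFS_IOC_CLONE_RANGE. Paths are
 * hypothetical; both files must be on the same btrfs, the source must
 * be at least 1 MiB, and offsets/length must be block aligned, as
 * enforced by btrfs_ioctl_clone().
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"

static int clone_first_mib(const char *src, const char *dst)
{
	struct btrfs_ioctl_clone_range_args args;
	int sfd = open(src, O_RDONLY);
	int dfd = open(dst, O_WRONLY | O_CREAT, 0644);
	int ret = -1;

	if (sfd >= 0 && dfd >= 0) {
		args.src_fd = sfd;
		args.src_offset = 0;
		args.src_length = 1024 * 1024;
		args.dest_offset = 0;
		ret = ioctl(dfd, BTRFS_IOC_CLONE_RANGE, &args);
	}
	if (sfd >= 0)
		close(sfd);
	if (dfd >= 0)
		close(dfd);
	return ret;
}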
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 000000000000..39bae7761db6 --- /dev/null +++ b/fs/btrfs/locking.c | |||
@@ -0,0 +1,88 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/gfp.h> | ||
20 | #include <linux/pagemap.h> | ||
21 | #include <linux/spinlock.h> | ||
22 | #include <linux/page-flags.h> | ||
23 | #include <asm/bug.h> | ||
24 | #include "ctree.h" | ||
25 | #include "extent_io.h" | ||
26 | #include "locking.h" | ||
27 | |||
28 | /* | ||
29 | * locks the per buffer mutex in an extent buffer. This uses adaptive locks | ||
30 | * and the spin is not tuned very extensively. The spinning does make a big | ||
31 | * difference in almost every workload, but spinning for the right amount of | ||
32 | * time needs some help. | ||
33 | * | ||
34 | * In general, we want to spin as long as the lock holder is doing btree | ||
35 | * searches, and we should give up if they are in more expensive code. | ||
36 | */ | ||
37 | |||
38 | int btrfs_tree_lock(struct extent_buffer *eb) | ||
39 | { | ||
40 | int i; | ||
41 | |||
42 | if (mutex_trylock(&eb->mutex)) | ||
43 | return 0; | ||
44 | for (i = 0; i < 512; i++) { | ||
45 | cpu_relax(); | ||
46 | if (mutex_trylock(&eb->mutex)) | ||
47 | return 0; | ||
48 | } | ||
49 | cpu_relax(); | ||
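	/*
	 * editor's note: the lockdep subclass is derived from the node's
	 * level, so a top-down walk takes each mutex with a distinct
	 * subclass and lockdep does not report false recursion.
	 */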
50 | mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); | ||
51 | return 0; | ||
52 | } | ||
53 | |||
54 | int btrfs_try_tree_lock(struct extent_buffer *eb) | ||
55 | { | ||
56 | return mutex_trylock(&eb->mutex); | ||
57 | } | ||
58 | |||
59 | int btrfs_tree_unlock(struct extent_buffer *eb) | ||
60 | { | ||
61 | mutex_unlock(&eb->mutex); | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | int btrfs_tree_locked(struct extent_buffer *eb) | ||
66 | { | ||
67 | return mutex_is_locked(&eb->mutex); | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * btrfs_search_slot uses this to decide if it should drop its locks | ||
72 | * before doing something expensive like allocating free blocks for cow. | ||
73 | */ | ||
74 | int btrfs_path_lock_waiting(struct btrfs_path *path, int level) | ||
75 | { | ||
76 | int i; | ||
77 | struct extent_buffer *eb; | ||
78 | for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { | ||
79 | eb = path->nodes[i]; | ||
80 | if (!eb) | ||
81 | break; | ||
82 | smp_mb(); | ||
83 | if (!list_empty(&eb->mutex.wait_list)) | ||
84 | return 1; | ||
85 | } | ||
86 | return 0; | ||
87 | } | ||
88 | |||
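/*
 * Editor's sketch of the intended calling pattern; "eb" and the body
 * are placeholders. btrfs_tree_lock() always succeeds (it spins, then
 * blocks), so it pairs unconditionally with btrfs_tree_unlock().
 */
static void modify_node(struct extent_buffer *eb)
{
	btrfs_tree_lock(eb);
	WARN_ON(!btrfs_tree_locked(eb));
	/* ... modify the extent buffer ... */
	btrfs_tree_unlock(eb);
}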
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 000000000000..bc1faef12519 --- /dev/null +++ b/fs/btrfs/locking.h | |||
@@ -0,0 +1,27 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_LOCKING_ | ||
20 | #define __BTRFS_LOCKING_ | ||
21 | |||
22 | int btrfs_tree_lock(struct extent_buffer *eb); | ||
23 | int btrfs_tree_unlock(struct extent_buffer *eb); | ||
24 | int btrfs_tree_locked(struct extent_buffer *eb); | ||
25 | int btrfs_try_tree_lock(struct extent_buffer *eb); | ||
26 | int btrfs_path_lock_waiting(struct btrfs_path *path, int level); | ||
27 | #endif | ||
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 000000000000..a20940170274 --- /dev/null +++ b/fs/btrfs/ordered-data.c | |||
@@ -0,0 +1,730 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/gfp.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/writeback.h> | ||
23 | #include <linux/pagevec.h> | ||
24 | #include "ctree.h" | ||
25 | #include "transaction.h" | ||
26 | #include "btrfs_inode.h" | ||
27 | #include "extent_io.h" | ||
28 | |||
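/*
 * Editor's note: entry_end() clamps to (u64)-1 when file_offset + len
 * would overflow, so a huge length cannot wrap the range comparisons
 * below.
 */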
29 | static u64 entry_end(struct btrfs_ordered_extent *entry) | ||
30 | { | ||
31 | if (entry->file_offset + entry->len < entry->file_offset) | ||
32 | return (u64)-1; | ||
33 | return entry->file_offset + entry->len; | ||
34 | } | ||
35 | |||
36 | /* returns NULL if the insertion worked, otherwise returns the existing | ||
37 | * node that overlaps the new entry | ||
38 | */ | ||
39 | static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, | ||
40 | struct rb_node *node) | ||
41 | { | ||
42 | struct rb_node **p = &root->rb_node; | ||
43 | struct rb_node *parent = NULL; | ||
44 | struct btrfs_ordered_extent *entry; | ||
45 | |||
46 | while (*p) { | ||
47 | parent = *p; | ||
48 | entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); | ||
49 | |||
50 | if (file_offset < entry->file_offset) | ||
51 | p = &(*p)->rb_left; | ||
52 | else if (file_offset >= entry_end(entry)) | ||
53 | p = &(*p)->rb_right; | ||
54 | else | ||
55 | return parent; | ||
56 | } | ||
57 | |||
58 | rb_link_node(node, parent, p); | ||
59 | rb_insert_color(node, root); | ||
60 | return NULL; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * look for a given offset in the tree, and if it can't be found return the | ||
65 | * first lesser offset | ||
66 | */ | ||
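/*
 * Worked example (editor's note): with entries covering [0,4k) and
 * [8k,12k), a search for 6k matches nothing, so *prev_ret is set to
 * the [0,4k) node -- the last entry that ends at or before the offset.
 */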
67 | static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, | ||
68 | struct rb_node **prev_ret) | ||
69 | { | ||
70 | struct rb_node *n = root->rb_node; | ||
71 | struct rb_node *prev = NULL; | ||
72 | struct rb_node *test; | ||
73 | struct btrfs_ordered_extent *entry; | ||
74 | struct btrfs_ordered_extent *prev_entry = NULL; | ||
75 | |||
76 | while (n) { | ||
77 | entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); | ||
78 | prev = n; | ||
79 | prev_entry = entry; | ||
80 | |||
81 | if (file_offset < entry->file_offset) | ||
82 | n = n->rb_left; | ||
83 | else if (file_offset >= entry_end(entry)) | ||
84 | n = n->rb_right; | ||
85 | else | ||
86 | return n; | ||
87 | } | ||
88 | if (!prev_ret) | ||
89 | return NULL; | ||
90 | |||
91 | while (prev && file_offset >= entry_end(prev_entry)) { | ||
92 | test = rb_next(prev); | ||
93 | if (!test) | ||
94 | break; | ||
95 | prev_entry = rb_entry(test, struct btrfs_ordered_extent, | ||
96 | rb_node); | ||
97 | if (file_offset < entry_end(prev_entry)) | ||
98 | break; | ||
99 | |||
100 | prev = test; | ||
101 | } | ||
102 | if (prev) | ||
103 | prev_entry = rb_entry(prev, struct btrfs_ordered_extent, | ||
104 | rb_node); | ||
105 | while (prev && file_offset < entry_end(prev_entry)) { | ||
106 | test = rb_prev(prev); | ||
107 | if (!test) | ||
108 | break; | ||
109 | prev_entry = rb_entry(test, struct btrfs_ordered_extent, | ||
110 | rb_node); | ||
111 | prev = test; | ||
112 | } | ||
113 | *prev_ret = prev; | ||
114 | return NULL; | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * helper to check if a given offset is inside a given entry | ||
119 | */ | ||
120 | static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) | ||
121 | { | ||
122 | if (file_offset < entry->file_offset || | ||
123 | entry->file_offset + entry->len <= file_offset) | ||
124 | return 0; | ||
125 | return 1; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * find the first ordered struct that contains this offset, otherwise | ||
130 | * the first one less than this offset | ||
131 | */ | ||
132 | static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, | ||
133 | u64 file_offset) | ||
134 | { | ||
135 | struct rb_root *root = &tree->tree; | ||
136 | struct rb_node *prev; | ||
137 | struct rb_node *ret; | ||
138 | struct btrfs_ordered_extent *entry; | ||
139 | |||
140 | if (tree->last) { | ||
141 | entry = rb_entry(tree->last, struct btrfs_ordered_extent, | ||
142 | rb_node); | ||
143 | if (offset_in_entry(entry, file_offset)) | ||
144 | return tree->last; | ||
145 | } | ||
146 | ret = __tree_search(root, file_offset, &prev); | ||
147 | if (!ret) | ||
148 | ret = prev; | ||
149 | if (ret) | ||
150 | tree->last = ret; | ||
151 | return ret; | ||
152 | } | ||
153 | |||
154 | /* allocate and add a new ordered_extent into the per-inode tree. | ||
155 | * file_offset is the logical offset in the file | ||
156 | * | ||
157 | * start is the disk block number of an extent already reserved in the | ||
158 | * extent allocation tree | ||
159 | * | ||
160 | * len is the length of the extent | ||
161 | * | ||
162 | * This also sets the EXTENT_ORDERED bit on the range in the inode. | ||
163 | * | ||
164 | * The tree is given a single reference on the ordered extent that was | ||
165 | * inserted. | ||
166 | */ | ||
167 | int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | ||
168 | u64 start, u64 len, u64 disk_len, int type) | ||
169 | { | ||
170 | struct btrfs_ordered_inode_tree *tree; | ||
171 | struct rb_node *node; | ||
172 | struct btrfs_ordered_extent *entry; | ||
173 | |||
174 | tree = &BTRFS_I(inode)->ordered_tree; | ||
175 | entry = kzalloc(sizeof(*entry), GFP_NOFS); | ||
176 | if (!entry) | ||
177 | return -ENOMEM; | ||
178 | |||
179 | mutex_lock(&tree->mutex); | ||
180 | entry->file_offset = file_offset; | ||
181 | entry->start = start; | ||
182 | entry->len = len; | ||
183 | entry->disk_len = disk_len; | ||
184 | entry->inode = inode; | ||
185 | if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) | ||
186 | set_bit(type, &entry->flags); | ||
187 | |||
188 | /* one ref for the tree */ | ||
189 | atomic_set(&entry->refs, 1); | ||
190 | init_waitqueue_head(&entry->wait); | ||
191 | INIT_LIST_HEAD(&entry->list); | ||
192 | INIT_LIST_HEAD(&entry->root_extent_list); | ||
193 | |||
194 | node = tree_insert(&tree->tree, file_offset, | ||
195 | &entry->rb_node); | ||
196 | BUG_ON(node); | ||
197 | |||
198 | set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, | ||
199 | entry_end(entry) - 1, GFP_NOFS); | ||
200 | |||
201 | spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | ||
202 | list_add_tail(&entry->root_extent_list, | ||
203 | &BTRFS_I(inode)->root->fs_info->ordered_extents); | ||
204 | spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | ||
205 | |||
206 | mutex_unlock(&tree->mutex); | ||
207 | BUG_ON(node); | ||
208 | return 0; | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * Add a struct btrfs_ordered_sum into the list of checksums to be inserted | ||
213 | * when an ordered extent is finished. If the list covers more than one | ||
214 | * ordered extent, it is split across multiples. | ||
215 | */ | ||
216 | int btrfs_add_ordered_sum(struct inode *inode, | ||
217 | struct btrfs_ordered_extent *entry, | ||
218 | struct btrfs_ordered_sum *sum) | ||
219 | { | ||
220 | struct btrfs_ordered_inode_tree *tree; | ||
221 | |||
222 | tree = &BTRFS_I(inode)->ordered_tree; | ||
223 | mutex_lock(&tree->mutex); | ||
224 | list_add_tail(&sum->list, &entry->list); | ||
225 | mutex_unlock(&tree->mutex); | ||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * this is used to account for finished IO across a given range | ||
231 | * of the file. The IO should not span ordered extents. If | ||
232 | * a given ordered_extent is completely done, 1 is returned, otherwise | ||
233 | * 0. | ||
234 | * | ||
235 | * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used | ||
236 | * to make sure this function only returns 1 once for a given ordered extent. | ||
237 | */ | ||
238 | int btrfs_dec_test_ordered_pending(struct inode *inode, | ||
239 | u64 file_offset, u64 io_size) | ||
240 | { | ||
241 | struct btrfs_ordered_inode_tree *tree; | ||
242 | struct rb_node *node; | ||
243 | struct btrfs_ordered_extent *entry; | ||
244 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
245 | int ret; | ||
246 | |||
247 | tree = &BTRFS_I(inode)->ordered_tree; | ||
248 | mutex_lock(&tree->mutex); | ||
249 | clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, | ||
250 | GFP_NOFS); | ||
251 | node = tree_search(tree, file_offset); | ||
252 | if (!node) { | ||
253 | ret = 1; | ||
254 | goto out; | ||
255 | } | ||
256 | |||
257 | entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); | ||
258 | if (!offset_in_entry(entry, file_offset)) { | ||
259 | ret = 1; | ||
260 | goto out; | ||
261 | } | ||
262 | |||
263 | ret = test_range_bit(io_tree, entry->file_offset, | ||
264 | entry->file_offset + entry->len - 1, | ||
265 | EXTENT_ORDERED, 0); | ||
266 | if (ret == 0) | ||
267 | ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); | ||
268 | out: | ||
269 | mutex_unlock(&tree->mutex); | ||
270 | return ret == 0; | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * used to drop a reference on an ordered extent. This will free | ||
275 | * the extent if the last reference is dropped | ||
276 | */ | ||
277 | int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) | ||
278 | { | ||
279 | struct list_head *cur; | ||
280 | struct btrfs_ordered_sum *sum; | ||
281 | |||
282 | if (atomic_dec_and_test(&entry->refs)) { | ||
283 | while (!list_empty(&entry->list)) { | ||
284 | cur = entry->list.next; | ||
285 | sum = list_entry(cur, struct btrfs_ordered_sum, list); | ||
286 | list_del(&sum->list); | ||
287 | kfree(sum); | ||
288 | } | ||
289 | kfree(entry); | ||
290 | } | ||
291 | return 0; | ||
292 | } | ||
293 | |||
294 | /* | ||
295 | * remove an ordered extent from the tree. No references are dropped | ||
296 | * but, anyone waiting on this extent is woken up. | ||
297 | */ | ||
298 | int btrfs_remove_ordered_extent(struct inode *inode, | ||
299 | struct btrfs_ordered_extent *entry) | ||
300 | { | ||
301 | struct btrfs_ordered_inode_tree *tree; | ||
302 | struct rb_node *node; | ||
303 | |||
304 | tree = &BTRFS_I(inode)->ordered_tree; | ||
305 | mutex_lock(&tree->mutex); | ||
306 | node = &entry->rb_node; | ||
307 | rb_erase(node, &tree->tree); | ||
308 | tree->last = NULL; | ||
309 | set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); | ||
310 | |||
311 | spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | ||
312 | list_del_init(&entry->root_extent_list); | ||
313 | spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | ||
314 | |||
315 | mutex_unlock(&tree->mutex); | ||
316 | wake_up(&entry->wait); | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | /* | ||
321 | * wait for all the ordered extents in a root. This is done when balancing | ||
322 | * space between drives. | ||
323 | */ | ||
324 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) | ||
325 | { | ||
326 | struct list_head splice; | ||
327 | struct list_head *cur; | ||
328 | struct btrfs_ordered_extent *ordered; | ||
329 | struct inode *inode; | ||
330 | |||
331 | INIT_LIST_HEAD(&splice); | ||
332 | |||
333 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
334 | list_splice_init(&root->fs_info->ordered_extents, &splice); | ||
335 | while (!list_empty(&splice)) { | ||
336 | cur = splice.next; | ||
337 | ordered = list_entry(cur, struct btrfs_ordered_extent, | ||
338 | root_extent_list); | ||
339 | if (nocow_only && | ||
340 | !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && | ||
341 | !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { | ||
342 | list_move(&ordered->root_extent_list, | ||
343 | &root->fs_info->ordered_extents); | ||
344 | cond_resched_lock(&root->fs_info->ordered_extent_lock); | ||
345 | continue; | ||
346 | } | ||
347 | |||
348 | list_del_init(&ordered->root_extent_list); | ||
349 | atomic_inc(&ordered->refs); | ||
350 | |||
351 | /* | ||
352 | * the inode may be getting freed (in sys_unlink path). | ||
353 | */ | ||
354 | inode = igrab(ordered->inode); | ||
355 | |||
356 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
357 | |||
358 | if (inode) { | ||
359 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
360 | btrfs_put_ordered_extent(ordered); | ||
361 | iput(inode); | ||
362 | } else { | ||
363 | btrfs_put_ordered_extent(ordered); | ||
364 | } | ||
365 | |||
366 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
367 | } | ||
368 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Used to start IO or wait for a given ordered extent to finish. | ||
374 | * | ||
375 | * If wait is one, this effectively waits on page writeback for all the pages | ||
376 | * in the extent, and it waits on the io completion code to insert | ||
377 | * metadata into the btree corresponding to the extent | ||
378 | */ | ||
379 | void btrfs_start_ordered_extent(struct inode *inode, | ||
380 | struct btrfs_ordered_extent *entry, | ||
381 | int wait) | ||
382 | { | ||
383 | u64 start = entry->file_offset; | ||
384 | u64 end = start + entry->len - 1; | ||
385 | |||
386 | /* | ||
387 | * pages in the range can be dirty, clean or writeback. We | ||
388 | * start IO on any dirty ones so the wait doesn't stall waiting | ||
389 | * for pdflush to find them | ||
390 | */ | ||
391 | btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); | ||
392 | if (wait) { | ||
393 | wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, | ||
394 | &entry->flags)); | ||
395 | } | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Used to wait on ordered extents across a large range of bytes. | ||
400 | */ | ||
401 | int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | ||
402 | { | ||
403 | u64 end; | ||
404 | u64 orig_end; | ||
405 | u64 wait_end; | ||
406 | struct btrfs_ordered_extent *ordered; | ||
407 | |||
408 | if (start + len < start) { | ||
409 | orig_end = INT_LIMIT(loff_t); | ||
410 | } else { | ||
411 | orig_end = start + len - 1; | ||
412 | if (orig_end > INT_LIMIT(loff_t)) | ||
413 | orig_end = INT_LIMIT(loff_t); | ||
414 | } | ||
415 | wait_end = orig_end; | ||
416 | again: | ||
417 | /* start IO across the range first to instantiate any delalloc | ||
418 | * extents | ||
419 | */ | ||
420 | btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); | ||
421 | |||
422 | /* The compression code will leave pages locked but return from | ||
423 | * writepage without setting the page writeback. Starting again | ||
424 | * with WB_SYNC_ALL will end up waiting for the IO to actually start. | ||
425 | */ | ||
426 | btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); | ||
427 | |||
428 | btrfs_wait_on_page_writeback_range(inode->i_mapping, | ||
429 | start >> PAGE_CACHE_SHIFT, | ||
430 | orig_end >> PAGE_CACHE_SHIFT); | ||
431 | |||
432 | end = orig_end; | ||
433 | while (1) { | ||
434 | ordered = btrfs_lookup_first_ordered_extent(inode, end); | ||
435 | if (!ordered) | ||
436 | break; | ||
437 | if (ordered->file_offset > orig_end) { | ||
438 | btrfs_put_ordered_extent(ordered); | ||
439 | break; | ||
440 | } | ||
441 | if (ordered->file_offset + ordered->len < start) { | ||
442 | btrfs_put_ordered_extent(ordered); | ||
443 | break; | ||
444 | } | ||
445 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
446 | end = ordered->file_offset; | ||
447 | btrfs_put_ordered_extent(ordered); | ||
448 | if (end == 0 || end == start) | ||
449 | break; | ||
450 | end--; | ||
451 | } | ||
452 | if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, | ||
453 | EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { | ||
454 | schedule_timeout(1); | ||
455 | goto again; | ||
456 | } | ||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * find an ordered extent corresponding to file_offset. return NULL if | ||
462 | * nothing is found, otherwise take a reference on the extent and return it | ||
463 | */ | ||
464 | struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, | ||
465 | u64 file_offset) | ||
466 | { | ||
467 | struct btrfs_ordered_inode_tree *tree; | ||
468 | struct rb_node *node; | ||
469 | struct btrfs_ordered_extent *entry = NULL; | ||
470 | |||
471 | tree = &BTRFS_I(inode)->ordered_tree; | ||
472 | mutex_lock(&tree->mutex); | ||
473 | node = tree_search(tree, file_offset); | ||
474 | if (!node) | ||
475 | goto out; | ||
476 | |||
477 | entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); | ||
478 | if (!offset_in_entry(entry, file_offset)) | ||
479 | entry = NULL; | ||
480 | if (entry) | ||
481 | atomic_inc(&entry->refs); | ||
482 | out: | ||
483 | mutex_unlock(&tree->mutex); | ||
484 | return entry; | ||
485 | } | ||
486 | |||
487 | /* | ||
488 | * lookup and return any extent before 'file_offset'. NULL is returned | ||
489 | * if none is found | ||
490 | */ | ||
491 | struct btrfs_ordered_extent * | ||
492 | btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) | ||
493 | { | ||
494 | struct btrfs_ordered_inode_tree *tree; | ||
495 | struct rb_node *node; | ||
496 | struct btrfs_ordered_extent *entry = NULL; | ||
497 | |||
498 | tree = &BTRFS_I(inode)->ordered_tree; | ||
499 | mutex_lock(&tree->mutex); | ||
500 | node = tree_search(tree, file_offset); | ||
501 | if (!node) | ||
502 | goto out; | ||
503 | |||
504 | entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); | ||
505 | atomic_inc(&entry->refs); | ||
506 | out: | ||
507 | mutex_unlock(&tree->mutex); | ||
508 | return entry; | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * After an extent is done, call this to conditionally update the on disk | ||
513 | * i_size. i_size is updated to cover any fully written part of the file. | ||
514 | */ | ||
515 | int btrfs_ordered_update_i_size(struct inode *inode, | ||
516 | struct btrfs_ordered_extent *ordered) | ||
517 | { | ||
518 | struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; | ||
519 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
520 | u64 disk_i_size; | ||
521 | u64 new_i_size; | ||
522 | u64 i_size_test; | ||
523 | struct rb_node *node; | ||
524 | struct btrfs_ordered_extent *test; | ||
525 | |||
526 | mutex_lock(&tree->mutex); | ||
527 | disk_i_size = BTRFS_I(inode)->disk_i_size; | ||
528 | |||
529 | /* | ||
530 | * if the disk i_size is already at the inode->i_size, or | ||
531 | * this ordered extent is inside the disk i_size, we're done | ||
532 | */ | ||
533 | if (disk_i_size >= inode->i_size || | ||
534 | ordered->file_offset + ordered->len <= disk_i_size) { | ||
535 | goto out; | ||
536 | } | ||
537 | |||
538 | /* | ||
539 | * we can't update the disk_i_size if there are delalloc bytes | ||
540 | * between disk_i_size and this ordered extent | ||
541 | */ | ||
542 | if (test_range_bit(io_tree, disk_i_size, | ||
543 | ordered->file_offset + ordered->len - 1, | ||
544 | EXTENT_DELALLOC, 0)) { | ||
545 | goto out; | ||
546 | } | ||
547 | /* | ||
548 | * walk backward from this ordered extent to disk_i_size. | ||
549 | * if we find an ordered extent then we can't update disk i_size | ||
550 | * yet | ||
551 | */ | ||
552 | node = &ordered->rb_node; | ||
553 | while (1) { | ||
554 | node = rb_prev(node); | ||
555 | if (!node) | ||
556 | break; | ||
557 | test = rb_entry(node, struct btrfs_ordered_extent, rb_node); | ||
558 | if (test->file_offset + test->len <= disk_i_size) | ||
559 | break; | ||
560 | if (test->file_offset >= inode->i_size) | ||
561 | break; | ||
562 | if (test->file_offset >= disk_i_size) | ||
563 | goto out; | ||
564 | } | ||
565 | new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); | ||
566 | |||
567 | /* | ||
568 | * at this point, we know we can safely update i_size to at least | ||
569 | * the offset from this ordered extent. But, we need to | ||
570 | * walk forward and see if ios from higher up in the file have | ||
571 | * finished. | ||
572 | */ | ||
573 | node = rb_next(&ordered->rb_node); | ||
574 | i_size_test = 0; | ||
575 | if (node) { | ||
576 | /* | ||
577 | * do we have an area where IO might have finished | ||
578 | * between our ordered extent and the next one? | ||
579 | */ | ||
580 | test = rb_entry(node, struct btrfs_ordered_extent, rb_node); | ||
581 | if (test->file_offset > entry_end(ordered)) | ||
582 | i_size_test = test->file_offset; | ||
583 | } else { | ||
584 | i_size_test = i_size_read(inode); | ||
585 | } | ||
586 | |||
587 | /* | ||
588 | * i_size_test is the end of a region after this ordered | ||
589 | * extent where there are no ordered extents. As long as there | ||
590 | * are no delalloc bytes in this area, it is safe to update | ||
591 | * disk_i_size to the end of the region. | ||
592 | */ | ||
593 | if (i_size_test > entry_end(ordered) && | ||
594 | !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, | ||
595 | EXTENT_DELALLOC, 0)) { | ||
596 | new_i_size = min_t(u64, i_size_test, i_size_read(inode)); | ||
597 | } | ||
598 | BTRFS_I(inode)->disk_i_size = new_i_size; | ||
599 | out: | ||
600 | mutex_unlock(&tree->mutex); | ||
601 | return 0; | ||
602 | } | ||
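entry_end() is a helper defined earlier in ordered-data.c; judging from how it is compared against file offsets above, it returns the first byte past the ordered extent. A sketch of the assumed definition:

	/* assumed shape of the helper used above (the authoritative copy
	 * is earlier in this file): first file byte past this extent */
	static inline u64 entry_end(struct btrfs_ordered_extent *entry)
	{
		return entry->file_offset + entry->len;
	}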
603 | |||
604 | /* | ||
605 | * search the ordered extents for one corresponding to 'offset' and | ||
606 | * try to find a checksum. This is used because we allow pages to | ||
607 | * be reclaimed before their checksum is actually put into the btree | ||
608 | */ | ||
609 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, | ||
610 | u32 *sum) | ||
611 | { | ||
612 | struct btrfs_ordered_sum *ordered_sum; | ||
613 | struct btrfs_sector_sum *sector_sums; | ||
614 | struct btrfs_ordered_extent *ordered; | ||
615 | struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; | ||
616 | struct list_head *cur; | ||
617 | unsigned long num_sectors; | ||
618 | unsigned long i; | ||
619 | u32 sectorsize = BTRFS_I(inode)->root->sectorsize; | ||
620 | int ret = 1; | ||
621 | |||
622 | ordered = btrfs_lookup_ordered_extent(inode, offset); | ||
623 | if (!ordered) | ||
624 | return 1; | ||
625 | |||
626 | mutex_lock(&tree->mutex); | ||
627 | list_for_each_prev(cur, &ordered->list) { | ||
628 | ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); | ||
629 | if (disk_bytenr >= ordered_sum->bytenr) { | ||
630 | num_sectors = ordered_sum->len / sectorsize; | ||
631 | sector_sums = ordered_sum->sums; | ||
632 | for (i = 0; i < num_sectors; i++) { | ||
633 | if (sector_sums[i].bytenr == disk_bytenr) { | ||
634 | *sum = sector_sums[i].sum; | ||
635 | ret = 0; | ||
636 | goto out; | ||
637 | } | ||
638 | } | ||
639 | } | ||
640 | } | ||
641 | out: | ||
642 | mutex_unlock(&tree->mutex); | ||
643 | btrfs_put_ordered_extent(ordered); | ||
644 | return ret; | ||
645 | } | ||
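A hedged sketch of a consumer: a read-completion path can ask this cache first, and fall back to the csum btree only when the extent is no longer in flight (file_pos and disk_bytenr are illustrative):

	u32 expected;

	if (btrfs_find_ordered_sum(inode, file_pos, disk_bytenr, &expected) == 0) {
		/* csum still queued in an in-flight ordered extent */
	} else {
		/* returned 1: not cached here, read the csum item from the btree */
	}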
646 | |||
647 | |||
648 | /** | ||
649 | * taken from mm/filemap.c because it isn't exported | ||
650 | * | ||
651 | * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range | ||
652 | * @mapping: address space structure to write | ||
653 | * @start: offset in bytes where the range starts | ||
654 | * @end: offset in bytes where the range ends (inclusive) | ||
655 | * @sync_mode: enable synchronous operation | ||
656 | * | ||
657 | * Start writeback against all of a mapping's dirty pages that lie | ||
658 | * within the byte offsets <start, end> inclusive. | ||
659 | * | ||
660 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | ||
661 | * opposed to a regular memory cleansing writeback. The difference between | ||
662 | * these two operations is that if a dirty page/buffer is encountered, it must | ||
663 | * be waited upon, and not just skipped over. | ||
664 | */ | ||
665 | int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, | ||
666 | loff_t end, int sync_mode) | ||
667 | { | ||
668 | struct writeback_control wbc = { | ||
669 | .sync_mode = sync_mode, | ||
670 | .nr_to_write = mapping->nrpages * 2, | ||
671 | .range_start = start, | ||
672 | .range_end = end, | ||
673 | .for_writepages = 1, | ||
674 | }; | ||
675 | return btrfs_writepages(mapping, &wbc); | ||
676 | } | ||
677 | |||
678 | /** | ||
679 | * taken from mm/filemap.c because it isn't exported | ||
680 | * | ||
681 | * wait_on_page_writeback_range - wait for writeback to complete | ||
682 | * @mapping: target address_space | ||
683 | * @start: beginning page index | ||
684 | * @end: ending page index | ||
685 | * | ||
686 | * Wait for writeback to complete against pages indexed by start->end | ||
687 | * inclusive | ||
688 | */ | ||
689 | int btrfs_wait_on_page_writeback_range(struct address_space *mapping, | ||
690 | pgoff_t start, pgoff_t end) | ||
691 | { | ||
692 | struct pagevec pvec; | ||
693 | int nr_pages; | ||
694 | int ret = 0; | ||
695 | pgoff_t index; | ||
696 | |||
697 | if (end < start) | ||
698 | return 0; | ||
699 | |||
700 | pagevec_init(&pvec, 0); | ||
701 | index = start; | ||
702 | while ((index <= end) && | ||
703 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
704 | PAGECACHE_TAG_WRITEBACK, | ||
705 | min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { | ||
706 | unsigned i; | ||
707 | |||
708 | for (i = 0; i < nr_pages; i++) { | ||
709 | struct page *page = pvec.pages[i]; | ||
710 | |||
711 | /* until radix tree lookup accepts end_index */ | ||
712 | if (page->index > end) | ||
713 | continue; | ||
714 | |||
715 | wait_on_page_writeback(page); | ||
716 | if (PageError(page)) | ||
717 | ret = -EIO; | ||
718 | } | ||
719 | pagevec_release(&pvec); | ||
720 | cond_resched(); | ||
721 | } | ||
722 | |||
723 | /* Check for outstanding write errors */ | ||
724 | if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | ||
725 | ret = -ENOSPC; | ||
726 | if (test_and_clear_bit(AS_EIO, &mapping->flags)) | ||
727 | ret = -EIO; | ||
728 | |||
729 | return ret; | ||
730 | } | ||
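Together the two helpers give a data-integrity flush over a byte range; the pairing used elsewhere in btrfs looks like this (start and end are byte offsets):

	/* kick off writeback for [start, end], then wait for it to drain */
	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
	btrfs_wait_on_page_writeback_range(inode->i_mapping,
					   start >> PAGE_CACHE_SHIFT,
					   end >> PAGE_CACHE_SHIFT);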
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 000000000000..ab66d5e8d6d6 --- /dev/null +++ b/fs/btrfs/ordered-data.h | |||
@@ -0,0 +1,158 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_ORDERED_DATA__ | ||
20 | #define __BTRFS_ORDERED_DATA__ | ||
21 | |||
22 | /* one of these per inode */ | ||
23 | struct btrfs_ordered_inode_tree { | ||
24 | struct mutex mutex; | ||
25 | struct rb_root tree; | ||
26 | struct rb_node *last; | ||
27 | }; | ||
28 | |||
29 | /* | ||
30 | * these are used to collect checksums done just before bios submission. | ||
31 | * They are attached via a list into the ordered extent, and | ||
32 | * checksum items are inserted into the tree after all the blocks in | ||
33 | * the ordered extent are on disk | ||
34 | */ | ||
35 | struct btrfs_sector_sum { | ||
36 | /* bytenr on disk */ | ||
37 | u64 bytenr; | ||
38 | u32 sum; | ||
39 | }; | ||
40 | |||
41 | struct btrfs_ordered_sum { | ||
42 | /* bytenr is the start of this extent on disk */ | ||
43 | u64 bytenr; | ||
44 | |||
45 | /* | ||
46 | * this is the length in bytes covered by the sums array below. | ||
47 | */ | ||
48 | unsigned long len; | ||
49 | struct list_head list; | ||
50 | /* last field is a variable length array of btrfs_sector_sums */ | ||
51 | struct btrfs_sector_sum sums[]; | ||
52 | }; | ||
53 | |||
54 | /* | ||
55 | * bits for the flags field: | ||
56 | * | ||
57 | * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. | ||
58 | * It is used to make sure metadata is inserted into the tree only once | ||
59 | * per extent. | ||
60 | * | ||
61 | * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the | ||
62 | * rbtree, just before waking any waiters. It is used to indicate the | ||
63 | * IO is done and any metadata is inserted into the tree. | ||
64 | */ | ||
65 | #define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ | ||
66 | |||
67 | #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ | ||
68 | |||
69 | #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ | ||
70 | |||
71 | #define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ | ||
72 | |||
73 | #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ | ||
74 | |||
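These constants are bit numbers rather than masks; they are meant for the atomic bitops on the flags word of the struct below. A sketch:

	/* set IO_DONE exactly once; test_and_set_bit() returns the old value */
	if (!test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags)) {
		/* first completer: safe to insert the extent's metadata */
	}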
75 | struct btrfs_ordered_extent { | ||
76 | /* logical offset in the file */ | ||
77 | u64 file_offset; | ||
78 | |||
79 | /* disk byte number */ | ||
80 | u64 start; | ||
81 | |||
82 | /* ram length of the extent in bytes */ | ||
83 | u64 len; | ||
84 | |||
85 | /* extent length on disk */ | ||
86 | u64 disk_len; | ||
87 | |||
88 | /* flags (described above) */ | ||
89 | unsigned long flags; | ||
90 | |||
91 | /* reference count */ | ||
92 | atomic_t refs; | ||
93 | |||
94 | /* the inode we belong to */ | ||
95 | struct inode *inode; | ||
96 | |||
97 | /* list of checksums for insertion when the extent io is done */ | ||
98 | struct list_head list; | ||
99 | |||
100 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ | ||
101 | wait_queue_head_t wait; | ||
102 | |||
103 | /* our friendly rbtree entry */ | ||
104 | struct rb_node rb_node; | ||
105 | |||
106 | /* a per root list of all the pending ordered extents */ | ||
107 | struct list_head root_extent_list; | ||
108 | }; | ||
109 | |||
110 | |||
111 | /* | ||
112 | * calculates the total size you need to allocate for an ordered sum | ||
113 | * structure spanning 'bytes' in the file | ||
114 | */ | ||
115 | static inline int btrfs_ordered_sum_size(struct btrfs_root *root, | ||
116 | unsigned long bytes) | ||
117 | { | ||
118 | unsigned long num_sectors = (bytes + root->sectorsize - 1) / | ||
119 | root->sectorsize; | ||
 120 | num_sectors++; /* allocate one extra sector_sum entry */ | ||
121 | return sizeof(struct btrfs_ordered_sum) + | ||
122 | num_sectors * sizeof(struct btrfs_sector_sum); | ||
123 | } | ||
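For example, with a 4 KiB sectorsize a 16 KiB range rounds to 4 sectors and the increment adds one spare entry, so the call below sizes room for 5 checksums:

	/* sizeof(struct btrfs_ordered_sum) + 5 * sizeof(struct btrfs_sector_sum) */
	size_t sz = btrfs_ordered_sum_size(root, 16384);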
124 | |||
125 | static inline void | ||
126 | btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) | ||
127 | { | ||
128 | mutex_init(&t->mutex); | ||
129 | t->tree.rb_node = NULL; | ||
130 | t->last = NULL; | ||
131 | } | ||
132 | |||
133 | int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); | ||
134 | int btrfs_remove_ordered_extent(struct inode *inode, | ||
135 | struct btrfs_ordered_extent *entry); | ||
136 | int btrfs_dec_test_ordered_pending(struct inode *inode, | ||
137 | u64 file_offset, u64 io_size); | ||
138 | int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | ||
 139 | u64 start, u64 len, u64 disk_len, int type); | ||
140 | int btrfs_add_ordered_sum(struct inode *inode, | ||
141 | struct btrfs_ordered_extent *entry, | ||
142 | struct btrfs_ordered_sum *sum); | ||
143 | struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, | ||
144 | u64 file_offset); | ||
145 | void btrfs_start_ordered_extent(struct inode *inode, | ||
146 | struct btrfs_ordered_extent *entry, int wait); | ||
147 | int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); | ||
148 | struct btrfs_ordered_extent * | ||
149 | btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); | ||
150 | int btrfs_ordered_update_i_size(struct inode *inode, | ||
151 | struct btrfs_ordered_extent *ordered); | ||
152 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); | ||
153 | int btrfs_wait_on_page_writeback_range(struct address_space *mapping, | ||
154 | pgoff_t start, pgoff_t end); | ||
155 | int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, | ||
156 | loff_t end, int sync_mode); | ||
157 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); | ||
158 | #endif | ||
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 000000000000..3c0d52af4f80 --- /dev/null +++ b/fs/btrfs/orphan.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include "ctree.h" | ||
20 | #include "disk-io.h" | ||
21 | |||
22 | int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, | ||
23 | struct btrfs_root *root, u64 offset) | ||
24 | { | ||
25 | struct btrfs_path *path; | ||
26 | struct btrfs_key key; | ||
27 | int ret = 0; | ||
28 | |||
29 | key.objectid = BTRFS_ORPHAN_OBJECTID; | ||
30 | btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); | ||
31 | key.offset = offset; | ||
32 | |||
33 | path = btrfs_alloc_path(); | ||
34 | if (!path) | ||
35 | return -ENOMEM; | ||
36 | |||
37 | ret = btrfs_insert_empty_item(trans, root, path, &key, 0); | ||
38 | |||
39 | btrfs_free_path(path); | ||
40 | return ret; | ||
41 | } | ||
42 | |||
43 | int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, | ||
44 | struct btrfs_root *root, u64 offset) | ||
45 | { | ||
46 | struct btrfs_path *path; | ||
47 | struct btrfs_key key; | ||
48 | int ret = 0; | ||
49 | |||
50 | key.objectid = BTRFS_ORPHAN_OBJECTID; | ||
51 | btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); | ||
52 | key.offset = offset; | ||
53 | |||
54 | path = btrfs_alloc_path(); | ||
55 | if (!path) | ||
56 | return -ENOMEM; | ||
57 | |||
58 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
59 | if (ret) | ||
60 | goto out; | ||
61 | |||
62 | ret = btrfs_del_item(trans, root, path); | ||
63 | |||
64 | out: | ||
65 | btrfs_free_path(path); | ||
66 | return ret; | ||
67 | } | ||
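A sketch of how the pair is meant to bracket an operation that must survive a crash (the call sites are assumptions; transaction setup is elided):

	/* mark the inode as orphaned before its items start disappearing */
	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
	/* ... truncate / unlink work that may crash part-way ... */
	/* clear the marker once the inode is fully gone */
	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);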
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 000000000000..5f8f218c1005 --- /dev/null +++ b/fs/btrfs/print-tree.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include "ctree.h" | ||
20 | #include "disk-io.h" | ||
21 | #include "print-tree.h" | ||
22 | |||
23 | static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) | ||
24 | { | ||
25 | int num_stripes = btrfs_chunk_num_stripes(eb, chunk); | ||
26 | int i; | ||
27 | printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " | ||
28 | "num_stripes %d\n", | ||
29 | (unsigned long long)btrfs_chunk_length(eb, chunk), | ||
30 | (unsigned long long)btrfs_chunk_owner(eb, chunk), | ||
31 | (unsigned long long)btrfs_chunk_type(eb, chunk), | ||
32 | num_stripes); | ||
33 | for (i = 0 ; i < num_stripes ; i++) { | ||
34 | printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, | ||
35 | (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), | ||
36 | (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); | ||
37 | } | ||
38 | } | ||
39 | static void print_dev_item(struct extent_buffer *eb, | ||
40 | struct btrfs_dev_item *dev_item) | ||
41 | { | ||
42 | printk(KERN_INFO "\t\tdev item devid %llu " | ||
43 | "total_bytes %llu bytes used %llu\n", | ||
44 | (unsigned long long)btrfs_device_id(eb, dev_item), | ||
45 | (unsigned long long)btrfs_device_total_bytes(eb, dev_item), | ||
46 | (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); | ||
47 | } | ||
48 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | ||
49 | { | ||
50 | int i; | ||
51 | u32 nr = btrfs_header_nritems(l); | ||
52 | struct btrfs_item *item; | ||
53 | struct btrfs_extent_item *ei; | ||
54 | struct btrfs_root_item *ri; | ||
55 | struct btrfs_dir_item *di; | ||
56 | struct btrfs_inode_item *ii; | ||
57 | struct btrfs_block_group_item *bi; | ||
58 | struct btrfs_file_extent_item *fi; | ||
59 | struct btrfs_key key; | ||
60 | struct btrfs_key found_key; | ||
61 | struct btrfs_extent_ref *ref; | ||
62 | struct btrfs_dev_extent *dev_extent; | ||
63 | u32 type; | ||
64 | |||
65 | printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", | ||
66 | (unsigned long long)btrfs_header_bytenr(l), nr, | ||
67 | btrfs_leaf_free_space(root, l)); | ||
68 | for (i = 0 ; i < nr ; i++) { | ||
69 | item = btrfs_item_nr(l, i); | ||
70 | btrfs_item_key_to_cpu(l, &key, i); | ||
71 | type = btrfs_key_type(&key); | ||
72 | printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " | ||
73 | "itemsize %d\n", | ||
74 | i, | ||
75 | (unsigned long long)key.objectid, type, | ||
76 | (unsigned long long)key.offset, | ||
77 | btrfs_item_offset(l, item), btrfs_item_size(l, item)); | ||
78 | switch (type) { | ||
79 | case BTRFS_INODE_ITEM_KEY: | ||
80 | ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); | ||
81 | printk(KERN_INFO "\t\tinode generation %llu size %llu " | ||
82 | "mode %o\n", | ||
83 | (unsigned long long) | ||
84 | btrfs_inode_generation(l, ii), | ||
85 | (unsigned long long)btrfs_inode_size(l, ii), | ||
86 | btrfs_inode_mode(l, ii)); | ||
87 | break; | ||
88 | case BTRFS_DIR_ITEM_KEY: | ||
89 | di = btrfs_item_ptr(l, i, struct btrfs_dir_item); | ||
90 | btrfs_dir_item_key_to_cpu(l, di, &found_key); | ||
91 | printk(KERN_INFO "\t\tdir oid %llu type %u\n", | ||
92 | (unsigned long long)found_key.objectid, | ||
93 | btrfs_dir_type(l, di)); | ||
94 | break; | ||
95 | case BTRFS_ROOT_ITEM_KEY: | ||
96 | ri = btrfs_item_ptr(l, i, struct btrfs_root_item); | ||
97 | printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", | ||
98 | (unsigned long long) | ||
99 | btrfs_disk_root_bytenr(l, ri), | ||
100 | btrfs_disk_root_refs(l, ri)); | ||
101 | break; | ||
102 | case BTRFS_EXTENT_ITEM_KEY: | ||
103 | ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); | ||
104 | printk(KERN_INFO "\t\textent data refs %u\n", | ||
105 | btrfs_extent_refs(l, ei)); | ||
106 | break; | ||
107 | case BTRFS_EXTENT_REF_KEY: | ||
108 | ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); | ||
109 | printk(KERN_INFO "\t\textent back ref root %llu " | ||
110 | "gen %llu owner %llu num_refs %lu\n", | ||
111 | (unsigned long long)btrfs_ref_root(l, ref), | ||
112 | (unsigned long long)btrfs_ref_generation(l, ref), | ||
113 | (unsigned long long)btrfs_ref_objectid(l, ref), | ||
114 | (unsigned long)btrfs_ref_num_refs(l, ref)); | ||
115 | break; | ||
116 | |||
117 | case BTRFS_EXTENT_DATA_KEY: | ||
118 | fi = btrfs_item_ptr(l, i, | ||
119 | struct btrfs_file_extent_item); | ||
120 | if (btrfs_file_extent_type(l, fi) == | ||
121 | BTRFS_FILE_EXTENT_INLINE) { | ||
122 | printk(KERN_INFO "\t\tinline extent data " | ||
123 | "size %u\n", | ||
124 | btrfs_file_extent_inline_len(l, fi)); | ||
125 | break; | ||
126 | } | ||
127 | printk(KERN_INFO "\t\textent data disk bytenr %llu " | ||
128 | "nr %llu\n", | ||
129 | (unsigned long long) | ||
130 | btrfs_file_extent_disk_bytenr(l, fi), | ||
131 | (unsigned long long) | ||
132 | btrfs_file_extent_disk_num_bytes(l, fi)); | ||
133 | printk(KERN_INFO "\t\textent data offset %llu " | ||
134 | "nr %llu ram %llu\n", | ||
135 | (unsigned long long) | ||
136 | btrfs_file_extent_offset(l, fi), | ||
137 | (unsigned long long) | ||
138 | btrfs_file_extent_num_bytes(l, fi), | ||
139 | (unsigned long long) | ||
140 | btrfs_file_extent_ram_bytes(l, fi)); | ||
141 | break; | ||
142 | case BTRFS_BLOCK_GROUP_ITEM_KEY: | ||
143 | bi = btrfs_item_ptr(l, i, | ||
144 | struct btrfs_block_group_item); | ||
145 | printk(KERN_INFO "\t\tblock group used %llu\n", | ||
146 | (unsigned long long) | ||
147 | btrfs_disk_block_group_used(l, bi)); | ||
148 | break; | ||
149 | case BTRFS_CHUNK_ITEM_KEY: | ||
150 | print_chunk(l, btrfs_item_ptr(l, i, | ||
151 | struct btrfs_chunk)); | ||
152 | break; | ||
153 | case BTRFS_DEV_ITEM_KEY: | ||
154 | print_dev_item(l, btrfs_item_ptr(l, i, | ||
155 | struct btrfs_dev_item)); | ||
156 | break; | ||
157 | case BTRFS_DEV_EXTENT_KEY: | ||
158 | dev_extent = btrfs_item_ptr(l, i, | ||
159 | struct btrfs_dev_extent); | ||
160 | printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" | ||
161 | "\t\tchunk objectid %llu chunk offset %llu " | ||
162 | "length %llu\n", | ||
163 | (unsigned long long) | ||
164 | btrfs_dev_extent_chunk_tree(l, dev_extent), | ||
165 | (unsigned long long) | ||
166 | btrfs_dev_extent_chunk_objectid(l, dev_extent), | ||
167 | (unsigned long long) | ||
168 | btrfs_dev_extent_chunk_offset(l, dev_extent), | ||
169 | (unsigned long long) | ||
170 | btrfs_dev_extent_length(l, dev_extent)); | ||
 171 | } | ||
172 | } | ||
173 | } | ||
174 | |||
175 | void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) | ||
176 | { | ||
177 | int i; u32 nr; | ||
178 | struct btrfs_key key; | ||
179 | int level; | ||
180 | |||
181 | if (!c) | ||
182 | return; | ||
183 | nr = btrfs_header_nritems(c); | ||
184 | level = btrfs_header_level(c); | ||
185 | if (level == 0) { | ||
186 | btrfs_print_leaf(root, c); | ||
187 | return; | ||
188 | } | ||
189 | printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", | ||
190 | (unsigned long long)btrfs_header_bytenr(c), | ||
191 | btrfs_header_level(c), nr, | ||
192 | (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); | ||
193 | for (i = 0; i < nr; i++) { | ||
194 | btrfs_node_key_to_cpu(c, &key, i); | ||
195 | printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", | ||
196 | i, | ||
197 | (unsigned long long)key.objectid, | ||
198 | key.type, | ||
199 | (unsigned long long)key.offset, | ||
200 | (unsigned long long)btrfs_node_blockptr(c, i)); | ||
201 | } | ||
202 | for (i = 0; i < nr; i++) { | ||
203 | struct extent_buffer *next = read_tree_block(root, | ||
204 | btrfs_node_blockptr(c, i), | ||
205 | btrfs_level_size(root, level - 1), | ||
206 | btrfs_node_ptr_generation(c, i)); | ||
207 | if (btrfs_is_leaf(next) && | ||
208 | btrfs_header_level(c) != 1) | ||
209 | BUG(); | ||
210 | if (btrfs_header_level(next) != | ||
211 | btrfs_header_level(c) - 1) | ||
212 | BUG(); | ||
213 | btrfs_print_tree(root, next); | ||
214 | free_extent_buffer(next); | ||
215 | } | ||
216 | } | ||
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 000000000000..da75efe534d5 --- /dev/null +++ b/fs/btrfs/print-tree.h | |||
@@ -0,0 +1,23 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __PRINT_TREE_ | ||
20 | #define __PRINT_TREE_ | ||
21 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); | ||
22 | void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); | ||
23 | #endif | ||
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 000000000000..6f0acc4c9eab --- /dev/null +++ b/fs/btrfs/ref-cache.c | |||
@@ -0,0 +1,230 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include "ctree.h" | ||
21 | #include "ref-cache.h" | ||
22 | #include "transaction.h" | ||
23 | |||
24 | /* | ||
25 | * leaf refs are used to cache the information about which extents | ||
26 | * a given leaf has references on. This allows us to process that leaf | ||
27 | * in btrfs_drop_snapshot without needing to read it back from disk. | ||
28 | */ | ||
29 | |||
30 | /* | ||
31 | * kmalloc a leaf reference struct and update the counters for the | ||
32 | * total ref cache size | ||
33 | */ | ||
34 | struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, | ||
35 | int nr_extents) | ||
36 | { | ||
37 | struct btrfs_leaf_ref *ref; | ||
38 | size_t size = btrfs_leaf_ref_size(nr_extents); | ||
39 | |||
40 | ref = kmalloc(size, GFP_NOFS); | ||
41 | if (ref) { | ||
42 | spin_lock(&root->fs_info->ref_cache_lock); | ||
43 | root->fs_info->total_ref_cache_size += size; | ||
44 | spin_unlock(&root->fs_info->ref_cache_lock); | ||
45 | |||
46 | memset(ref, 0, sizeof(*ref)); | ||
47 | atomic_set(&ref->usage, 1); | ||
48 | INIT_LIST_HEAD(&ref->list); | ||
49 | } | ||
50 | return ref; | ||
51 | } | ||
52 | |||
53 | /* | ||
54 | * free a leaf reference struct and update the counters for the | ||
55 | * total ref cache size | ||
56 | */ | ||
57 | void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) | ||
58 | { | ||
59 | if (!ref) | ||
60 | return; | ||
61 | WARN_ON(atomic_read(&ref->usage) == 0); | ||
62 | if (atomic_dec_and_test(&ref->usage)) { | ||
63 | size_t size = btrfs_leaf_ref_size(ref->nritems); | ||
64 | |||
65 | BUG_ON(ref->in_tree); | ||
66 | kfree(ref); | ||
67 | |||
68 | spin_lock(&root->fs_info->ref_cache_lock); | ||
69 | root->fs_info->total_ref_cache_size -= size; | ||
70 | spin_unlock(&root->fs_info->ref_cache_lock); | ||
71 | } | ||
72 | } | ||
73 | |||
74 | static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, | ||
75 | struct rb_node *node) | ||
76 | { | ||
77 | struct rb_node **p = &root->rb_node; | ||
78 | struct rb_node *parent = NULL; | ||
79 | struct btrfs_leaf_ref *entry; | ||
80 | |||
81 | while (*p) { | ||
82 | parent = *p; | ||
83 | entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); | ||
84 | |||
85 | if (bytenr < entry->bytenr) | ||
86 | p = &(*p)->rb_left; | ||
87 | else if (bytenr > entry->bytenr) | ||
88 | p = &(*p)->rb_right; | ||
89 | else | ||
90 | return parent; | ||
91 | } | ||
92 | |||
93 | entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); | ||
94 | rb_link_node(node, parent, p); | ||
95 | rb_insert_color(node, root); | ||
96 | return NULL; | ||
97 | } | ||
98 | |||
99 | static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) | ||
100 | { | ||
101 | struct rb_node *n = root->rb_node; | ||
102 | struct btrfs_leaf_ref *entry; | ||
103 | |||
104 | while (n) { | ||
105 | entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); | ||
106 | WARN_ON(!entry->in_tree); | ||
107 | |||
108 | if (bytenr < entry->bytenr) | ||
109 | n = n->rb_left; | ||
110 | else if (bytenr > entry->bytenr) | ||
111 | n = n->rb_right; | ||
112 | else | ||
113 | return n; | ||
114 | } | ||
115 | return NULL; | ||
116 | } | ||
117 | |||
118 | int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, | ||
119 | int shared) | ||
120 | { | ||
121 | struct btrfs_leaf_ref *ref = NULL; | ||
122 | struct btrfs_leaf_ref_tree *tree = root->ref_tree; | ||
123 | |||
124 | if (shared) | ||
125 | tree = &root->fs_info->shared_ref_tree; | ||
126 | if (!tree) | ||
127 | return 0; | ||
128 | |||
129 | spin_lock(&tree->lock); | ||
130 | while (!list_empty(&tree->list)) { | ||
131 | ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list); | ||
132 | BUG_ON(ref->tree != tree); | ||
133 | if (ref->root_gen > max_root_gen) | ||
134 | break; | ||
135 | if (!xchg(&ref->in_tree, 0)) { | ||
136 | cond_resched_lock(&tree->lock); | ||
137 | continue; | ||
138 | } | ||
139 | |||
140 | rb_erase(&ref->rb_node, &tree->root); | ||
141 | list_del_init(&ref->list); | ||
142 | |||
143 | spin_unlock(&tree->lock); | ||
144 | btrfs_free_leaf_ref(root, ref); | ||
145 | cond_resched(); | ||
146 | spin_lock(&tree->lock); | ||
147 | } | ||
148 | spin_unlock(&tree->lock); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * find the leaf ref for a given extent. This returns the ref struct with | ||
154 | * a usage reference incremented | ||
155 | */ | ||
156 | struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, | ||
157 | u64 bytenr) | ||
158 | { | ||
159 | struct rb_node *rb; | ||
160 | struct btrfs_leaf_ref *ref = NULL; | ||
161 | struct btrfs_leaf_ref_tree *tree = root->ref_tree; | ||
162 | again: | ||
163 | if (tree) { | ||
164 | spin_lock(&tree->lock); | ||
165 | rb = tree_search(&tree->root, bytenr); | ||
166 | if (rb) | ||
167 | ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); | ||
168 | if (ref) | ||
169 | atomic_inc(&ref->usage); | ||
170 | spin_unlock(&tree->lock); | ||
171 | if (ref) | ||
172 | return ref; | ||
173 | } | ||
174 | if (tree != &root->fs_info->shared_ref_tree) { | ||
175 | tree = &root->fs_info->shared_ref_tree; | ||
176 | goto again; | ||
177 | } | ||
178 | return NULL; | ||
179 | } | ||
180 | |||
181 | /* | ||
 182 | * add a fully filled in leaf ref struct to the cache; the tree takes | ||
 183 | * its own reference, and -EEXIST is returned if one already exists | ||
184 | */ | ||
185 | int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, | ||
186 | int shared) | ||
187 | { | ||
188 | int ret = 0; | ||
189 | struct rb_node *rb; | ||
190 | struct btrfs_leaf_ref_tree *tree = root->ref_tree; | ||
191 | |||
192 | if (shared) | ||
193 | tree = &root->fs_info->shared_ref_tree; | ||
194 | |||
195 | spin_lock(&tree->lock); | ||
196 | rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node); | ||
197 | if (rb) { | ||
198 | ret = -EEXIST; | ||
199 | } else { | ||
200 | atomic_inc(&ref->usage); | ||
201 | ref->tree = tree; | ||
202 | ref->in_tree = 1; | ||
203 | list_add_tail(&ref->list, &tree->list); | ||
204 | } | ||
205 | spin_unlock(&tree->lock); | ||
206 | return ret; | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * remove a single leaf ref from the tree. This drops the ref held by the tree | ||
211 | * only | ||
212 | */ | ||
213 | int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) | ||
214 | { | ||
215 | struct btrfs_leaf_ref_tree *tree; | ||
216 | |||
217 | if (!xchg(&ref->in_tree, 0)) | ||
218 | return 0; | ||
219 | |||
220 | tree = ref->tree; | ||
221 | spin_lock(&tree->lock); | ||
222 | |||
223 | rb_erase(&ref->rb_node, &tree->root); | ||
224 | list_del_init(&ref->list); | ||
225 | |||
226 | spin_unlock(&tree->lock); | ||
227 | |||
228 | btrfs_free_leaf_ref(root, ref); | ||
229 | return 0; | ||
230 | } | ||
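Pieced together from the helpers above, the expected lifecycle of a leaf ref is: allocate (usage == 1), fill in, hand a second reference to the tree, then drop your own. A sketch with illustrative values for leaf_bytenr and nr_extents:

	struct btrfs_leaf_ref *ref;

	ref = btrfs_alloc_leaf_ref(root, nr_extents);	/* usage starts at 1 */
	if (ref) {
		ref->bytenr = leaf_bytenr;		/* illustrative fill-in */
		ref->root_gen = trans->transid;
		ref->nritems = nr_extents;
		/* tree takes its own reference; -EEXIST means we raced a peer */
		btrfs_add_leaf_ref(root, ref, 0);
		btrfs_free_leaf_ref(root, ref);		/* drop the allocation ref */
	}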
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 000000000000..16f3183d7c59 --- /dev/null +++ b/fs/btrfs/ref-cache.h | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | #ifndef __REFCACHE__ | ||
19 | #define __REFCACHE__ | ||
20 | |||
21 | struct btrfs_extent_info { | ||
22 | /* bytenr and num_bytes find the extent in the extent allocation tree */ | ||
23 | u64 bytenr; | ||
24 | u64 num_bytes; | ||
25 | |||
26 | /* objectid and offset find the back reference for the file */ | ||
27 | u64 objectid; | ||
28 | u64 offset; | ||
29 | }; | ||
30 | |||
31 | struct btrfs_leaf_ref { | ||
32 | struct rb_node rb_node; | ||
33 | struct btrfs_leaf_ref_tree *tree; | ||
34 | int in_tree; | ||
35 | atomic_t usage; | ||
36 | |||
37 | u64 root_gen; | ||
38 | u64 bytenr; | ||
39 | u64 owner; | ||
40 | u64 generation; | ||
41 | int nritems; | ||
42 | |||
43 | struct list_head list; | ||
44 | struct btrfs_extent_info extents[]; | ||
45 | }; | ||
46 | |||
47 | static inline size_t btrfs_leaf_ref_size(int nr_extents) | ||
48 | { | ||
49 | return sizeof(struct btrfs_leaf_ref) + | ||
50 | sizeof(struct btrfs_extent_info) * nr_extents; | ||
51 | } | ||
52 | |||
53 | static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) | ||
54 | { | ||
55 | tree->root.rb_node = NULL; | ||
56 | INIT_LIST_HEAD(&tree->list); | ||
57 | spin_lock_init(&tree->lock); | ||
58 | } | ||
59 | |||
60 | static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) | ||
61 | { | ||
62 | return RB_EMPTY_ROOT(&tree->root); | ||
63 | } | ||
64 | |||
66 | struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, | ||
67 | int nr_extents); | ||
68 | void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); | ||
69 | struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, | ||
70 | u64 bytenr); | ||
71 | int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, | ||
72 | int shared); | ||
73 | int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, | ||
74 | int shared); | ||
75 | int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); | ||
76 | |||
77 | #endif | ||
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 000000000000..b48650de4472 --- /dev/null +++ b/fs/btrfs/root-tree.c | |||
@@ -0,0 +1,366 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include "ctree.h" | ||
20 | #include "transaction.h" | ||
21 | #include "disk-io.h" | ||
22 | #include "print-tree.h" | ||
23 | |||
24 | /* | ||
25 | * search forward for a root, starting with objectid 'search_start' | ||
26 | * if a root key is found, the objectid we find is filled into 'found_objectid' | ||
27 | * and 0 is returned. < 0 is returned on error, 1 if there is nothing | ||
28 | * left in the tree. | ||
29 | */ | ||
30 | int btrfs_search_root(struct btrfs_root *root, u64 search_start, | ||
31 | u64 *found_objectid) | ||
32 | { | ||
33 | struct btrfs_path *path; | ||
34 | struct btrfs_key search_key; | ||
35 | int ret; | ||
36 | |||
37 | root = root->fs_info->tree_root; | ||
38 | search_key.objectid = search_start; | ||
39 | search_key.type = (u8)-1; | ||
40 | search_key.offset = (u64)-1; | ||
41 | |||
42 | path = btrfs_alloc_path(); | ||
43 | BUG_ON(!path); | ||
44 | again: | ||
45 | ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); | ||
46 | if (ret < 0) | ||
47 | goto out; | ||
48 | if (ret == 0) { | ||
49 | ret = 1; | ||
50 | goto out; | ||
51 | } | ||
52 | if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { | ||
53 | ret = btrfs_next_leaf(root, path); | ||
54 | if (ret) | ||
55 | goto out; | ||
56 | } | ||
57 | btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]); | ||
58 | if (search_key.type != BTRFS_ROOT_ITEM_KEY) { | ||
59 | search_key.offset++; | ||
60 | btrfs_release_path(root, path); | ||
61 | goto again; | ||
62 | } | ||
63 | ret = 0; | ||
64 | *found_objectid = search_key.objectid; | ||
65 | |||
66 | out: | ||
67 | btrfs_free_path(path); | ||
68 | return ret; | ||
69 | } | ||
70 | |||
71 | /* | ||
72 | * lookup the root with the highest offset for a given objectid. The key we do | ||
 73 | * find is copied into 'key'. Returns 0 if a root is found, 1 if not, | ||
 74 | * and < 0 on error. | ||
75 | */ | ||
76 | int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, | ||
77 | struct btrfs_root_item *item, struct btrfs_key *key) | ||
78 | { | ||
79 | struct btrfs_path *path; | ||
80 | struct btrfs_key search_key; | ||
81 | struct btrfs_key found_key; | ||
82 | struct extent_buffer *l; | ||
83 | int ret; | ||
84 | int slot; | ||
85 | |||
86 | search_key.objectid = objectid; | ||
87 | search_key.type = BTRFS_ROOT_ITEM_KEY; | ||
88 | search_key.offset = (u64)-1; | ||
89 | |||
90 | path = btrfs_alloc_path(); | ||
91 | BUG_ON(!path); | ||
92 | ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); | ||
93 | if (ret < 0) | ||
94 | goto out; | ||
95 | |||
96 | BUG_ON(ret == 0); | ||
97 | l = path->nodes[0]; | ||
98 | BUG_ON(path->slots[0] == 0); | ||
99 | slot = path->slots[0] - 1; | ||
100 | btrfs_item_key_to_cpu(l, &found_key, slot); | ||
101 | if (found_key.objectid != objectid) { | ||
102 | ret = 1; | ||
103 | goto out; | ||
104 | } | ||
105 | read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), | ||
106 | sizeof(*item)); | ||
107 | memcpy(key, &found_key, sizeof(found_key)); | ||
108 | ret = 0; | ||
109 | out: | ||
110 | btrfs_free_path(path); | ||
111 | return ret; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * copy the data in 'item' into the btree | ||
116 | */ | ||
117 | int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root | ||
118 | *root, struct btrfs_key *key, struct btrfs_root_item | ||
119 | *item) | ||
120 | { | ||
121 | struct btrfs_path *path; | ||
122 | struct extent_buffer *l; | ||
123 | int ret; | ||
124 | int slot; | ||
125 | unsigned long ptr; | ||
126 | |||
127 | path = btrfs_alloc_path(); | ||
128 | BUG_ON(!path); | ||
129 | ret = btrfs_search_slot(trans, root, key, path, 0, 1); | ||
130 | if (ret < 0) | ||
131 | goto out; | ||
132 | |||
133 | if (ret != 0) { | ||
134 | btrfs_print_leaf(root, path->nodes[0]); | ||
135 | printk(KERN_CRIT "unable to update root key %llu %u %llu\n", | ||
136 | (unsigned long long)key->objectid, key->type, | ||
137 | (unsigned long long)key->offset); | ||
138 | BUG_ON(1); | ||
139 | } | ||
140 | |||
141 | l = path->nodes[0]; | ||
142 | slot = path->slots[0]; | ||
143 | ptr = btrfs_item_ptr_offset(l, slot); | ||
144 | write_extent_buffer(l, item, ptr, sizeof(*item)); | ||
145 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
146 | out: | ||
147 | btrfs_release_path(root, path); | ||
148 | btrfs_free_path(path); | ||
149 | return ret; | ||
150 | } | ||
151 | |||
152 | int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root | ||
153 | *root, struct btrfs_key *key, struct btrfs_root_item | ||
154 | *item) | ||
155 | { | ||
156 | int ret; | ||
157 | ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); | ||
158 | return ret; | ||
159 | } | ||
160 | |||
161 | /* | ||
162 | * at mount time we want to find all the old transaction snapshots that were in | ||
163 | * the process of being deleted if we crashed. This is any root item with an | ||
164 | * offset lower than the latest root. They need to be queued for deletion to | ||
165 | * finish what was happening when we crashed. | ||
166 | */ | ||
167 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, | ||
168 | struct btrfs_root *latest) | ||
169 | { | ||
170 | struct btrfs_root *dead_root; | ||
171 | struct btrfs_item *item; | ||
172 | struct btrfs_root_item *ri; | ||
173 | struct btrfs_key key; | ||
174 | struct btrfs_key found_key; | ||
175 | struct btrfs_path *path; | ||
176 | int ret; | ||
177 | u32 nritems; | ||
178 | struct extent_buffer *leaf; | ||
179 | int slot; | ||
180 | |||
181 | key.objectid = objectid; | ||
182 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
183 | key.offset = 0; | ||
184 | path = btrfs_alloc_path(); | ||
185 | if (!path) | ||
186 | return -ENOMEM; | ||
187 | |||
188 | again: | ||
189 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
190 | if (ret < 0) | ||
191 | goto err; | ||
192 | while (1) { | ||
193 | leaf = path->nodes[0]; | ||
194 | nritems = btrfs_header_nritems(leaf); | ||
195 | slot = path->slots[0]; | ||
196 | if (slot >= nritems) { | ||
197 | ret = btrfs_next_leaf(root, path); | ||
198 | if (ret) | ||
199 | break; | ||
200 | leaf = path->nodes[0]; | ||
201 | nritems = btrfs_header_nritems(leaf); | ||
202 | slot = path->slots[0]; | ||
203 | } | ||
204 | item = btrfs_item_nr(leaf, slot); | ||
205 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
206 | if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) | ||
207 | goto next; | ||
208 | |||
209 | if (key.objectid < objectid) | ||
210 | goto next; | ||
211 | |||
212 | if (key.objectid > objectid) | ||
213 | break; | ||
214 | |||
215 | ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); | ||
216 | if (btrfs_disk_root_refs(leaf, ri) != 0) | ||
217 | goto next; | ||
218 | |||
219 | memcpy(&found_key, &key, sizeof(key)); | ||
220 | key.offset++; | ||
221 | btrfs_release_path(root, path); | ||
222 | dead_root = | ||
223 | btrfs_read_fs_root_no_radix(root->fs_info->tree_root, | ||
224 | &found_key); | ||
225 | if (IS_ERR(dead_root)) { | ||
226 | ret = PTR_ERR(dead_root); | ||
227 | goto err; | ||
228 | } | ||
229 | |||
230 | if (objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
231 | ret = btrfs_add_dead_reloc_root(dead_root); | ||
232 | else | ||
233 | ret = btrfs_add_dead_root(dead_root, latest); | ||
234 | if (ret) | ||
235 | goto err; | ||
236 | goto again; | ||
237 | next: | ||
238 | slot++; | ||
239 | path->slots[0]++; | ||
240 | } | ||
241 | ret = 0; | ||
242 | err: | ||
243 | btrfs_free_path(path); | ||
244 | return ret; | ||
245 | } | ||
246 | |||
247 | /* drop the root item for 'key' from 'root' */ | ||
248 | int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, | ||
249 | struct btrfs_key *key) | ||
250 | { | ||
251 | struct btrfs_path *path; | ||
252 | int ret; | ||
253 | u32 refs; | ||
254 | struct btrfs_root_item *ri; | ||
255 | struct extent_buffer *leaf; | ||
256 | |||
257 | path = btrfs_alloc_path(); | ||
258 | BUG_ON(!path); | ||
259 | ret = btrfs_search_slot(trans, root, key, path, -1, 1); | ||
260 | if (ret < 0) | ||
261 | goto out; | ||
262 | |||
263 | BUG_ON(ret != 0); | ||
264 | leaf = path->nodes[0]; | ||
265 | ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); | ||
266 | |||
267 | refs = btrfs_disk_root_refs(leaf, ri); | ||
268 | BUG_ON(refs != 0); | ||
269 | ret = btrfs_del_item(trans, root, path); | ||
270 | out: | ||
271 | btrfs_release_path(root, path); | ||
272 | btrfs_free_path(path); | ||
273 | return ret; | ||
274 | } | ||
275 | |||
276 | #if 0 /* this will get used when snapshot deletion is implemented */ | ||
277 | int btrfs_del_root_ref(struct btrfs_trans_handle *trans, | ||
278 | struct btrfs_root *tree_root, | ||
279 | u64 root_id, u8 type, u64 ref_id) | ||
280 | { | ||
281 | struct btrfs_key key; | ||
282 | int ret; | ||
283 | struct btrfs_path *path; | ||
284 | |||
285 | path = btrfs_alloc_path(); | ||
286 | |||
287 | key.objectid = root_id; | ||
288 | key.type = type; | ||
289 | key.offset = ref_id; | ||
290 | |||
291 | ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); | ||
292 | BUG_ON(ret); | ||
293 | |||
294 | ret = btrfs_del_item(trans, tree_root, path); | ||
295 | BUG_ON(ret); | ||
296 | |||
297 | btrfs_free_path(path); | ||
298 | return ret; | ||
299 | } | ||
300 | #endif | ||
301 | |||
302 | int btrfs_find_root_ref(struct btrfs_root *tree_root, | ||
303 | struct btrfs_path *path, | ||
304 | u64 root_id, u64 ref_id) | ||
305 | { | ||
306 | struct btrfs_key key; | ||
307 | int ret; | ||
308 | |||
309 | key.objectid = root_id; | ||
310 | key.type = BTRFS_ROOT_REF_KEY; | ||
311 | key.offset = ref_id; | ||
312 | |||
313 | ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); | ||
314 | return ret; | ||
315 | } | ||
316 | |||
317 | |||
318 | /* | ||
319 | * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY | ||
320 | * or BTRFS_ROOT_BACKREF_KEY. | ||
321 | * | ||
322 | * The dirid, sequence, name and name_len refer to the directory entry | ||
323 | * that is referencing the root. | ||
324 | * | ||
325 | * For a forward ref, the root_id is the id of the tree referencing | ||
326 | * the root and ref_id is the id of the subvol or snapshot. | ||
327 | * | ||
328 | * For a back ref the root_id is the id of the subvol or snapshot and | ||
329 | * ref_id is the id of the tree referencing it. | ||
330 | */ | ||
331 | int btrfs_add_root_ref(struct btrfs_trans_handle *trans, | ||
332 | struct btrfs_root *tree_root, | ||
333 | u64 root_id, u8 type, u64 ref_id, | ||
334 | u64 dirid, u64 sequence, | ||
335 | const char *name, int name_len) | ||
336 | { | ||
337 | struct btrfs_key key; | ||
338 | int ret; | ||
339 | struct btrfs_path *path; | ||
340 | struct btrfs_root_ref *ref; | ||
341 | struct extent_buffer *leaf; | ||
342 | unsigned long ptr; | ||
343 | |||
344 | |||
345 | path = btrfs_alloc_path(); | ||
346 | |||
347 | key.objectid = root_id; | ||
348 | key.type = type; | ||
349 | key.offset = ref_id; | ||
350 | |||
351 | ret = btrfs_insert_empty_item(trans, tree_root, path, &key, | ||
352 | sizeof(*ref) + name_len); | ||
353 | BUG_ON(ret); | ||
354 | |||
355 | leaf = path->nodes[0]; | ||
356 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); | ||
357 | btrfs_set_root_ref_dirid(leaf, ref, dirid); | ||
358 | btrfs_set_root_ref_sequence(leaf, ref, sequence); | ||
359 | btrfs_set_root_ref_name_len(leaf, ref, name_len); | ||
360 | ptr = (unsigned long)(ref + 1); | ||
361 | write_extent_buffer(leaf, name, ptr, name_len); | ||
362 | btrfs_mark_buffer_dirty(leaf); | ||
363 | |||
364 | btrfs_free_path(path); | ||
365 | return ret; | ||
366 | } | ||
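Following the comment above, recording a new subvolume link might look like this sketch, where 'parent_id' is the referencing tree and 'subvol_id' the subvolume (all values illustrative):

	ret = btrfs_add_root_ref(trans, tree_root,
				 parent_id, BTRFS_ROOT_REF_KEY, subvol_id,
				 dir_ino, dir_index, name, name_len);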
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 000000000000..c0f7ecaf1e79 --- /dev/null +++ b/fs/btrfs/struct-funcs.c | |||
@@ -0,0 +1,139 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/highmem.h> | ||
20 | |||
21 | /* this is some deeply nasty code. ctree.h has a different | ||
22 | * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef | ||
23 | * | ||
24 | * The end result is that anyone who #includes ctree.h gets a | ||
25 | * declaration for the btrfs_set_foo functions and btrfs_foo functions | ||
26 | * | ||
27 | * This file declares the macros and then #includes ctree.h, which results | ||
28 | * in cpp creating the function here based on the template below. | ||
29 | * | ||
30 | * These setget functions do all the extent_buffer related mapping | ||
31 | * required to efficiently read and write specific fields in the extent | ||
32 | * buffers. Every pointer to metadata items in btrfs is really just | ||
33 | * an unsigned long offset into the extent buffer which has been | ||
34 | * cast to a specific type. This gives us all the gcc type checking. | ||
35 | * | ||
36 | * The extent buffer api is used to do all the kmapping and page | ||
37 | * spanning work required to get extent buffers in highmem and have | ||
38 | * a metadata blocksize different from the page size. | ||
39 | * | ||
40 | * The macro starts with a simple function prototype declaration so that | ||
41 | * sparse won't complain about it being static. | ||
42 | */ | ||
43 | |||
44 | #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ | ||
45 | u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ | ||
46 | void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ | ||
47 | u##bits btrfs_##name(struct extent_buffer *eb, \ | ||
48 | type *s) \ | ||
49 | { \ | ||
50 | unsigned long part_offset = (unsigned long)s; \ | ||
51 | unsigned long offset = part_offset + offsetof(type, member); \ | ||
52 | type *p; \ | ||
53 | /* ugly, but we want the fast path here */ \ | ||
54 | if (eb->map_token && offset >= eb->map_start && \ | ||
55 | offset + sizeof(((type *)0)->member) <= eb->map_start + \ | ||
56 | eb->map_len) { \ | ||
57 | p = (type *)(eb->kaddr + part_offset - eb->map_start); \ | ||
58 | return le##bits##_to_cpu(p->member); \ | ||
59 | } \ | ||
60 | { \ | ||
61 | int err; \ | ||
62 | char *map_token; \ | ||
63 | char *kaddr; \ | ||
64 | int unmap_on_exit = (eb->map_token == NULL); \ | ||
65 | unsigned long map_start; \ | ||
66 | unsigned long map_len; \ | ||
67 | u##bits res; \ | ||
68 | err = map_extent_buffer(eb, offset, \ | ||
69 | sizeof(((type *)0)->member), \ | ||
70 | &map_token, &kaddr, \ | ||
71 | &map_start, &map_len, KM_USER1); \ | ||
72 | if (err) { \ | ||
73 | __le##bits leres; \ | ||
74 | read_eb_member(eb, s, type, member, &leres); \ | ||
75 | return le##bits##_to_cpu(leres); \ | ||
76 | } \ | ||
77 | p = (type *)(kaddr + part_offset - map_start); \ | ||
78 | res = le##bits##_to_cpu(p->member); \ | ||
79 | if (unmap_on_exit) \ | ||
80 | unmap_extent_buffer(eb, map_token, KM_USER1); \ | ||
81 | return res; \ | ||
82 | } \ | ||
83 | } \ | ||
84 | void btrfs_set_##name(struct extent_buffer *eb, \ | ||
85 | type *s, u##bits val) \ | ||
86 | { \ | ||
87 | unsigned long part_offset = (unsigned long)s; \ | ||
88 | unsigned long offset = part_offset + offsetof(type, member); \ | ||
89 | type *p; \ | ||
90 | /* ugly, but we want the fast path here */ \ | ||
91 | if (eb->map_token && offset >= eb->map_start && \ | ||
92 | offset + sizeof(((type *)0)->member) <= eb->map_start + \ | ||
93 | eb->map_len) { \ | ||
94 | p = (type *)(eb->kaddr + part_offset - eb->map_start); \ | ||
95 | p->member = cpu_to_le##bits(val); \ | ||
96 | return; \ | ||
97 | } \ | ||
98 | { \ | ||
99 | int err; \ | ||
100 | char *map_token; \ | ||
101 | char *kaddr; \ | ||
102 | int unmap_on_exit = (eb->map_token == NULL); \ | ||
103 | unsigned long map_start; \ | ||
104 | unsigned long map_len; \ | ||
105 | err = map_extent_buffer(eb, offset, \ | ||
106 | sizeof(((type *)0)->member), \ | ||
107 | &map_token, &kaddr, \ | ||
108 | &map_start, &map_len, KM_USER1); \ | ||
109 | if (err) { \ | ||
110 | __le##bits val2; \ | ||
111 | val2 = cpu_to_le##bits(val); \ | ||
112 | write_eb_member(eb, s, type, member, &val2); \ | ||
113 | return; \ | ||
114 | } \ | ||
115 | p = (type *)(kaddr + part_offset - map_start); \ | ||
116 | p->member = cpu_to_le##bits(val); \ | ||
117 | if (unmap_on_exit) \ | ||
118 | unmap_extent_buffer(eb, map_token, KM_USER1); \ | ||
119 | } \ | ||
120 | } | ||
121 | |||
122 | #include "ctree.h" | ||
123 | |||
124 | void btrfs_node_key(struct extent_buffer *eb, | ||
125 | struct btrfs_disk_key *disk_key, int nr) | ||
126 | { | ||
127 | unsigned long ptr = btrfs_node_key_ptr_offset(nr); | ||
128 | if (eb->map_token && ptr >= eb->map_start && | ||
129 | ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { | ||
130 | memcpy(disk_key, eb->kaddr + ptr - eb->map_start, | ||
131 | sizeof(*disk_key)); | ||
132 | return; | ||
133 | } else if (eb->map_token) { | ||
134 | unmap_extent_buffer(eb, eb->map_token, KM_USER1); | ||
135 | eb->map_token = NULL; | ||
136 | } | ||
137 | read_eb_member(eb, (struct btrfs_key_ptr *)ptr, | ||
138 | struct btrfs_key_ptr, key, disk_key); | ||
139 | } | ||
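For illustration, ctree.h presumably instantiates the template with lines like the following, which is all it takes to generate both the getter used by print-tree.c and the matching setter:

	/* emits btrfs_device_id() and btrfs_set_device_id() per the macro */
	BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);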
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 000000000000..0a14b495532f --- /dev/null +++ b/fs/btrfs/super.c | |||
@@ -0,0 +1,722 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/blkdev.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/buffer_head.h> | ||
22 | #include <linux/fs.h> | ||
23 | #include <linux/pagemap.h> | ||
24 | #include <linux/highmem.h> | ||
25 | #include <linux/time.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/smp_lock.h> | ||
29 | #include <linux/backing-dev.h> | ||
30 | #include <linux/mount.h> | ||
31 | #include <linux/mpage.h> | ||
32 | #include <linux/swap.h> | ||
33 | #include <linux/writeback.h> | ||
34 | #include <linux/statfs.h> | ||
35 | #include <linux/compat.h> | ||
36 | #include <linux/parser.h> | ||
37 | #include <linux/ctype.h> | ||
38 | #include <linux/namei.h> | ||
39 | #include <linux/miscdevice.h> | ||
40 | #include <linux/version.h> | ||
41 | #include "compat.h" | ||
42 | #include "ctree.h" | ||
43 | #include "disk-io.h" | ||
44 | #include "transaction.h" | ||
45 | #include "btrfs_inode.h" | ||
46 | #include "ioctl.h" | ||
47 | #include "print-tree.h" | ||
48 | #include "xattr.h" | ||
49 | #include "volumes.h" | ||
50 | #include "version.h" | ||
51 | #include "export.h" | ||
52 | #include "compression.h" | ||
53 | |||
54 | #define BTRFS_SUPER_MAGIC 0x9123683E | ||
55 | |||
56 | static struct super_operations btrfs_super_ops; | ||
57 | |||
58 | static void btrfs_put_super(struct super_block *sb) | ||
59 | { | ||
60 | struct btrfs_root *root = btrfs_sb(sb); | ||
61 | int ret; | ||
62 | |||
63 | ret = close_ctree(root); | ||
64 | sb->s_fs_info = NULL; | ||
65 | } | ||
66 | |||
67 | enum { | ||
68 | Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, | ||
69 | Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, | ||
70 | Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, | ||
71 | }; | ||
72 | |||
73 | static match_table_t tokens = { | ||
74 | {Opt_degraded, "degraded"}, | ||
75 | {Opt_subvol, "subvol=%s"}, | ||
76 | {Opt_device, "device=%s"}, | ||
77 | {Opt_nodatasum, "nodatasum"}, | ||
78 | {Opt_nodatacow, "nodatacow"}, | ||
79 | {Opt_nobarrier, "nobarrier"}, | ||
80 | {Opt_max_extent, "max_extent=%s"}, | ||
81 | {Opt_max_inline, "max_inline=%s"}, | ||
82 | {Opt_alloc_start, "alloc_start=%s"}, | ||
83 | {Opt_thread_pool, "thread_pool=%d"}, | ||
84 | {Opt_compress, "compress"}, | ||
85 | {Opt_ssd, "ssd"}, | ||
86 | {Opt_noacl, "noacl"}, | ||
87 | {Opt_err, NULL}, | ||
88 | }; | ||
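For reference, the token table above maps directly to mount(8) option strings; illustrative invocations follow (device paths and subvolume names are hypothetical):

/*
 * mount -t btrfs -o degraded,ssd /dev/sdb /mnt
 * mount -t btrfs -o subvol=snap1,max_inline=8k,thread_pool=16 /dev/sdb /mnt
 * mount -t btrfs -o device=/dev/sdc,device=/dev/sdd /dev/sdb /mnt
 */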
89 | |||
90 | u64 btrfs_parse_size(char *str) | ||
91 | { | ||
92 | u64 res; | ||
93 | int mult = 1; | ||
94 | char *end; | ||
95 | char last; | ||
96 | |||
97 | res = simple_strtoul(str, &end, 10); | ||
98 | |||
99 | last = end[0]; | ||
100 | if (isalpha(last)) { | ||
101 | last = tolower(last); | ||
102 | switch (last) { | ||
103 | case 'g': | ||
104 | mult *= 1024; /* fall through */ | ||
105 | case 'm': | ||
106 | mult *= 1024; /* fall through */ | ||
107 | case 'k': | ||
108 | mult *= 1024; | ||
109 | } | ||
110 | res = res * mult; | ||
111 | } | ||
112 | return res; | ||
113 | } | ||
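A minimal sketch of the suffix behavior, written as a hypothetical userspace-style check (not part of the patch; assumes btrfs_parse_size() were callable in isolation):

	#include <assert.h>

	static void check_btrfs_parse_size(void)
	{
		char a[] = "4096", b[] = "64k", c[] = "8m", d[] = "1g";

		assert(btrfs_parse_size(a) == 4096);                      /* no suffix */
		assert(btrfs_parse_size(b) == 64ULL * 1024);              /* 'k': one multiply */
		assert(btrfs_parse_size(c) == 8ULL * 1024 * 1024);        /* 'm': falls through twice */
		assert(btrfs_parse_size(d) == 1ULL * 1024 * 1024 * 1024); /* 'g': falls through three times */
	}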
114 | |||
115 | /* | ||
116 | * Regular mount options parser. Everything that is needed only when | ||
117 | * reading in a new superblock is parsed here. | ||
118 | */ | ||
119 | int btrfs_parse_options(struct btrfs_root *root, char *options) | ||
120 | { | ||
121 | struct btrfs_fs_info *info = root->fs_info; | ||
122 | substring_t args[MAX_OPT_ARGS]; | ||
123 | char *p, *num; | ||
124 | int intarg; | ||
125 | |||
126 | if (!options) | ||
127 | return 0; | ||
128 | |||
129 | /* | ||
130 | * strsep changes the string, duplicate it because parse_options | ||
131 | * gets called twice | ||
132 | */ | ||
133 | options = kstrdup(options, GFP_NOFS); | ||
134 | if (!options) | ||
135 | return -ENOMEM; | ||
136 | |||
137 | |||
138 | while ((p = strsep(&options, ",")) != NULL) { | ||
139 | int token; | ||
140 | if (!*p) | ||
141 | continue; | ||
142 | |||
143 | token = match_token(p, tokens, args); | ||
144 | switch (token) { | ||
145 | case Opt_degraded: | ||
146 | printk(KERN_INFO "btrfs: allowing degraded mounts\n"); | ||
147 | btrfs_set_opt(info->mount_opt, DEGRADED); | ||
148 | break; | ||
149 | case Opt_subvol: | ||
150 | case Opt_device: | ||
151 | /* | ||
152 | * These are parsed by btrfs_parse_early_options | ||
153 | * and can be happily ignored here. | ||
154 | */ | ||
155 | break; | ||
156 | case Opt_nodatasum: | ||
157 | printk(KERN_INFO "btrfs: setting nodatasum\n"); | ||
158 | btrfs_set_opt(info->mount_opt, NODATASUM); | ||
159 | break; | ||
160 | case Opt_nodatacow: | ||
161 | printk(KERN_INFO "btrfs: setting nodatacow\n"); | ||
162 | btrfs_set_opt(info->mount_opt, NODATACOW); | ||
163 | btrfs_set_opt(info->mount_opt, NODATASUM); | ||
164 | break; | ||
165 | case Opt_compress: | ||
166 | printk(KERN_INFO "btrfs: use compression\n"); | ||
167 | btrfs_set_opt(info->mount_opt, COMPRESS); | ||
168 | break; | ||
169 | case Opt_ssd: | ||
170 | printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); | ||
171 | btrfs_set_opt(info->mount_opt, SSD); | ||
172 | break; | ||
173 | case Opt_nobarrier: | ||
174 | printk(KERN_INFO "btrfs: turning off barriers\n"); | ||
175 | btrfs_set_opt(info->mount_opt, NOBARRIER); | ||
176 | break; | ||
177 | case Opt_thread_pool: | ||
178 | intarg = 0; | ||
179 | match_int(&args[0], &intarg); | ||
180 | if (intarg) { | ||
181 | info->thread_pool_size = intarg; | ||
182 | printk(KERN_INFO "btrfs: thread pool %d\n", | ||
183 | info->thread_pool_size); | ||
184 | } | ||
185 | break; | ||
186 | case Opt_max_extent: | ||
187 | num = match_strdup(&args[0]); | ||
188 | if (num) { | ||
189 | info->max_extent = btrfs_parse_size(num); | ||
190 | kfree(num); | ||
191 | |||
192 | info->max_extent = max_t(u64, | ||
193 | info->max_extent, root->sectorsize); | ||
194 | printk(KERN_INFO "btrfs: max_extent at %llu\n", | ||
195 | info->max_extent); | ||
196 | } | ||
197 | break; | ||
198 | case Opt_max_inline: | ||
199 | num = match_strdup(&args[0]); | ||
200 | if (num) { | ||
201 | info->max_inline = btrfs_parse_size(num); | ||
202 | kfree(num); | ||
203 | |||
204 | if (info->max_inline) { | ||
205 | info->max_inline = max_t(u64, | ||
206 | info->max_inline, | ||
207 | root->sectorsize); | ||
208 | } | ||
209 | printk(KERN_INFO "btrfs: max_inline at %llu\n", | ||
210 | info->max_inline); | ||
211 | } | ||
212 | break; | ||
213 | case Opt_alloc_start: | ||
214 | num = match_strdup(&args[0]); | ||
215 | if (num) { | ||
216 | info->alloc_start = btrfs_parse_size(num); | ||
217 | kfree(num); | ||
218 | printk(KERN_INFO | ||
219 | "btrfs: allocations start at %llu\n", | ||
220 | info->alloc_start); | ||
221 | } | ||
222 | break; | ||
223 | case Opt_noacl: | ||
224 | root->fs_info->sb->s_flags &= ~MS_POSIXACL; | ||
225 | break; | ||
226 | default: | ||
227 | break; | ||
228 | } | ||
229 | } | ||
230 | kfree(options); | ||
231 | return 0; | ||
232 | } | ||
233 | |||
234 | /* | ||
235 | * Parse mount options that are required early in the mount process. | ||
236 | * | ||
237 | * All other options are parsed much later in the mount process and | ||
238 | * only when we need to allocate a new super block. | ||
239 | */ | ||
240 | static int btrfs_parse_early_options(const char *options, fmode_t flags, | ||
241 | void *holder, char **subvol_name, | ||
242 | struct btrfs_fs_devices **fs_devices) | ||
243 | { | ||
244 | substring_t args[MAX_OPT_ARGS]; | ||
245 | char *opts, *p, *device_name; | ||
246 | int error = 0; | ||
247 | |||
248 | if (!options) | ||
249 | goto out; | ||
250 | |||
251 | /* | ||
252 | * strsep changes the string, duplicate it because parse_options | ||
253 | * gets called twice | ||
254 | */ | ||
255 | opts = kstrdup(options, GFP_KERNEL); | ||
256 | if (!opts) | ||
257 | return -ENOMEM; | ||
258 | |||
259 | while ((p = strsep(&opts, ",")) != NULL) { | ||
260 | int token; | ||
261 | if (!*p) | ||
262 | continue; | ||
263 | |||
264 | token = match_token(p, tokens, args); | ||
265 | switch (token) { | ||
266 | case Opt_subvol: | ||
267 | *subvol_name = match_strdup(&args[0]); | ||
268 | break; | ||
269 | case Opt_device: | ||
270 | device_name = match_strdup(&args[0]); | ||
271 | if (!device_name) { | ||
272 | error = -ENOMEM; | ||
273 | goto out_free_opts; | ||
274 | } | ||
275 | error = btrfs_scan_one_device(device_name, | ||
276 | flags, holder, fs_devices); | ||
277 | kfree(device_name); | ||
278 | if (error) | ||
279 | goto out_free_opts; | ||
280 | break; | ||
275 | default: | ||
276 | break; | ||
277 | } | ||
278 | } | ||
279 | |||
280 | out_free_opts: | ||
281 | kfree(opts); | ||
282 | out: | ||
283 | /* | ||
284 | * If no subvolume name is specified we use the default one. Allocate | ||
285 | * a copy of the string "." here so that code later in the | ||
286 | * mount path doesn't care if it's the default volume or another one. | ||
287 | */ | ||
288 | if (!*subvol_name) { | ||
289 | *subvol_name = kstrdup(".", GFP_KERNEL); | ||
290 | if (!*subvol_name) | ||
291 | return -ENOMEM; | ||
292 | } | ||
293 | return error; | ||
294 | } | ||
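To summarize the two-phase parse (a restatement of the code above, not new behavior):

/*
 * early (btrfs_parse_early_options): subvol=, device=
 *   - needed before a superblock exists, to pick the fs_devices set
 *     and the subvolume dentry
 * late  (btrfs_parse_options): degraded, nodatasum, nodatacow,
 *   nobarrier, max_extent=, max_inline=, alloc_start=,
 *   thread_pool=, compress, ssd, noacl
 */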
295 | |||
296 | static int btrfs_fill_super(struct super_block *sb, | ||
297 | struct btrfs_fs_devices *fs_devices, | ||
298 | void *data, int silent) | ||
299 | { | ||
300 | struct inode *inode; | ||
301 | struct dentry *root_dentry; | ||
302 | struct btrfs_super_block *disk_super; | ||
303 | struct btrfs_root *tree_root; | ||
304 | struct btrfs_inode *bi; | ||
305 | int err; | ||
306 | |||
307 | sb->s_maxbytes = MAX_LFS_FILESIZE; | ||
308 | sb->s_magic = BTRFS_SUPER_MAGIC; | ||
309 | sb->s_op = &btrfs_super_ops; | ||
310 | sb->s_export_op = &btrfs_export_ops; | ||
311 | sb->s_xattr = btrfs_xattr_handlers; | ||
312 | sb->s_time_gran = 1; | ||
313 | sb->s_flags |= MS_POSIXACL; | ||
314 | |||
315 | tree_root = open_ctree(sb, fs_devices, (char *)data); | ||
316 | |||
317 | if (IS_ERR(tree_root)) { | ||
318 | printk(KERN_ERR "btrfs: open_ctree failed\n"); | ||
319 | return PTR_ERR(tree_root); | ||
320 | } | ||
321 | sb->s_fs_info = tree_root; | ||
322 | disk_super = &tree_root->fs_info->super_copy; | ||
323 | inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, | ||
324 | tree_root->fs_info->fs_root); | ||
325 | if (!inode) { | ||
326 | err = -ENOMEM; | ||
327 | goto fail_close; | ||
328 | } | ||
329 | bi = BTRFS_I(inode); | ||
330 | bi->location.objectid = inode->i_ino; | ||
331 | bi->location.offset = 0; | ||
332 | bi->root = tree_root->fs_info->fs_root; | ||
333 | |||
334 | btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); | ||
335 | |||
336 | if (inode->i_state & I_NEW) { | ||
337 | btrfs_read_locked_inode(inode); | ||
338 | unlock_new_inode(inode); | ||
339 | } | ||
340 | |||
341 | root_dentry = d_alloc_root(inode); | ||
342 | if (!root_dentry) { | ||
343 | iput(inode); | ||
344 | err = -ENOMEM; | ||
345 | goto fail_close; | ||
346 | } | ||
347 | #if 0 | ||
348 | /* this does the super kobj at the same time */ | ||
349 | err = btrfs_sysfs_add_super(tree_root->fs_info); | ||
350 | if (err) | ||
351 | goto fail_close; | ||
352 | #endif | ||
353 | |||
354 | sb->s_root = root_dentry; | ||
355 | |||
356 | save_mount_options(sb, data); | ||
357 | return 0; | ||
358 | |||
359 | fail_close: | ||
360 | close_ctree(tree_root); | ||
361 | return err; | ||
362 | } | ||
363 | |||
364 | int btrfs_sync_fs(struct super_block *sb, int wait) | ||
365 | { | ||
366 | struct btrfs_trans_handle *trans; | ||
367 | struct btrfs_root *root; | ||
368 | int ret; | ||
369 | root = btrfs_sb(sb); | ||
370 | |||
371 | if (sb->s_flags & MS_RDONLY) | ||
372 | return 0; | ||
373 | |||
374 | sb->s_dirt = 0; | ||
375 | if (!wait) { | ||
376 | filemap_flush(root->fs_info->btree_inode->i_mapping); | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | btrfs_start_delalloc_inodes(root); | ||
381 | btrfs_wait_ordered_extents(root, 0); | ||
382 | |||
383 | btrfs_clean_old_snapshots(root); | ||
384 | trans = btrfs_start_transaction(root, 1); | ||
385 | ret = btrfs_commit_transaction(trans, root); | ||
386 | sb->s_dirt = 0; | ||
387 | return ret; | ||
388 | } | ||
389 | |||
390 | static void btrfs_write_super(struct super_block *sb) | ||
391 | { | ||
392 | sb->s_dirt = 0; | ||
393 | } | ||
394 | |||
395 | static int btrfs_test_super(struct super_block *s, void *data) | ||
396 | { | ||
397 | struct btrfs_fs_devices *test_fs_devices = data; | ||
398 | struct btrfs_root *root = btrfs_sb(s); | ||
399 | |||
400 | return root->fs_info->fs_devices == test_fs_devices; | ||
401 | } | ||
402 | |||
403 | /* | ||
404 | * Find a superblock for the given device / mount point. | ||
405 | * | ||
406 | * Note: This is based on get_sb_bdev from fs/super.c with a few additions | ||
407 | * for multiple device setup. Make sure to keep it in sync. | ||
408 | */ | ||
409 | static int btrfs_get_sb(struct file_system_type *fs_type, int flags, | ||
410 | const char *dev_name, void *data, struct vfsmount *mnt) | ||
411 | { | ||
412 | char *subvol_name = NULL; | ||
413 | struct block_device *bdev = NULL; | ||
414 | struct super_block *s; | ||
415 | struct dentry *root; | ||
416 | struct btrfs_fs_devices *fs_devices = NULL; | ||
417 | fmode_t mode = FMODE_READ; | ||
418 | int error = 0; | ||
419 | |||
420 | if (!(flags & MS_RDONLY)) | ||
421 | mode |= FMODE_WRITE; | ||
422 | |||
423 | error = btrfs_parse_early_options(data, mode, fs_type, | ||
424 | &subvol_name, &fs_devices); | ||
425 | if (error) { | ||
426 | kfree(subvol_name); | ||
427 | return error; | ||
428 | } | ||
427 | |||
428 | error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); | ||
429 | if (error) | ||
430 | goto error_free_subvol_name; | ||
431 | |||
432 | error = btrfs_open_devices(fs_devices, mode, fs_type); | ||
433 | if (error) | ||
434 | goto error_free_subvol_name; | ||
435 | |||
436 | if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { | ||
437 | error = -EACCES; | ||
438 | goto error_close_devices; | ||
439 | } | ||
440 | |||
441 | bdev = fs_devices->latest_bdev; | ||
442 | s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); | ||
443 | if (IS_ERR(s)) | ||
444 | goto error_s; | ||
445 | |||
446 | if (s->s_root) { | ||
447 | if ((flags ^ s->s_flags) & MS_RDONLY) { | ||
448 | up_write(&s->s_umount); | ||
449 | deactivate_super(s); | ||
450 | error = -EBUSY; | ||
451 | goto error_close_devices; | ||
452 | } | ||
453 | |||
454 | btrfs_close_devices(fs_devices); | ||
455 | } else { | ||
456 | char b[BDEVNAME_SIZE]; | ||
457 | |||
458 | s->s_flags = flags; | ||
459 | strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); | ||
460 | error = btrfs_fill_super(s, fs_devices, data, | ||
461 | flags & MS_SILENT ? 1 : 0); | ||
462 | if (error) { | ||
463 | up_write(&s->s_umount); | ||
464 | deactivate_super(s); | ||
465 | goto error_free_subvol_name; | ||
466 | } | ||
467 | |||
468 | btrfs_sb(s)->fs_info->bdev_holder = fs_type; | ||
469 | s->s_flags |= MS_ACTIVE; | ||
470 | } | ||
471 | |||
472 | if (!strcmp(subvol_name, ".")) | ||
473 | root = dget(s->s_root); | ||
474 | else { | ||
475 | mutex_lock(&s->s_root->d_inode->i_mutex); | ||
476 | root = lookup_one_len(subvol_name, s->s_root, | ||
477 | strlen(subvol_name)); | ||
478 | mutex_unlock(&s->s_root->d_inode->i_mutex); | ||
479 | |||
480 | if (IS_ERR(root)) { | ||
481 | up_write(&s->s_umount); | ||
482 | deactivate_super(s); | ||
483 | error = PTR_ERR(root); | ||
484 | goto error_free_subvol_name; | ||
485 | } | ||
486 | if (!root->d_inode) { | ||
487 | dput(root); | ||
488 | up_write(&s->s_umount); | ||
489 | deactivate_super(s); | ||
490 | error = -ENXIO; | ||
491 | goto error_free_subvol_name; | ||
492 | } | ||
493 | } | ||
494 | |||
495 | mnt->mnt_sb = s; | ||
496 | mnt->mnt_root = root; | ||
497 | |||
498 | kfree(subvol_name); | ||
499 | return 0; | ||
500 | |||
501 | error_s: | ||
502 | error = PTR_ERR(s); | ||
503 | error_close_devices: | ||
504 | btrfs_close_devices(fs_devices); | ||
505 | error_free_subvol_name: | ||
506 | kfree(subvol_name); | ||
507 | return error; | ||
508 | } | ||
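The overall flow mirrors get_sb_bdev() with the multi-device steps added; as a summary of the function above:

/*
 * btrfs_get_sb() flow:
 *   1. btrfs_parse_early_options() -> subvol name + any device= members
 *   2. btrfs_scan_one_device()     -> register dev_name itself
 *   3. btrfs_open_devices()        -> open every member device
 *   4. sget(btrfs_test_super)      -> reuse a live sb for this fs_devices,
 *                                     or allocate one and btrfs_fill_super()
 *   5. lookup_one_len(subvol_name) -> hand back the subvolume's dentry
 */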
509 | |||
510 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) | ||
511 | { | ||
512 | struct btrfs_root *root = btrfs_sb(sb); | ||
513 | int ret; | ||
514 | |||
515 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | ||
516 | return 0; | ||
517 | |||
518 | if (*flags & MS_RDONLY) { | ||
519 | sb->s_flags |= MS_RDONLY; | ||
520 | |||
521 | ret = btrfs_commit_super(root); | ||
522 | WARN_ON(ret); | ||
523 | } else { | ||
524 | if (root->fs_info->fs_devices->rw_devices == 0) | ||
525 | return -EACCES; | ||
526 | |||
527 | if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) | ||
528 | return -EINVAL; | ||
529 | |||
530 | ret = btrfs_cleanup_reloc_trees(root); | ||
531 | WARN_ON(ret); | ||
532 | |||
533 | ret = btrfs_cleanup_fs_roots(root->fs_info); | ||
534 | WARN_ON(ret); | ||
535 | |||
536 | sb->s_flags &= ~MS_RDONLY; | ||
537 | } | ||
538 | |||
539 | return 0; | ||
540 | } | ||
541 | |||
542 | static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
543 | { | ||
544 | struct btrfs_root *root = btrfs_sb(dentry->d_sb); | ||
545 | struct btrfs_super_block *disk_super = &root->fs_info->super_copy; | ||
546 | int bits = dentry->d_sb->s_blocksize_bits; | ||
547 | __be32 *fsid = (__be32 *)root->fs_info->fsid; | ||
548 | |||
549 | buf->f_namelen = BTRFS_NAME_LEN; | ||
550 | buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; | ||
551 | buf->f_bfree = buf->f_blocks - | ||
552 | (btrfs_super_bytes_used(disk_super) >> bits); | ||
553 | buf->f_bavail = buf->f_bfree; | ||
554 | buf->f_bsize = dentry->d_sb->s_blocksize; | ||
555 | buf->f_type = BTRFS_SUPER_MAGIC; | ||
556 | |||
557 | /* We treat it as constant endianness (it doesn't matter _which_) | ||
558 | * because we want the fsid to come out the same whether mounted | ||
559 | * on a big-endian or little-endian host */ | ||
560 | buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); | ||
561 | buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); | ||
562 | /* Mask in the root object ID too, to disambiguate subvols */ | ||
563 | buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; | ||
564 | buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; | ||
565 | |||
566 | return 0; | ||
567 | } | ||
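The fsid folding above reads, schematically (fsid is the 16-byte filesystem UUID, objectid the subvolume root's id):

/*
 * val[0] = be32(fsid bytes 0-3) ^ be32(fsid bytes 8-11)  ^ (objectid >> 32)
 * val[1] = be32(fsid bytes 4-7) ^ be32(fsid bytes 12-15) ^ (objectid & 0xffffffff)
 */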
568 | |||
569 | static struct file_system_type btrfs_fs_type = { | ||
570 | .owner = THIS_MODULE, | ||
571 | .name = "btrfs", | ||
572 | .get_sb = btrfs_get_sb, | ||
573 | .kill_sb = kill_anon_super, | ||
574 | .fs_flags = FS_REQUIRES_DEV, | ||
575 | }; | ||
576 | |||
577 | /* | ||
578 | * used by btrfsctl to scan devices when no FS is mounted | ||
579 | */ | ||
580 | static long btrfs_control_ioctl(struct file *file, unsigned int cmd, | ||
581 | unsigned long arg) | ||
582 | { | ||
583 | struct btrfs_ioctl_vol_args *vol; | ||
584 | struct btrfs_fs_devices *fs_devices; | ||
585 | int ret = 0; | ||
586 | int len; | ||
587 | |||
588 | if (!capable(CAP_SYS_ADMIN)) | ||
589 | return -EPERM; | ||
590 | |||
591 | vol = kmalloc(sizeof(*vol), GFP_KERNEL); | ||
592 | if (!vol) | ||
593 | return -ENOMEM; | ||
594 | if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { | ||
595 | ret = -EFAULT; | ||
596 | goto out; | ||
597 | } | ||
596 | len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); | ||
597 | switch (cmd) { | ||
598 | case BTRFS_IOC_SCAN_DEV: | ||
599 | ret = btrfs_scan_one_device(vol->name, FMODE_READ, | ||
600 | &btrfs_fs_type, &fs_devices); | ||
601 | break; | ||
602 | } | ||
603 | out: | ||
604 | kfree(vol); | ||
605 | return ret; | ||
606 | } | ||
607 | |||
608 | static int btrfs_freeze(struct super_block *sb) | ||
609 | { | ||
610 | struct btrfs_root *root = btrfs_sb(sb); | ||
611 | mutex_lock(&root->fs_info->transaction_kthread_mutex); | ||
612 | mutex_lock(&root->fs_info->cleaner_mutex); | ||
613 | return 0; | ||
614 | } | ||
615 | |||
616 | static int btrfs_unfreeze(struct super_block *sb) | ||
617 | { | ||
618 | struct btrfs_root *root = btrfs_sb(sb); | ||
619 | mutex_unlock(&root->fs_info->cleaner_mutex); | ||
620 | mutex_unlock(&root->fs_info->transaction_kthread_mutex); | ||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | static struct super_operations btrfs_super_ops = { | ||
625 | .delete_inode = btrfs_delete_inode, | ||
626 | .put_super = btrfs_put_super, | ||
627 | .write_super = btrfs_write_super, | ||
628 | .sync_fs = btrfs_sync_fs, | ||
629 | .show_options = generic_show_options, | ||
630 | .write_inode = btrfs_write_inode, | ||
631 | .dirty_inode = btrfs_dirty_inode, | ||
632 | .alloc_inode = btrfs_alloc_inode, | ||
633 | .destroy_inode = btrfs_destroy_inode, | ||
634 | .statfs = btrfs_statfs, | ||
635 | .remount_fs = btrfs_remount, | ||
636 | .freeze_fs = btrfs_freeze, | ||
637 | .unfreeze_fs = btrfs_unfreeze, | ||
638 | }; | ||
639 | |||
640 | static const struct file_operations btrfs_ctl_fops = { | ||
641 | .unlocked_ioctl = btrfs_control_ioctl, | ||
642 | .compat_ioctl = btrfs_control_ioctl, | ||
643 | .owner = THIS_MODULE, | ||
644 | }; | ||
645 | |||
646 | static struct miscdevice btrfs_misc = { | ||
647 | .minor = MISC_DYNAMIC_MINOR, | ||
648 | .name = "btrfs-control", | ||
649 | .fops = &btrfs_ctl_fops | ||
650 | }; | ||
651 | |||
652 | static int btrfs_interface_init(void) | ||
653 | { | ||
654 | return misc_register(&btrfs_misc); | ||
655 | } | ||
656 | |||
657 | static void btrfs_interface_exit(void) | ||
658 | { | ||
659 | if (misc_deregister(&btrfs_misc) < 0) | ||
660 | printk(KERN_INFO "misc_deregister failed for control device\n"); | ||
661 | } | ||
662 | |||
663 | static int __init init_btrfs_fs(void) | ||
664 | { | ||
665 | int err; | ||
666 | |||
667 | err = btrfs_init_sysfs(); | ||
668 | if (err) | ||
669 | return err; | ||
670 | |||
671 | err = btrfs_init_cachep(); | ||
672 | if (err) | ||
673 | goto free_sysfs; | ||
674 | |||
675 | err = extent_io_init(); | ||
676 | if (err) | ||
677 | goto free_cachep; | ||
678 | |||
679 | err = extent_map_init(); | ||
680 | if (err) | ||
681 | goto free_extent_io; | ||
682 | |||
683 | err = btrfs_interface_init(); | ||
684 | if (err) | ||
685 | goto free_extent_map; | ||
686 | |||
687 | err = register_filesystem(&btrfs_fs_type); | ||
688 | if (err) | ||
689 | goto unregister_ioctl; | ||
690 | |||
691 | printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); | ||
692 | return 0; | ||
693 | |||
694 | unregister_ioctl: | ||
695 | btrfs_interface_exit(); | ||
696 | free_extent_map: | ||
697 | extent_map_exit(); | ||
698 | free_extent_io: | ||
699 | extent_io_exit(); | ||
700 | free_cachep: | ||
701 | btrfs_destroy_cachep(); | ||
702 | free_sysfs: | ||
703 | btrfs_exit_sysfs(); | ||
704 | return err; | ||
705 | } | ||
706 | |||
707 | static void __exit exit_btrfs_fs(void) | ||
708 | { | ||
709 | btrfs_destroy_cachep(); | ||
710 | extent_map_exit(); | ||
711 | extent_io_exit(); | ||
712 | btrfs_interface_exit(); | ||
713 | unregister_filesystem(&btrfs_fs_type); | ||
714 | btrfs_exit_sysfs(); | ||
715 | btrfs_cleanup_fs_uuids(); | ||
716 | btrfs_zlib_exit(); | ||
717 | } | ||
718 | |||
719 | module_init(init_btrfs_fs) | ||
720 | module_exit(exit_btrfs_fs) | ||
721 | |||
722 | MODULE_LICENSE("GPL"); | ||
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 000000000000..a240b6fa81df --- /dev/null +++ b/fs/btrfs/sysfs.c | |||
@@ -0,0 +1,269 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/spinlock.h> | ||
22 | #include <linux/completion.h> | ||
23 | #include <linux/buffer_head.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/kobject.h> | ||
26 | |||
27 | #include "ctree.h" | ||
28 | #include "disk-io.h" | ||
29 | #include "transaction.h" | ||
30 | |||
31 | static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) | ||
32 | { | ||
33 | return snprintf(buf, PAGE_SIZE, "%llu\n", | ||
34 | (unsigned long long)btrfs_root_used(&root->root_item)); | ||
35 | } | ||
36 | |||
37 | static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) | ||
38 | { | ||
39 | return snprintf(buf, PAGE_SIZE, "%llu\n", | ||
40 | (unsigned long long)btrfs_root_limit(&root->root_item)); | ||
41 | } | ||
42 | |||
43 | static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) | ||
44 | { | ||
46 | return snprintf(buf, PAGE_SIZE, "%llu\n", | ||
47 | (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); | ||
48 | } | ||
49 | |||
50 | static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) | ||
51 | { | ||
52 | return snprintf(buf, PAGE_SIZE, "%llu\n", | ||
53 | (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); | ||
54 | } | ||
55 | |||
56 | static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) | ||
57 | { | ||
58 | return snprintf(buf, PAGE_SIZE, "%llu\n", | ||
59 | (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); | ||
60 | } | ||
61 | |||
62 | /* this is for root attrs (subvols/snapshots) */ | ||
63 | struct btrfs_root_attr { | ||
64 | struct attribute attr; | ||
65 | ssize_t (*show)(struct btrfs_root *, char *); | ||
66 | ssize_t (*store)(struct btrfs_root *, const char *, size_t); | ||
67 | }; | ||
68 | |||
69 | #define ROOT_ATTR(name, mode, show, store) \ | ||
70 | static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ | ||
71 | show, store) | ||
72 | |||
73 | ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); | ||
74 | ROOT_ATTR(block_limit, 0444, root_block_limit_show, NULL); | ||
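For readers new to sysfs: __ATTR() just fills in the struct fields, so the first ROOT_ATTR line above expands roughly to (a sketch of the generic helper, not btrfs-specific code):

	static struct btrfs_root_attr btrfs_root_attr_blocks_used = {
		.attr  = { .name = "blocks_used", .mode = 0444 },
		.show  = root_blocks_used_show,
		.store = NULL,
	};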
75 | |||
76 | static struct attribute *btrfs_root_attrs[] = { | ||
77 | &btrfs_root_attr_blocks_used.attr, | ||
78 | &btrfs_root_attr_block_limit.attr, | ||
79 | NULL, | ||
80 | }; | ||
81 | |||
82 | /* this is for super attrs (actual full fs) */ | ||
83 | struct btrfs_super_attr { | ||
84 | struct attribute attr; | ||
85 | ssize_t (*show)(struct btrfs_fs_info *, char *); | ||
86 | ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); | ||
87 | }; | ||
88 | |||
89 | #define SUPER_ATTR(name, mode, show, store) \ | ||
90 | static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ | ||
91 | show, store) | ||
92 | |||
93 | SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); | ||
94 | SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); | ||
95 | SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); | ||
96 | |||
97 | static struct attribute *btrfs_super_attrs[] = { | ||
98 | &btrfs_super_attr_blocks_used.attr, | ||
99 | &btrfs_super_attr_total_blocks.attr, | ||
100 | &btrfs_super_attr_blocksize.attr, | ||
101 | NULL, | ||
102 | }; | ||
103 | |||
104 | static ssize_t btrfs_super_attr_show(struct kobject *kobj, | ||
105 | struct attribute *attr, char *buf) | ||
106 | { | ||
107 | struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, | ||
108 | super_kobj); | ||
109 | struct btrfs_super_attr *a = container_of(attr, | ||
110 | struct btrfs_super_attr, | ||
111 | attr); | ||
112 | |||
113 | return a->show ? a->show(fs, buf) : 0; | ||
114 | } | ||
115 | |||
116 | static ssize_t btrfs_super_attr_store(struct kobject *kobj, | ||
117 | struct attribute *attr, | ||
118 | const char *buf, size_t len) | ||
119 | { | ||
120 | struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, | ||
121 | super_kobj); | ||
122 | struct btrfs_super_attr *a = container_of(attr, | ||
123 | struct btrfs_super_attr, | ||
124 | attr); | ||
125 | |||
126 | return a->store ? a->store(fs, buf, len) : 0; | ||
127 | } | ||
128 | |||
129 | static ssize_t btrfs_root_attr_show(struct kobject *kobj, | ||
130 | struct attribute *attr, char *buf) | ||
131 | { | ||
132 | struct btrfs_root *root = container_of(kobj, struct btrfs_root, | ||
133 | root_kobj); | ||
134 | struct btrfs_root_attr *a = container_of(attr, | ||
135 | struct btrfs_root_attr, | ||
136 | attr); | ||
137 | |||
138 | return a->show ? a->show(root, buf) : 0; | ||
139 | } | ||
140 | |||
141 | static ssize_t btrfs_root_attr_store(struct kobject *kobj, | ||
142 | struct attribute *attr, | ||
143 | const char *buf, size_t len) | ||
144 | { | ||
145 | struct btrfs_root *root = container_of(kobj, struct btrfs_root, | ||
146 | root_kobj); | ||
147 | struct btrfs_root_attr *a = container_of(attr, | ||
148 | struct btrfs_root_attr, | ||
149 | attr); | ||
150 | return a->store ? a->store(root, buf, len) : 0; | ||
151 | } | ||
152 | |||
153 | static void btrfs_super_release(struct kobject *kobj) | ||
154 | { | ||
155 | struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, | ||
156 | super_kobj); | ||
157 | complete(&fs->kobj_unregister); | ||
158 | } | ||
159 | |||
160 | static void btrfs_root_release(struct kobject *kobj) | ||
161 | { | ||
162 | struct btrfs_root *root = container_of(kobj, struct btrfs_root, | ||
163 | root_kobj); | ||
164 | complete(&root->kobj_unregister); | ||
165 | } | ||
166 | |||
167 | static struct sysfs_ops btrfs_super_attr_ops = { | ||
168 | .show = btrfs_super_attr_show, | ||
169 | .store = btrfs_super_attr_store, | ||
170 | }; | ||
171 | |||
172 | static struct sysfs_ops btrfs_root_attr_ops = { | ||
173 | .show = btrfs_root_attr_show, | ||
174 | .store = btrfs_root_attr_store, | ||
175 | }; | ||
176 | |||
177 | static struct kobj_type btrfs_root_ktype = { | ||
178 | .default_attrs = btrfs_root_attrs, | ||
179 | .sysfs_ops = &btrfs_root_attr_ops, | ||
180 | .release = btrfs_root_release, | ||
181 | }; | ||
182 | |||
183 | static struct kobj_type btrfs_super_ktype = { | ||
184 | .default_attrs = btrfs_super_attrs, | ||
185 | .sysfs_ops = &btrfs_super_attr_ops, | ||
186 | .release = btrfs_super_release, | ||
187 | }; | ||
188 | |||
189 | /* /sys/fs/btrfs/ entry */ | ||
190 | static struct kset *btrfs_kset; | ||
191 | |||
192 | int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) | ||
193 | { | ||
194 | int error; | ||
195 | char *name; | ||
196 | char c; | ||
197 | int len = strlen(fs->sb->s_id) + 1; | ||
198 | int i; | ||
199 | |||
200 | name = kmalloc(len, GFP_NOFS); | ||
201 | if (!name) { | ||
202 | error = -ENOMEM; | ||
203 | goto fail; | ||
204 | } | ||
205 | |||
206 | for (i = 0; i < len; i++) { | ||
207 | c = fs->sb->s_id[i]; | ||
208 | if (c == '/' || c == '\\') | ||
209 | c = '!'; | ||
210 | name[i] = c; | ||
211 | } | ||
212 | name[len - 1] = '\0'; | ||
213 | |||
214 | fs->super_kobj.kset = btrfs_kset; | ||
215 | error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype, | ||
216 | NULL, "%s", name); | ||
217 | kfree(name); | ||
218 | if (error) | ||
219 | goto fail; | ||
220 | |||
221 | return 0; | ||
222 | |||
223 | fail: | ||
224 | printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); | ||
225 | return error; | ||
226 | } | ||
227 | |||
228 | int btrfs_sysfs_add_root(struct btrfs_root *root) | ||
229 | { | ||
230 | int error; | ||
231 | |||
232 | error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype, | ||
233 | &root->fs_info->super_kobj, | ||
234 | "%s", root->name); | ||
235 | if (error) | ||
236 | goto fail; | ||
237 | |||
238 | return 0; | ||
239 | |||
240 | fail: | ||
241 | printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); | ||
242 | return error; | ||
243 | } | ||
244 | |||
245 | void btrfs_sysfs_del_root(struct btrfs_root *root) | ||
246 | { | ||
247 | kobject_put(&root->root_kobj); | ||
248 | wait_for_completion(&root->kobj_unregister); | ||
249 | } | ||
250 | |||
251 | void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) | ||
252 | { | ||
253 | kobject_put(&fs->super_kobj); | ||
254 | wait_for_completion(&fs->kobj_unregister); | ||
255 | } | ||
256 | |||
257 | int btrfs_init_sysfs(void) | ||
258 | { | ||
259 | btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); | ||
260 | if (!btrfs_kset) | ||
261 | return -ENOMEM; | ||
262 | return 0; | ||
263 | } | ||
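With the kset registered under fs_kobj, the attributes above would surface as the following hierarchy (illustrative; note that btrfs_sysfs_add_super() is currently #if 0'd out in btrfs_fill_super(), so only the kset itself is created at module load):

/*
 * /sys/fs/btrfs/<s_id>/blocks_used
 * /sys/fs/btrfs/<s_id>/total_blocks
 * /sys/fs/btrfs/<s_id>/blocksize
 * /sys/fs/btrfs/<s_id>/<root name>/blocks_used
 * /sys/fs/btrfs/<s_id>/<root name>/block_limit
 */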
264 | |||
265 | void btrfs_exit_sysfs(void) | ||
266 | { | ||
267 | kset_unregister(btrfs_kset); | ||
268 | } | ||
269 | |||
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 000000000000..8a08f9443340 --- /dev/null +++ b/fs/btrfs/transaction.c | |||
@@ -0,0 +1,1097 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/writeback.h> | ||
22 | #include <linux/pagemap.h> | ||
23 | #include <linux/blkdev.h> | ||
24 | #include "ctree.h" | ||
25 | #include "disk-io.h" | ||
26 | #include "transaction.h" | ||
27 | #include "locking.h" | ||
28 | #include "ref-cache.h" | ||
29 | #include "tree-log.h" | ||
30 | |||
31 | #define BTRFS_ROOT_TRANS_TAG 0 | ||
32 | |||
33 | static noinline void put_transaction(struct btrfs_transaction *transaction) | ||
34 | { | ||
35 | WARN_ON(transaction->use_count == 0); | ||
36 | transaction->use_count--; | ||
37 | if (transaction->use_count == 0) { | ||
38 | list_del_init(&transaction->list); | ||
39 | memset(transaction, 0, sizeof(*transaction)); | ||
40 | kmem_cache_free(btrfs_transaction_cachep, transaction); | ||
41 | } | ||
42 | } | ||
43 | |||
44 | /* | ||
45 | * either allocate a new transaction or hop into the existing one | ||
46 | */ | ||
47 | static noinline int join_transaction(struct btrfs_root *root) | ||
48 | { | ||
49 | struct btrfs_transaction *cur_trans; | ||
50 | cur_trans = root->fs_info->running_transaction; | ||
51 | if (!cur_trans) { | ||
52 | cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, | ||
53 | GFP_NOFS); | ||
54 | BUG_ON(!cur_trans); | ||
55 | root->fs_info->generation++; | ||
56 | root->fs_info->last_alloc = 0; | ||
57 | root->fs_info->last_data_alloc = 0; | ||
58 | cur_trans->num_writers = 1; | ||
59 | cur_trans->num_joined = 0; | ||
60 | cur_trans->transid = root->fs_info->generation; | ||
61 | init_waitqueue_head(&cur_trans->writer_wait); | ||
62 | init_waitqueue_head(&cur_trans->commit_wait); | ||
63 | cur_trans->in_commit = 0; | ||
64 | cur_trans->blocked = 0; | ||
65 | cur_trans->use_count = 1; | ||
66 | cur_trans->commit_done = 0; | ||
67 | cur_trans->start_time = get_seconds(); | ||
68 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); | ||
69 | list_add_tail(&cur_trans->list, &root->fs_info->trans_list); | ||
70 | extent_io_tree_init(&cur_trans->dirty_pages, | ||
71 | root->fs_info->btree_inode->i_mapping, | ||
72 | GFP_NOFS); | ||
73 | spin_lock(&root->fs_info->new_trans_lock); | ||
74 | root->fs_info->running_transaction = cur_trans; | ||
75 | spin_unlock(&root->fs_info->new_trans_lock); | ||
76 | } else { | ||
77 | cur_trans->num_writers++; | ||
78 | cur_trans->num_joined++; | ||
79 | } | ||
80 | |||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | * this does all the record keeping required to make sure that a reference | ||
86 | * counted root is properly recorded in a given transaction. This is required | ||
87 | * to make sure the old root from before we joined the transaction is deleted | ||
88 | * when the transaction commits | ||
89 | */ | ||
90 | noinline int btrfs_record_root_in_trans(struct btrfs_root *root) | ||
91 | { | ||
92 | struct btrfs_dirty_root *dirty; | ||
93 | u64 running_trans_id = root->fs_info->running_transaction->transid; | ||
94 | if (root->ref_cows && root->last_trans < running_trans_id) { | ||
95 | WARN_ON(root == root->fs_info->extent_root); | ||
96 | if (root->root_item.refs != 0) { | ||
97 | radix_tree_tag_set(&root->fs_info->fs_roots_radix, | ||
98 | (unsigned long)root->root_key.objectid, | ||
99 | BTRFS_ROOT_TRANS_TAG); | ||
100 | |||
101 | dirty = kmalloc(sizeof(*dirty), GFP_NOFS); | ||
102 | BUG_ON(!dirty); | ||
103 | dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); | ||
104 | BUG_ON(!dirty->root); | ||
105 | dirty->latest_root = root; | ||
106 | INIT_LIST_HEAD(&dirty->list); | ||
107 | |||
108 | root->commit_root = btrfs_root_node(root); | ||
109 | |||
110 | memcpy(dirty->root, root, sizeof(*root)); | ||
111 | spin_lock_init(&dirty->root->node_lock); | ||
112 | spin_lock_init(&dirty->root->list_lock); | ||
113 | mutex_init(&dirty->root->objectid_mutex); | ||
114 | mutex_init(&dirty->root->log_mutex); | ||
115 | INIT_LIST_HEAD(&dirty->root->dead_list); | ||
116 | dirty->root->node = root->commit_root; | ||
117 | dirty->root->commit_root = NULL; | ||
118 | |||
119 | spin_lock(&root->list_lock); | ||
120 | list_add(&dirty->root->dead_list, &root->dead_list); | ||
121 | spin_unlock(&root->list_lock); | ||
122 | |||
123 | root->dirty_root = dirty; | ||
124 | } else { | ||
125 | WARN_ON(1); | ||
126 | } | ||
127 | root->last_trans = running_trans_id; | ||
128 | } | ||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | /* wait for commit against the current transaction to become unblocked | ||
133 | * when this is done, it is safe to start a new transaction, but the current | ||
134 | * transaction might not be fully on disk. | ||
135 | */ | ||
136 | static void wait_current_trans(struct btrfs_root *root) | ||
137 | { | ||
138 | struct btrfs_transaction *cur_trans; | ||
139 | |||
140 | cur_trans = root->fs_info->running_transaction; | ||
141 | if (cur_trans && cur_trans->blocked) { | ||
142 | DEFINE_WAIT(wait); | ||
143 | cur_trans->use_count++; | ||
144 | while (1) { | ||
145 | prepare_to_wait(&root->fs_info->transaction_wait, &wait, | ||
146 | TASK_UNINTERRUPTIBLE); | ||
147 | if (cur_trans->blocked) { | ||
148 | mutex_unlock(&root->fs_info->trans_mutex); | ||
149 | schedule(); | ||
150 | mutex_lock(&root->fs_info->trans_mutex); | ||
151 | finish_wait(&root->fs_info->transaction_wait, | ||
152 | &wait); | ||
153 | } else { | ||
154 | finish_wait(&root->fs_info->transaction_wait, | ||
155 | &wait); | ||
156 | break; | ||
157 | } | ||
158 | } | ||
159 | put_transaction(cur_trans); | ||
160 | } | ||
161 | } | ||
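The loop above is the standard kernel waitqueue idiom: queue the waiter first, then re-check the condition, so a wakeup arriving between the check and schedule() cannot be lost. In skeleton form (wq and need_to_sleep stand in for transaction_wait and cur_trans->blocked):

	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!need_to_sleep)
			break;
		/* drop any lock the waker needs, then sleep */
		schedule();
	}
	finish_wait(&wq, &wait);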
162 | |||
163 | static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | ||
164 | int num_blocks, int wait) | ||
165 | { | ||
166 | struct btrfs_trans_handle *h = | ||
167 | kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); | ||
168 | int ret; | ||
169 | |||
170 | mutex_lock(&root->fs_info->trans_mutex); | ||
171 | if (!root->fs_info->log_root_recovering && | ||
172 | ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) | ||
173 | wait_current_trans(root); | ||
174 | ret = join_transaction(root); | ||
175 | BUG_ON(ret); | ||
176 | |||
177 | btrfs_record_root_in_trans(root); | ||
178 | h->transid = root->fs_info->running_transaction->transid; | ||
179 | h->transaction = root->fs_info->running_transaction; | ||
180 | h->blocks_reserved = num_blocks; | ||
181 | h->blocks_used = 0; | ||
182 | h->block_group = 0; | ||
183 | h->alloc_exclude_nr = 0; | ||
184 | h->alloc_exclude_start = 0; | ||
185 | root->fs_info->running_transaction->use_count++; | ||
186 | mutex_unlock(&root->fs_info->trans_mutex); | ||
187 | return h; | ||
188 | } | ||
189 | |||
190 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, | ||
191 | int num_blocks) | ||
192 | { | ||
193 | return start_transaction(root, num_blocks, 1); | ||
194 | } | ||
195 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, | ||
196 | int num_blocks) | ||
197 | { | ||
198 | return start_transaction(root, num_blocks, 0); | ||
199 | } | ||
200 | |||
201 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, | ||
202 | int num_blocks) | ||
203 | { | ||
204 | return start_transaction(r, num_blocks, 2); | ||
205 | } | ||
206 | |||
207 | /* wait for a transaction commit to be fully complete */ | ||
208 | static noinline int wait_for_commit(struct btrfs_root *root, | ||
209 | struct btrfs_transaction *commit) | ||
210 | { | ||
211 | DEFINE_WAIT(wait); | ||
212 | mutex_lock(&root->fs_info->trans_mutex); | ||
213 | while (!commit->commit_done) { | ||
214 | prepare_to_wait(&commit->commit_wait, &wait, | ||
215 | TASK_UNINTERRUPTIBLE); | ||
216 | if (commit->commit_done) | ||
217 | break; | ||
218 | mutex_unlock(&root->fs_info->trans_mutex); | ||
219 | schedule(); | ||
220 | mutex_lock(&root->fs_info->trans_mutex); | ||
221 | } | ||
222 | mutex_unlock(&root->fs_info->trans_mutex); | ||
223 | finish_wait(&commit->commit_wait, &wait); | ||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * rate limit against the drop_snapshot code. This helps to slow down new | ||
229 | * operations if the drop_snapshot code isn't able to keep up. | ||
230 | */ | ||
231 | static void throttle_on_drops(struct btrfs_root *root) | ||
232 | { | ||
233 | struct btrfs_fs_info *info = root->fs_info; | ||
234 | int harder_count = 0; | ||
235 | |||
236 | harder: | ||
237 | if (atomic_read(&info->throttles)) { | ||
238 | DEFINE_WAIT(wait); | ||
239 | int thr; | ||
240 | thr = atomic_read(&info->throttle_gen); | ||
241 | |||
242 | do { | ||
243 | prepare_to_wait(&info->transaction_throttle, | ||
244 | &wait, TASK_UNINTERRUPTIBLE); | ||
245 | if (!atomic_read(&info->throttles)) { | ||
246 | finish_wait(&info->transaction_throttle, &wait); | ||
247 | break; | ||
248 | } | ||
249 | schedule(); | ||
250 | finish_wait(&info->transaction_throttle, &wait); | ||
251 | } while (thr == atomic_read(&info->throttle_gen)); | ||
252 | harder_count++; | ||
253 | |||
254 | if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && | ||
255 | harder_count < 2) | ||
256 | goto harder; | ||
257 | |||
258 | if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && | ||
259 | harder_count < 10) | ||
260 | goto harder; | ||
261 | |||
262 | if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && | ||
263 | harder_count < 20) | ||
264 | goto harder; | ||
265 | } | ||
266 | } | ||
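Net effect of the thresholds: a caller makes at most 2 passes through the wait once total_ref_cache_size exceeds 1 MiB, at most 10 past 5 MiB, and at most 20 past 10 MiB, so the backpressure on new operations scales with how far behind drop_snapshot has fallen.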
267 | |||
268 | void btrfs_throttle(struct btrfs_root *root) | ||
269 | { | ||
270 | mutex_lock(&root->fs_info->trans_mutex); | ||
271 | if (!root->fs_info->open_ioctl_trans) | ||
272 | wait_current_trans(root); | ||
273 | mutex_unlock(&root->fs_info->trans_mutex); | ||
274 | |||
275 | throttle_on_drops(root); | ||
276 | } | ||
277 | |||
278 | static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | ||
279 | struct btrfs_root *root, int throttle) | ||
280 | { | ||
281 | struct btrfs_transaction *cur_trans; | ||
282 | struct btrfs_fs_info *info = root->fs_info; | ||
283 | |||
284 | mutex_lock(&info->trans_mutex); | ||
285 | cur_trans = info->running_transaction; | ||
286 | WARN_ON(cur_trans != trans->transaction); | ||
287 | WARN_ON(cur_trans->num_writers < 1); | ||
288 | cur_trans->num_writers--; | ||
289 | |||
290 | if (waitqueue_active(&cur_trans->writer_wait)) | ||
291 | wake_up(&cur_trans->writer_wait); | ||
292 | put_transaction(cur_trans); | ||
293 | mutex_unlock(&info->trans_mutex); | ||
294 | memset(trans, 0, sizeof(*trans)); | ||
295 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | ||
296 | |||
297 | if (throttle) | ||
298 | throttle_on_drops(root); | ||
299 | |||
300 | return 0; | ||
301 | } | ||
302 | |||
303 | int btrfs_end_transaction(struct btrfs_trans_handle *trans, | ||
304 | struct btrfs_root *root) | ||
305 | { | ||
306 | return __btrfs_end_transaction(trans, root, 0); | ||
307 | } | ||
308 | |||
309 | int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, | ||
310 | struct btrfs_root *root) | ||
311 | { | ||
312 | return __btrfs_end_transaction(trans, root, 1); | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * when btree blocks are allocated, they have some corresponding bits set for | ||
317 | * them in one of two extent_io trees. This is used to make sure all of | ||
318 | * those extents are on disk for transaction or log commit | ||
319 | */ | ||
320 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | ||
321 | struct extent_io_tree *dirty_pages) | ||
322 | { | ||
323 | int ret; | ||
324 | int err = 0; | ||
325 | int werr = 0; | ||
326 | struct page *page; | ||
327 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
328 | u64 start = 0; | ||
329 | u64 end; | ||
330 | unsigned long index; | ||
331 | |||
332 | while (1) { | ||
333 | ret = find_first_extent_bit(dirty_pages, start, &start, &end, | ||
334 | EXTENT_DIRTY); | ||
335 | if (ret) | ||
336 | break; | ||
337 | while (start <= end) { | ||
338 | cond_resched(); | ||
339 | |||
340 | index = start >> PAGE_CACHE_SHIFT; | ||
341 | start = (u64)(index + 1) << PAGE_CACHE_SHIFT; | ||
342 | page = find_get_page(btree_inode->i_mapping, index); | ||
343 | if (!page) | ||
344 | continue; | ||
345 | |||
346 | btree_lock_page_hook(page); | ||
347 | if (!page->mapping) { | ||
348 | unlock_page(page); | ||
349 | page_cache_release(page); | ||
350 | continue; | ||
351 | } | ||
352 | |||
353 | if (PageWriteback(page)) { | ||
354 | if (PageDirty(page)) | ||
355 | wait_on_page_writeback(page); | ||
356 | else { | ||
357 | unlock_page(page); | ||
358 | page_cache_release(page); | ||
359 | continue; | ||
360 | } | ||
361 | } | ||
362 | err = write_one_page(page, 0); | ||
363 | if (err) | ||
364 | werr = err; | ||
365 | page_cache_release(page); | ||
366 | } | ||
367 | } | ||
368 | while (1) { | ||
369 | ret = find_first_extent_bit(dirty_pages, 0, &start, &end, | ||
370 | EXTENT_DIRTY); | ||
371 | if (ret) | ||
372 | break; | ||
373 | |||
374 | clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); | ||
375 | while (start <= end) { | ||
376 | index = start >> PAGE_CACHE_SHIFT; | ||
377 | start = (u64)(index + 1) << PAGE_CACHE_SHIFT; | ||
378 | page = find_get_page(btree_inode->i_mapping, index); | ||
379 | if (!page) | ||
380 | continue; | ||
381 | if (PageDirty(page)) { | ||
382 | btree_lock_page_hook(page); | ||
383 | wait_on_page_writeback(page); | ||
384 | err = write_one_page(page, 0); | ||
385 | if (err) | ||
386 | werr = err; | ||
387 | } | ||
388 | wait_on_page_writeback(page); | ||
389 | page_cache_release(page); | ||
390 | cond_resched(); | ||
391 | } | ||
392 | } | ||
393 | if (err) | ||
394 | werr = err; | ||
395 | return werr; | ||
396 | } | ||
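btrfs_write_and_wait_marked_extents() above makes two passes over the EXTENT_DIRTY ranges; schematically:

/*
 * pass 1: for each dirty range, write_one_page() every page that is
 *         dirty and not already under writeback (start the I/O)
 * pass 2: re-walk the ranges, clear the EXTENT_DIRTY bits, re-write
 *         any page dirtied in the meantime, and wait_on_page_writeback()
 *         on each so the caller sees stable metadata on disk
 */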
397 | |||
398 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | ||
399 | struct btrfs_root *root) | ||
400 | { | ||
401 | if (!trans || !trans->transaction) { | ||
402 | struct inode *btree_inode; | ||
403 | btree_inode = root->fs_info->btree_inode; | ||
404 | return filemap_write_and_wait(btree_inode->i_mapping); | ||
405 | } | ||
406 | return btrfs_write_and_wait_marked_extents(root, | ||
407 | &trans->transaction->dirty_pages); | ||
408 | } | ||
409 | |||
410 | /* | ||
411 | * this is used to update the root pointer in the tree of tree roots. | ||
412 | * | ||
413 | * But, in the case of the extent allocation tree, updating the root | ||
414 | * pointer may allocate blocks which may change the root of the extent | ||
415 | * allocation tree. | ||
416 | * | ||
417 | * So, this loops and repeats and makes sure the cowonly root didn't | ||
418 | * change while the root pointer was being updated in the metadata. | ||
419 | */ | ||
420 | static int update_cowonly_root(struct btrfs_trans_handle *trans, | ||
421 | struct btrfs_root *root) | ||
422 | { | ||
423 | int ret; | ||
424 | u64 old_root_bytenr; | ||
425 | struct btrfs_root *tree_root = root->fs_info->tree_root; | ||
426 | |||
427 | btrfs_extent_post_op(trans, root); | ||
428 | btrfs_write_dirty_block_groups(trans, root); | ||
429 | btrfs_extent_post_op(trans, root); | ||
430 | |||
431 | while (1) { | ||
432 | old_root_bytenr = btrfs_root_bytenr(&root->root_item); | ||
433 | if (old_root_bytenr == root->node->start) | ||
434 | break; | ||
435 | btrfs_set_root_bytenr(&root->root_item, | ||
436 | root->node->start); | ||
437 | btrfs_set_root_level(&root->root_item, | ||
438 | btrfs_header_level(root->node)); | ||
439 | btrfs_set_root_generation(&root->root_item, trans->transid); | ||
440 | |||
441 | btrfs_extent_post_op(trans, root); | ||
442 | |||
443 | ret = btrfs_update_root(trans, tree_root, | ||
444 | &root->root_key, | ||
445 | &root->root_item); | ||
446 | BUG_ON(ret); | ||
447 | btrfs_write_dirty_block_groups(trans, root); | ||
448 | btrfs_extent_post_op(trans, root); | ||
449 | } | ||
450 | return 0; | ||
451 | } | ||
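The loop terminates once writing the root pointer stops moving the root, i.e. once updating the tree of tree roots no longer COWs the cowonly root it is recording; in sketch form:

/*
 * while (root_item.bytenr != root->node->start) {
 *         root_item.bytenr = root->node->start;
 *         btrfs_update_root();              // may allocate -> may COW root
 *         btrfs_write_dirty_block_groups(); // ditto
 * }
 */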
452 | |||
453 | /* | ||
454 | * update all the cowonly tree roots on disk | ||
455 | */ | ||
456 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | ||
457 | struct btrfs_root *root) | ||
458 | { | ||
459 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
460 | struct list_head *next; | ||
461 | struct extent_buffer *eb; | ||
462 | |||
463 | btrfs_extent_post_op(trans, fs_info->tree_root); | ||
464 | |||
465 | eb = btrfs_lock_root_node(fs_info->tree_root); | ||
466 | btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); | ||
467 | btrfs_tree_unlock(eb); | ||
468 | free_extent_buffer(eb); | ||
469 | |||
470 | btrfs_extent_post_op(trans, fs_info->tree_root); | ||
471 | |||
472 | while (!list_empty(&fs_info->dirty_cowonly_roots)) { | ||
473 | next = fs_info->dirty_cowonly_roots.next; | ||
474 | list_del_init(next); | ||
475 | root = list_entry(next, struct btrfs_root, dirty_list); | ||
476 | |||
477 | update_cowonly_root(trans, root); | ||
478 | } | ||
479 | return 0; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * dead roots are old snapshots that need to be deleted. This allocates | ||
484 | * a dirty root struct and adds it into the list of dead roots that need to | ||
485 | * be deleted | ||
486 | */ | ||
487 | int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) | ||
488 | { | ||
489 | struct btrfs_dirty_root *dirty; | ||
490 | |||
491 | dirty = kmalloc(sizeof(*dirty), GFP_NOFS); | ||
492 | if (!dirty) | ||
493 | return -ENOMEM; | ||
494 | dirty->root = root; | ||
495 | dirty->latest_root = latest; | ||
496 | |||
497 | mutex_lock(&root->fs_info->trans_mutex); | ||
498 | list_add(&dirty->list, &latest->fs_info->dead_roots); | ||
499 | mutex_unlock(&root->fs_info->trans_mutex); | ||
500 | return 0; | ||
501 | } | ||
502 | |||
503 | /* | ||
504 | * at transaction commit time we need to schedule the old roots for | ||
505 | * deletion via btrfs_drop_snapshot. This runs through all the | ||
506 | * reference counted roots that were modified in the current | ||
507 | * transaction and puts them into the drop list | ||
508 | */ | ||
509 | static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, | ||
510 | struct radix_tree_root *radix, | ||
511 | struct list_head *list) | ||
512 | { | ||
513 | struct btrfs_dirty_root *dirty; | ||
514 | struct btrfs_root *gang[8]; | ||
515 | struct btrfs_root *root; | ||
516 | int i; | ||
517 | int ret; | ||
518 | int err = 0; | ||
519 | u32 refs; | ||
520 | |||
521 | while (1) { | ||
522 | ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, | ||
523 | ARRAY_SIZE(gang), | ||
524 | BTRFS_ROOT_TRANS_TAG); | ||
525 | if (ret == 0) | ||
526 | break; | ||
527 | for (i = 0; i < ret; i++) { | ||
528 | root = gang[i]; | ||
529 | radix_tree_tag_clear(radix, | ||
530 | (unsigned long)root->root_key.objectid, | ||
531 | BTRFS_ROOT_TRANS_TAG); | ||
532 | |||
533 | BUG_ON(!root->ref_tree); | ||
534 | dirty = root->dirty_root; | ||
535 | |||
536 | btrfs_free_log(trans, root); | ||
537 | btrfs_free_reloc_root(trans, root); | ||
538 | |||
539 | if (root->commit_root == root->node) { | ||
540 | WARN_ON(root->node->start != | ||
541 | btrfs_root_bytenr(&root->root_item)); | ||
542 | |||
543 | free_extent_buffer(root->commit_root); | ||
544 | root->commit_root = NULL; | ||
545 | root->dirty_root = NULL; | ||
546 | |||
547 | spin_lock(&root->list_lock); | ||
548 | list_del_init(&dirty->root->dead_list); | ||
549 | spin_unlock(&root->list_lock); | ||
550 | |||
551 | kfree(dirty->root); | ||
552 | kfree(dirty); | ||
553 | |||
554 | /* make sure to update the root on disk | ||
555 | * so we get any updates to the block used | ||
556 | * counts | ||
557 | */ | ||
558 | err = btrfs_update_root(trans, | ||
559 | root->fs_info->tree_root, | ||
560 | &root->root_key, | ||
561 | &root->root_item); | ||
562 | continue; | ||
563 | } | ||
564 | |||
565 | memset(&root->root_item.drop_progress, 0, | ||
566 | sizeof(struct btrfs_disk_key)); | ||
567 | root->root_item.drop_level = 0; | ||
568 | root->commit_root = NULL; | ||
569 | root->dirty_root = NULL; | ||
570 | root->root_key.offset = root->fs_info->generation; | ||
571 | btrfs_set_root_bytenr(&root->root_item, | ||
572 | root->node->start); | ||
573 | btrfs_set_root_level(&root->root_item, | ||
574 | btrfs_header_level(root->node)); | ||
575 | btrfs_set_root_generation(&root->root_item, | ||
576 | root->root_key.offset); | ||
577 | |||
578 | err = btrfs_insert_root(trans, root->fs_info->tree_root, | ||
579 | &root->root_key, | ||
580 | &root->root_item); | ||
581 | if (err) | ||
582 | break; | ||
583 | |||
584 | refs = btrfs_root_refs(&dirty->root->root_item); | ||
585 | btrfs_set_root_refs(&dirty->root->root_item, refs - 1); | ||
586 | err = btrfs_update_root(trans, root->fs_info->tree_root, | ||
587 | &dirty->root->root_key, | ||
588 | &dirty->root->root_item); | ||
589 | |||
590 | BUG_ON(err); | ||
591 | if (refs == 1) { | ||
592 | list_add(&dirty->list, list); | ||
593 | } else { | ||
594 | WARN_ON(1); | ||
595 | free_extent_buffer(dirty->root->node); | ||
596 | kfree(dirty->root); | ||
597 | kfree(dirty); | ||
598 | } | ||
599 | } | ||
600 | } | ||
601 | return err; | ||
602 | } | ||
603 | |||
604 | /* | ||
605 | * defrag a given btree. If cacheonly == 1, this won't read from the disk, | ||
606 | * otherwise every leaf in the btree is read and defragged. | ||
607 | */ | ||
608 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | ||
609 | { | ||
610 | struct btrfs_fs_info *info = root->fs_info; | ||
611 | int ret; | ||
612 | struct btrfs_trans_handle *trans; | ||
613 | unsigned long nr; | ||
614 | |||
615 | smp_mb(); | ||
616 | if (root->defrag_running) | ||
617 | return 0; | ||
618 | trans = btrfs_start_transaction(root, 1); | ||
619 | while (1) { | ||
620 | root->defrag_running = 1; | ||
621 | ret = btrfs_defrag_leaves(trans, root, cacheonly); | ||
622 | nr = trans->blocks_used; | ||
623 | btrfs_end_transaction(trans, root); | ||
624 | btrfs_btree_balance_dirty(info->tree_root, nr); | ||
625 | cond_resched(); | ||
626 | |||
627 | trans = btrfs_start_transaction(root, 1); | ||
628 | if (root->fs_info->closing || ret != -EAGAIN) | ||
629 | break; | ||
630 | } | ||
631 | root->defrag_running = 0; | ||
632 | smp_mb(); | ||
633 | btrfs_end_transaction(trans, root); | ||
634 | return 0; | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on | ||
639 | * all of them | ||
640 | */ | ||
641 | static noinline int drop_dirty_roots(struct btrfs_root *tree_root, | ||
642 | struct list_head *list) | ||
643 | { | ||
644 | struct btrfs_dirty_root *dirty; | ||
645 | struct btrfs_trans_handle *trans; | ||
646 | unsigned long nr; | ||
647 | u64 num_bytes; | ||
648 | u64 bytes_used; | ||
649 | u64 max_useless; | ||
650 | int ret = 0; | ||
651 | int err; | ||
652 | |||
653 | while (!list_empty(list)) { | ||
654 | struct btrfs_root *root; | ||
655 | |||
656 | dirty = list_entry(list->prev, struct btrfs_dirty_root, list); | ||
657 | list_del_init(&dirty->list); | ||
658 | |||
659 | num_bytes = btrfs_root_used(&dirty->root->root_item); | ||
660 | root = dirty->latest_root; | ||
661 | atomic_inc(&root->fs_info->throttles); | ||
662 | |||
663 | while (1) { | ||
664 | trans = btrfs_start_transaction(tree_root, 1); | ||
665 | mutex_lock(&root->fs_info->drop_mutex); | ||
666 | ret = btrfs_drop_snapshot(trans, dirty->root); | ||
667 | if (ret != -EAGAIN) | ||
668 | break; | ||
669 | mutex_unlock(&root->fs_info->drop_mutex); | ||
670 | |||
671 | err = btrfs_update_root(trans, | ||
672 | tree_root, | ||
673 | &dirty->root->root_key, | ||
674 | &dirty->root->root_item); | ||
675 | if (err) | ||
676 | ret = err; | ||
677 | nr = trans->blocks_used; | ||
678 | ret = btrfs_end_transaction(trans, tree_root); | ||
679 | BUG_ON(ret); | ||
680 | |||
681 | btrfs_btree_balance_dirty(tree_root, nr); | ||
682 | cond_resched(); | ||
683 | } | ||
684 | BUG_ON(ret); | ||
685 | atomic_dec(&root->fs_info->throttles); | ||
686 | wake_up(&root->fs_info->transaction_throttle); | ||
687 | |||
688 | num_bytes -= btrfs_root_used(&dirty->root->root_item); | ||
689 | bytes_used = btrfs_root_used(&root->root_item); | ||
690 | if (num_bytes) { | ||
691 | btrfs_record_root_in_trans(root); | ||
692 | btrfs_set_root_used(&root->root_item, | ||
693 | bytes_used - num_bytes); | ||
694 | } | ||
695 | |||
696 | ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); | ||
697 | if (ret) { | ||
698 | BUG(); | ||
699 | break; | ||
700 | } | ||
701 | mutex_unlock(&root->fs_info->drop_mutex); | ||
702 | |||
703 | spin_lock(&root->list_lock); | ||
704 | list_del_init(&dirty->root->dead_list); | ||
705 | if (!list_empty(&root->dead_list)) { | ||
706 | struct btrfs_root *oldest; | ||
707 | oldest = list_entry(root->dead_list.prev, | ||
708 | struct btrfs_root, dead_list); | ||
709 | max_useless = oldest->root_key.offset - 1; | ||
710 | } else { | ||
711 | max_useless = root->root_key.offset - 1; | ||
712 | } | ||
713 | spin_unlock(&root->list_lock); | ||
714 | |||
715 | nr = trans->blocks_used; | ||
716 | ret = btrfs_end_transaction(trans, tree_root); | ||
717 | BUG_ON(ret); | ||
718 | |||
719 | ret = btrfs_remove_leaf_refs(root, max_useless, 0); | ||
720 | BUG_ON(ret); | ||
721 | |||
722 | free_extent_buffer(dirty->root->node); | ||
723 | kfree(dirty->root); | ||
724 | kfree(dirty); | ||
725 | |||
726 | btrfs_btree_balance_dirty(tree_root, nr); | ||
727 | cond_resched(); | ||
728 | } | ||
729 | return ret; | ||
730 | } | ||
731 | |||
732 | /* | ||
733 | * new snapshots need to be created at a very specific time in the | ||
734 | * transaction commit. This does the actual creation | ||
735 | */ | ||
736 | static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | ||
737 | struct btrfs_fs_info *fs_info, | ||
738 | struct btrfs_pending_snapshot *pending) | ||
739 | { | ||
740 | struct btrfs_key key; | ||
741 | struct btrfs_root_item *new_root_item; | ||
742 | struct btrfs_root *tree_root = fs_info->tree_root; | ||
743 | struct btrfs_root *root = pending->root; | ||
744 | struct extent_buffer *tmp; | ||
745 | struct extent_buffer *old; | ||
746 | int ret; | ||
747 | u64 objectid; | ||
748 | |||
749 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); | ||
750 | if (!new_root_item) { | ||
751 | ret = -ENOMEM; | ||
752 | goto fail; | ||
753 | } | ||
754 | ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); | ||
755 | if (ret) | ||
756 | goto fail; | ||
757 | |||
758 | btrfs_record_root_in_trans(root); | ||
759 | btrfs_set_root_last_snapshot(&root->root_item, trans->transid); | ||
760 | memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); | ||
761 | |||
762 | key.objectid = objectid; | ||
763 | key.offset = trans->transid; | ||
764 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
765 | |||
766 | old = btrfs_lock_root_node(root); | ||
767 | btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); | ||
768 | |||
769 | btrfs_copy_root(trans, root, old, &tmp, objectid); | ||
770 | btrfs_tree_unlock(old); | ||
771 | free_extent_buffer(old); | ||
772 | |||
773 | btrfs_set_root_bytenr(new_root_item, tmp->start); | ||
774 | btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); | ||
775 | btrfs_set_root_generation(new_root_item, trans->transid); | ||
776 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, | ||
777 | new_root_item); | ||
778 | btrfs_tree_unlock(tmp); | ||
779 | free_extent_buffer(tmp); | ||
780 | if (ret) | ||
781 | goto fail; | ||
782 | |||
783 | key.offset = (u64)-1; | ||
784 | memcpy(&pending->root_key, &key, sizeof(key)); | ||
785 | fail: | ||
786 | kfree(new_root_item); | ||
787 | return ret; | ||
788 | } | ||
789 | |||
790 | static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, | ||
791 | struct btrfs_pending_snapshot *pending) | ||
792 | { | ||
793 | int ret; | ||
794 | int namelen; | ||
795 | u64 index = 0; | ||
796 | struct btrfs_trans_handle *trans; | ||
797 | struct inode *parent_inode; | ||
798 | struct inode *inode; | ||
799 | struct btrfs_root *parent_root; | ||
800 | |||
801 | parent_inode = pending->dentry->d_parent->d_inode; | ||
802 | parent_root = BTRFS_I(parent_inode)->root; | ||
803 | trans = btrfs_join_transaction(parent_root, 1); | ||
804 | |||
805 | /* | ||
806 | * insert the directory item | ||
807 | */ | ||
808 | namelen = strlen(pending->name); | ||
809 | ret = btrfs_set_inode_index(parent_inode, &index); | ||
810 | ret = btrfs_insert_dir_item(trans, parent_root, | ||
811 | pending->name, namelen, | ||
812 | parent_inode->i_ino, | ||
813 | &pending->root_key, BTRFS_FT_DIR, index); | ||
814 | |||
815 | if (ret) | ||
816 | goto fail; | ||
817 | |||
818 | btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); | ||
819 | ret = btrfs_update_inode(trans, parent_root, parent_inode); | ||
820 | BUG_ON(ret); | ||
821 | |||
822 | /* add the backref first */ | ||
823 | ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, | ||
824 | pending->root_key.objectid, | ||
825 | BTRFS_ROOT_BACKREF_KEY, | ||
826 | parent_root->root_key.objectid, | ||
827 | parent_inode->i_ino, index, pending->name, | ||
828 | namelen); | ||
829 | |||
830 | BUG_ON(ret); | ||
831 | |||
832 | /* now add the forward ref */ | ||
833 | ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, | ||
834 | parent_root->root_key.objectid, | ||
835 | BTRFS_ROOT_REF_KEY, | ||
836 | pending->root_key.objectid, | ||
837 | parent_inode->i_ino, index, pending->name, | ||
838 | namelen); | ||
839 | |||
840 | inode = btrfs_lookup_dentry(parent_inode, pending->dentry); | ||
841 | d_instantiate(pending->dentry, inode); | ||
842 | fail: | ||
843 | btrfs_end_transaction(trans, fs_info->fs_root); | ||
844 | return ret; | ||
845 | } | ||
846 | |||
847 | /* | ||
848 | * create all the snapshots we've scheduled for creation | ||
849 | */ | ||
850 | static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, | ||
851 | struct btrfs_fs_info *fs_info) | ||
852 | { | ||
853 | struct btrfs_pending_snapshot *pending; | ||
854 | struct list_head *head = &trans->transaction->pending_snapshots; | ||
855 | struct list_head *cur; | ||
856 | int ret; | ||
857 | |||
858 | list_for_each(cur, head) { | ||
859 | pending = list_entry(cur, struct btrfs_pending_snapshot, list); | ||
860 | ret = create_pending_snapshot(trans, fs_info, pending); | ||
861 | BUG_ON(ret); | ||
862 | } | ||
863 | return 0; | ||
864 | } | ||
865 | |||
866 | static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, | ||
867 | struct btrfs_fs_info *fs_info) | ||
868 | { | ||
869 | struct btrfs_pending_snapshot *pending; | ||
870 | struct list_head *head = &trans->transaction->pending_snapshots; | ||
871 | int ret; | ||
872 | |||
873 | while (!list_empty(head)) { | ||
874 | pending = list_entry(head->next, | ||
875 | struct btrfs_pending_snapshot, list); | ||
876 | ret = finish_pending_snapshot(fs_info, pending); | ||
877 | BUG_ON(ret); | ||
878 | list_del(&pending->list); | ||
879 | kfree(pending->name); | ||
880 | kfree(pending); | ||
881 | } | ||
882 | return 0; | ||
883 | } | ||
884 | |||
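[editor's note] The two helpers above pair the standard kernel list idioms: create_pending_snapshots() visits every pending entry without consuming it (the entries are still needed for the directory inserts that happen after the super block hits disk), while finish_pending_snapshots() drains and frees the list. A generic sketch of the two idioms, with a hypothetical struct item and handle() standing in for the real types:

	#include <linux/list.h>
	#include <linux/slab.h>

	struct item {
		struct list_head list;
		/* payload ... */
	};

	static void handle(struct item *it);	/* hypothetical per-entry work */

	/* visit without consuming: entries stay linked for a later phase */
	static void visit_all(struct list_head *head)
	{
		struct list_head *cur;

		list_for_each(cur, head)
			handle(list_entry(cur, struct item, list));
	}

	/* drain: each entry is unlinked and freed as it is processed */
	static void drain_all(struct list_head *head)
	{
		while (!list_empty(head)) {
			struct item *it;

			it = list_entry(head->next, struct item, list);
			handle(it);
			list_del(&it->list);
			kfree(it);
		}
	}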
885 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | ||
886 | struct btrfs_root *root) | ||
887 | { | ||
888 | unsigned long joined = 0; | ||
889 | unsigned long timeout = 1; | ||
890 | struct btrfs_transaction *cur_trans; | ||
891 | struct btrfs_transaction *prev_trans = NULL; | ||
892 | struct btrfs_root *chunk_root = root->fs_info->chunk_root; | ||
893 | struct list_head dirty_fs_roots; | ||
894 | struct extent_io_tree *pinned_copy; | ||
895 | DEFINE_WAIT(wait); | ||
896 | int ret; | ||
897 | |||
898 | INIT_LIST_HEAD(&dirty_fs_roots); | ||
899 | mutex_lock(&root->fs_info->trans_mutex); | ||
900 | if (trans->transaction->in_commit) { | ||
901 | cur_trans = trans->transaction; | ||
902 | trans->transaction->use_count++; | ||
903 | mutex_unlock(&root->fs_info->trans_mutex); | ||
904 | btrfs_end_transaction(trans, root); | ||
905 | |||
906 | ret = wait_for_commit(root, cur_trans); | ||
907 | BUG_ON(ret); | ||
908 | |||
909 | mutex_lock(&root->fs_info->trans_mutex); | ||
910 | put_transaction(cur_trans); | ||
911 | mutex_unlock(&root->fs_info->trans_mutex); | ||
912 | |||
913 | return 0; | ||
914 | } | ||
915 | |||
916 | pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); | ||
917 | if (!pinned_copy) | ||
918 | return -ENOMEM; | ||
919 | |||
920 | extent_io_tree_init(pinned_copy, | ||
921 | root->fs_info->btree_inode->i_mapping, GFP_NOFS); | ||
922 | |||
923 | trans->transaction->in_commit = 1; | ||
924 | trans->transaction->blocked = 1; | ||
925 | cur_trans = trans->transaction; | ||
926 | if (cur_trans->list.prev != &root->fs_info->trans_list) { | ||
927 | prev_trans = list_entry(cur_trans->list.prev, | ||
928 | struct btrfs_transaction, list); | ||
929 | if (!prev_trans->commit_done) { | ||
930 | prev_trans->use_count++; | ||
931 | mutex_unlock(&root->fs_info->trans_mutex); | ||
932 | |||
933 | wait_for_commit(root, prev_trans); | ||
934 | |||
935 | mutex_lock(&root->fs_info->trans_mutex); | ||
936 | put_transaction(prev_trans); | ||
937 | } | ||
938 | } | ||
939 | |||
940 | do { | ||
941 | int snap_pending = 0; | ||
942 | joined = cur_trans->num_joined; | ||
943 | if (!list_empty(&trans->transaction->pending_snapshots)) | ||
944 | snap_pending = 1; | ||
945 | |||
946 | WARN_ON(cur_trans != trans->transaction); | ||
947 | prepare_to_wait(&cur_trans->writer_wait, &wait, | ||
948 | TASK_UNINTERRUPTIBLE); | ||
949 | |||
950 | if (cur_trans->num_writers > 1) | ||
951 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
952 | else | ||
953 | timeout = 1; | ||
954 | |||
955 | mutex_unlock(&root->fs_info->trans_mutex); | ||
956 | |||
957 | if (snap_pending) { | ||
958 | ret = btrfs_wait_ordered_extents(root, 1); | ||
959 | BUG_ON(ret); | ||
960 | } | ||
961 | |||
962 | schedule_timeout(timeout); | ||
963 | |||
964 | mutex_lock(&root->fs_info->trans_mutex); | ||
965 | finish_wait(&cur_trans->writer_wait, &wait); | ||
966 | } while (cur_trans->num_writers > 1 || | ||
967 | (cur_trans->num_joined != joined)); | ||
968 | |||
969 | ret = create_pending_snapshots(trans, root->fs_info); | ||
970 | BUG_ON(ret); | ||
971 | |||
972 | WARN_ON(cur_trans != trans->transaction); | ||
973 | |||
974 | /* btrfs_commit_tree_roots is responsible for getting the | ||
975 | * various roots consistent with each other. Every pointer | ||
976 | * in the tree of tree roots has to point to the most up to date | ||
977 | * root for every subvolume and other tree. So, we have to keep | ||
978 | * the tree logging code from jumping in and changing any | ||
979 | * of the trees. | ||
980 | * | ||
981 | * At this point in the commit, there can't be any tree-log | ||
982 | * writers, but a little lower down we drop the trans mutex | ||
983 | * and let new people in. By holding the tree_log_mutex | ||
984 | * from now until after the super is written, we avoid races | ||
985 | * with the tree-log code. | ||
986 | */ | ||
987 | mutex_lock(&root->fs_info->tree_log_mutex); | ||
988 | /* | ||
989 | * keep tree reloc code from adding new reloc trees | ||
990 | */ | ||
991 | mutex_lock(&root->fs_info->tree_reloc_mutex); | ||
992 | |||
993 | |||
994 | ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, | ||
995 | &dirty_fs_roots); | ||
996 | BUG_ON(ret); | ||
997 | |||
998 | /* add_dirty_roots gets rid of all the tree log roots; it is now | ||
999 | * safe to free the root of the tree of log roots | ||
1000 | */ | ||
1001 | btrfs_free_log_root_tree(trans, root->fs_info); | ||
1002 | |||
1003 | ret = btrfs_commit_tree_roots(trans, root); | ||
1004 | BUG_ON(ret); | ||
1005 | |||
1006 | cur_trans = root->fs_info->running_transaction; | ||
1007 | spin_lock(&root->fs_info->new_trans_lock); | ||
1008 | root->fs_info->running_transaction = NULL; | ||
1009 | spin_unlock(&root->fs_info->new_trans_lock); | ||
1010 | btrfs_set_super_generation(&root->fs_info->super_copy, | ||
1011 | cur_trans->transid); | ||
1012 | btrfs_set_super_root(&root->fs_info->super_copy, | ||
1013 | root->fs_info->tree_root->node->start); | ||
1014 | btrfs_set_super_root_level(&root->fs_info->super_copy, | ||
1015 | btrfs_header_level(root->fs_info->tree_root->node)); | ||
1016 | |||
1017 | btrfs_set_super_chunk_root(&root->fs_info->super_copy, | ||
1018 | chunk_root->node->start); | ||
1019 | btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, | ||
1020 | btrfs_header_level(chunk_root->node)); | ||
1021 | btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, | ||
1022 | btrfs_header_generation(chunk_root->node)); | ||
1023 | |||
1024 | if (!root->fs_info->log_root_recovering) { | ||
1025 | btrfs_set_super_log_root(&root->fs_info->super_copy, 0); | ||
1026 | btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); | ||
1027 | } | ||
1028 | |||
1029 | memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, | ||
1030 | sizeof(root->fs_info->super_copy)); | ||
1031 | |||
1032 | btrfs_copy_pinned(root, pinned_copy); | ||
1033 | |||
1034 | trans->transaction->blocked = 0; | ||
1035 | wake_up(&root->fs_info->transaction_throttle); | ||
1036 | wake_up(&root->fs_info->transaction_wait); | ||
1037 | |||
1038 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1039 | ret = btrfs_write_and_wait_transaction(trans, root); | ||
1040 | BUG_ON(ret); | ||
1041 | write_ctree_super(trans, root, 0); | ||
1042 | |||
1043 | /* | ||
1044 | * the super is written, we can safely allow the tree-loggers | ||
1045 | * to go about their business | ||
1046 | */ | ||
1047 | mutex_unlock(&root->fs_info->tree_log_mutex); | ||
1048 | |||
1049 | btrfs_finish_extent_commit(trans, root, pinned_copy); | ||
1050 | kfree(pinned_copy); | ||
1051 | |||
1052 | btrfs_drop_dead_reloc_roots(root); | ||
1053 | mutex_unlock(&root->fs_info->tree_reloc_mutex); | ||
1054 | |||
1055 | /* do the directory inserts of any pending snapshot creations */ | ||
1056 | finish_pending_snapshots(trans, root->fs_info); | ||
1057 | |||
1058 | mutex_lock(&root->fs_info->trans_mutex); | ||
1059 | |||
1060 | cur_trans->commit_done = 1; | ||
1061 | root->fs_info->last_trans_committed = cur_trans->transid; | ||
1062 | wake_up(&cur_trans->commit_wait); | ||
1063 | |||
1064 | put_transaction(cur_trans); | ||
1065 | put_transaction(cur_trans); | ||
1066 | |||
1067 | list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); | ||
1068 | if (root->fs_info->closing) | ||
1069 | list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); | ||
1070 | |||
1071 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1072 | |||
1073 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | ||
1074 | |||
1075 | if (root->fs_info->closing) | ||
1076 | drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); | ||
1077 | return ret; | ||
1078 | } | ||
1079 | |||
1080 | /* | ||
1081 | * interface function to delete all the snapshots we have scheduled for deletion | ||
1082 | */ | ||
1083 | int btrfs_clean_old_snapshots(struct btrfs_root *root) | ||
1084 | { | ||
1085 | struct list_head dirty_roots; | ||
1086 | INIT_LIST_HEAD(&dirty_roots); | ||
1087 | again: | ||
1088 | mutex_lock(&root->fs_info->trans_mutex); | ||
1089 | list_splice_init(&root->fs_info->dead_roots, &dirty_roots); | ||
1090 | mutex_unlock(&root->fs_info->trans_mutex); | ||
1091 | |||
1092 | if (!list_empty(&dirty_roots)) { | ||
1093 | drop_dirty_roots(root, &dirty_roots); | ||
1094 | goto again; | ||
1095 | } | ||
1096 | return 0; | ||
1097 | } | ||
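[editor's note] One detail of btrfs_commit_transaction() above is worth isolating: the writer-settling loop snapshots num_joined, sleeps on writer_wait, and retries until it is the only writer left and nobody joined while it slept. The bare shape of that loop as a sketch (the trans_mutex juggling and the pending-snapshot ordered-extent flush are deliberately elided):

	#include <linux/sched.h>
	#include <linux/wait.h>

	/* Sketch of the settle loop only; locking and the ordered-extent
	 * flush from the real commit path are intentionally left out. */
	static void settle_writers(struct btrfs_transaction *cur_trans)
	{
		unsigned long joined;
		DEFINE_WAIT(wait);

		do {
			joined = cur_trans->num_joined;
			prepare_to_wait(&cur_trans->writer_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			/* wait indefinitely while others write, else just yield */
			schedule_timeout(cur_trans->num_writers > 1 ?
					 MAX_SCHEDULE_TIMEOUT : 1);
			finish_wait(&cur_trans->writer_wait, &wait);
		} while (cur_trans->num_writers > 1 ||
			 cur_trans->num_joined != joined);
	}

The two back-to-back put_transaction() calls near the end of the commit drop two references at once: the handle's own, and the one that fs_info->running_transaction held before it was cleared earlier in the commit.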
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 000000000000..ea292117f882 --- /dev/null +++ b/fs/btrfs/transaction.h | |||
@@ -0,0 +1,106 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_TRANSACTION__ | ||
20 | #define __BTRFS_TRANSACTION__ | ||
21 | #include "btrfs_inode.h" | ||
22 | |||
23 | struct btrfs_transaction { | ||
24 | u64 transid; | ||
25 | unsigned long num_writers; | ||
26 | unsigned long num_joined; | ||
27 | int in_commit; | ||
28 | int use_count; | ||
29 | int commit_done; | ||
30 | int blocked; | ||
31 | struct list_head list; | ||
32 | struct extent_io_tree dirty_pages; | ||
33 | unsigned long start_time; | ||
34 | wait_queue_head_t writer_wait; | ||
35 | wait_queue_head_t commit_wait; | ||
36 | struct list_head pending_snapshots; | ||
37 | }; | ||
38 | |||
39 | struct btrfs_trans_handle { | ||
40 | u64 transid; | ||
41 | unsigned long blocks_reserved; | ||
42 | unsigned long blocks_used; | ||
43 | struct btrfs_transaction *transaction; | ||
44 | u64 block_group; | ||
45 | u64 alloc_exclude_start; | ||
46 | u64 alloc_exclude_nr; | ||
47 | }; | ||
48 | |||
49 | struct btrfs_pending_snapshot { | ||
50 | struct dentry *dentry; | ||
51 | struct btrfs_root *root; | ||
52 | char *name; | ||
53 | struct btrfs_key root_key; | ||
54 | struct list_head list; | ||
55 | }; | ||
56 | |||
57 | struct btrfs_dirty_root { | ||
58 | struct list_head list; | ||
59 | struct btrfs_root *root; | ||
60 | struct btrfs_root *latest_root; | ||
61 | }; | ||
62 | |||
63 | static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, | ||
64 | struct inode *inode) | ||
65 | { | ||
66 | trans->block_group = BTRFS_I(inode)->block_group; | ||
67 | } | ||
68 | |||
69 | static inline void btrfs_update_inode_block_group( | ||
70 | struct btrfs_trans_handle *trans, | ||
71 | struct inode *inode) | ||
72 | { | ||
73 | BTRFS_I(inode)->block_group = trans->block_group; | ||
74 | } | ||
75 | |||
76 | static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, | ||
77 | struct inode *inode) | ||
78 | { | ||
79 | BTRFS_I(inode)->last_trans = trans->transaction->transid; | ||
80 | } | ||
81 | |||
82 | int btrfs_end_transaction(struct btrfs_trans_handle *trans, | ||
83 | struct btrfs_root *root); | ||
84 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, | ||
85 | int num_blocks); | ||
86 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, | ||
87 | int num_blocks); | ||
88 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, | ||
89 | int num_blocks); | ||
90 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | ||
91 | struct btrfs_root *root); | ||
92 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | ||
93 | struct btrfs_root *root); | ||
94 | |||
95 | int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); | ||
96 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); | ||
97 | int btrfs_clean_old_snapshots(struct btrfs_root *root); | ||
98 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | ||
99 | struct btrfs_root *root); | ||
100 | int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, | ||
101 | struct btrfs_root *root); | ||
102 | void btrfs_throttle(struct btrfs_root *root); | ||
103 | int btrfs_record_root_in_trans(struct btrfs_root *root); | ||
104 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | ||
105 | struct extent_io_tree *dirty_pages); | ||
106 | #endif | ||
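[editor's note] For orientation, a handle from this API pins the running transaction (it bumps num_writers) until it is released, and most callers follow a start/modify/end pattern. A minimal sketch of such a caller (illustrative only; example_update() and its error handling are not from the kernel source):

	/* Illustrative caller of the API declared above; not kernel code. */
	static int example_update(struct btrfs_root *root, struct inode *inode)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 1);  /* reserve 1 block */
		btrfs_set_trans_block_group(trans, inode); /* allocation hint */
		/* ... modify the btree under this handle ... */
		return btrfs_end_transaction(trans, root); /* drop writer ref */
	}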
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 000000000000..3e8358c36165 --- /dev/null +++ b/fs/btrfs/tree-defrag.c | |||
@@ -0,0 +1,147 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include "ctree.h" | ||
21 | #include "disk-io.h" | ||
22 | #include "print-tree.h" | ||
23 | #include "transaction.h" | ||
24 | #include "locking.h" | ||
25 | |||
26 | /* defrag all the leaves in a given btree. If cache_only == 1, don't read | ||
27 | * things from disk, otherwise read all the leaves and try to get key order to | ||
28 | * better reflect disk order | ||
29 | */ | ||
30 | |||
31 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | ||
32 | struct btrfs_root *root, int cache_only) | ||
33 | { | ||
34 | struct btrfs_path *path = NULL; | ||
35 | struct btrfs_key key; | ||
36 | int ret = 0; | ||
37 | int wret; | ||
38 | int level; | ||
39 | int orig_level; | ||
40 | int is_extent = 0; | ||
41 | int next_key_ret = 0; | ||
42 | u64 last_ret = 0; | ||
43 | u64 min_trans = 0; | ||
44 | |||
45 | if (cache_only) | ||
46 | goto out; | ||
47 | |||
48 | if (root->fs_info->extent_root == root) { | ||
49 | /* | ||
50 | * there's recursion here right now in the tree locking, so | ||
51 | * we can't defrag the extent root without deadlocking | ||
52 | */ | ||
53 | goto out; | ||
54 | } | ||
55 | |||
56 | if (root->ref_cows == 0 && !is_extent) | ||
57 | goto out; | ||
58 | |||
59 | if (btrfs_test_opt(root, SSD)) | ||
60 | goto out; | ||
61 | |||
62 | path = btrfs_alloc_path(); | ||
63 | if (!path) | ||
64 | return -ENOMEM; | ||
65 | |||
66 | level = btrfs_header_level(root->node); | ||
67 | orig_level = level; | ||
68 | |||
69 | if (level == 0) | ||
70 | goto out; | ||
71 | |||
72 | if (root->defrag_progress.objectid == 0) { | ||
73 | struct extent_buffer *root_node; | ||
74 | u32 nritems; | ||
75 | |||
76 | root_node = btrfs_lock_root_node(root); | ||
77 | nritems = btrfs_header_nritems(root_node); | ||
78 | root->defrag_max.objectid = 0; | ||
79 | /* from above we know this is not a leaf */ | ||
80 | btrfs_node_key_to_cpu(root_node, &root->defrag_max, | ||
81 | nritems - 1); | ||
82 | btrfs_tree_unlock(root_node); | ||
83 | free_extent_buffer(root_node); | ||
84 | memset(&key, 0, sizeof(key)); | ||
85 | } else { | ||
86 | memcpy(&key, &root->defrag_progress, sizeof(key)); | ||
87 | } | ||
88 | |||
89 | path->keep_locks = 1; | ||
90 | if (cache_only) | ||
91 | min_trans = root->defrag_trans_start; | ||
92 | |||
93 | ret = btrfs_search_forward(root, &key, NULL, path, | ||
94 | cache_only, min_trans); | ||
95 | if (ret < 0) | ||
96 | goto out; | ||
97 | if (ret > 0) { | ||
98 | ret = 0; | ||
99 | goto out; | ||
100 | } | ||
101 | btrfs_release_path(root, path); | ||
102 | wret = btrfs_search_slot(trans, root, &key, path, 0, 1); | ||
103 | |||
104 | if (wret < 0) { | ||
105 | ret = wret; | ||
106 | goto out; | ||
107 | } | ||
108 | if (!path->nodes[1]) { | ||
109 | ret = 0; | ||
110 | goto out; | ||
111 | } | ||
112 | path->slots[1] = btrfs_header_nritems(path->nodes[1]); | ||
113 | next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, | ||
114 | min_trans); | ||
115 | ret = btrfs_realloc_node(trans, root, | ||
116 | path->nodes[1], 0, | ||
117 | cache_only, &last_ret, | ||
118 | &root->defrag_progress); | ||
119 | WARN_ON(ret && ret != -EAGAIN); | ||
120 | if (next_key_ret == 0) { | ||
121 | memcpy(&root->defrag_progress, &key, sizeof(key)); | ||
122 | ret = -EAGAIN; | ||
123 | } | ||
124 | |||
125 | btrfs_release_path(root, path); | ||
126 | if (is_extent) | ||
127 | btrfs_extent_post_op(trans, root); | ||
128 | out: | ||
129 | if (path) | ||
130 | btrfs_free_path(path); | ||
131 | if (ret == -EAGAIN) { | ||
132 | if (root->defrag_max.objectid > root->defrag_progress.objectid) | ||
133 | goto done; | ||
134 | if (root->defrag_max.type > root->defrag_progress.type) | ||
135 | goto done; | ||
136 | if (root->defrag_max.offset > root->defrag_progress.offset) | ||
137 | goto done; | ||
138 | ret = 0; | ||
139 | } | ||
140 | done: | ||
141 | if (ret != -EAGAIN) { | ||
142 | memset(&root->defrag_progress, 0, | ||
143 | sizeof(root->defrag_progress)); | ||
144 | root->defrag_trans_start = trans->transid; | ||
145 | } | ||
146 | return ret; | ||
147 | } | ||
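[editor's note] The -EAGAIN exit above compares defrag_progress to defrag_max one field at a time; the order being approximated is the usual btrfs key order of objectid, then type, then offset. Written out as a stand-alone comparison (illustrative; the kernel's own helper for this ordering is btrfs_comp_cpu_keys()):

	/* Lexicographic btrfs key order: objectid, then type, then offset.
	 * Illustrative sketch of the ordering the checks above rely on. */
	static int comp_keys(const struct btrfs_key *a, const struct btrfs_key *b)
	{
		if (a->objectid != b->objectid)
			return a->objectid < b->objectid ? -1 : 1;
		if (a->type != b->type)
			return a->type < b->type ? -1 : 1;
		if (a->offset != b->offset)
			return a->offset < b->offset ? -1 : 1;
		return 0;
	}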
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 000000000000..d81cda2e077c --- /dev/null +++ b/fs/btrfs/tree-log.c | |||
@@ -0,0 +1,2898 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include "ctree.h" | ||
21 | #include "transaction.h" | ||
22 | #include "disk-io.h" | ||
23 | #include "locking.h" | ||
24 | #include "print-tree.h" | ||
25 | #include "compat.h" | ||
26 | #include "tree-log.h" | ||
27 | |||
28 | /* magic values for the inode_only field in btrfs_log_inode: | ||
29 | * | ||
30 | * LOG_INODE_ALL means to log everything | ||
31 | * LOG_INODE_EXISTS means to log just enough to recreate the inode | ||
32 | * during log replay | ||
33 | */ | ||
34 | #define LOG_INODE_ALL 0 | ||
35 | #define LOG_INODE_EXISTS 1 | ||
36 | |||
37 | /* | ||
38 | * stages for the tree walking. The first | ||
39 | * stage (0) is to only pin down the blocks we find | ||
40 | * the second stage (1) is to make sure that all the inodes | ||
41 | * we find in the log are created in the subvolume. | ||
42 | * | ||
43 | * The last stage is to deal with directories and links and extents | ||
44 | * and all the other fun semantics | ||
45 | */ | ||
46 | #define LOG_WALK_PIN_ONLY 0 | ||
47 | #define LOG_WALK_REPLAY_INODES 1 | ||
48 | #define LOG_WALK_REPLAY_ALL 2 | ||
49 | |||
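[editor's note] Recovery drives one full walk of the log per stage, in the order listed above. A hypothetical sketch of that driver shape (walk_one_log() is a stand-in; the real entry point is btrfs_recover_log_trees(), later in this file):

	/* Hypothetical driver shape: one full walk of the log per stage. */
	static void replay_all_stages(void)
	{
		int stage;

		for (stage = LOG_WALK_PIN_ONLY;
		     stage <= LOG_WALK_REPLAY_ALL; stage++)
			walk_one_log(stage);	/* stand-in for the per-stage walk */
	}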
50 | static int __btrfs_log_inode(struct btrfs_trans_handle *trans, | ||
51 | struct btrfs_root *root, struct inode *inode, | ||
52 | int inode_only); | ||
53 | static int link_to_fixup_dir(struct btrfs_trans_handle *trans, | ||
54 | struct btrfs_root *root, | ||
55 | struct btrfs_path *path, u64 objectid); | ||
56 | |||
57 | /* | ||
58 | * tree logging is a special write ahead log used to make sure that | ||
59 | * fsyncs and O_SYNCs can happen without doing full tree commits. | ||
60 | * | ||
61 | * Full tree commits are expensive because they require commonly | ||
62 | * modified blocks to be recowed, creating many dirty pages in the | ||
63 | * extent tree and a 4x-6x higher write load than ext3. | ||
64 | * | ||
65 | * Instead of doing a tree commit on every fsync, we use the | ||
66 | * key ranges and transaction ids to find items for a given file or directory | ||
67 | * that have changed in this transaction. Those items are copied into | ||
68 | * a special tree (one per subvolume root), that tree is written to disk | ||
69 | * and then the fsync is considered complete. | ||
70 | * | ||
71 | * After a crash, items are copied out of the log-tree back into the | ||
72 | * subvolume tree. Any file data extents found are recorded in the extent | ||
73 | * allocation tree, and the log-tree freed. | ||
74 | * | ||
75 | * The log tree is read three times, once to pin down all the extents it is | ||
76 | * using in ram, once to create all the inodes logged in the tree | ||
77 | * and once to do all the other items. | ||
78 | */ | ||
79 | |||
80 | /* | ||
81 | * btrfs_add_log_tree adds a new per-subvolume log tree into the | ||
82 | * tree of log tree roots. This must be called with a tree log transaction | ||
83 | * running (see start_log_trans). | ||
84 | */ | ||
85 | static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, | ||
86 | struct btrfs_root *root) | ||
87 | { | ||
88 | struct btrfs_key key; | ||
89 | struct btrfs_root_item root_item; | ||
90 | struct btrfs_inode_item *inode_item; | ||
91 | struct extent_buffer *leaf; | ||
92 | struct btrfs_root *new_root = root; | ||
93 | int ret; | ||
94 | u64 objectid = root->root_key.objectid; | ||
95 | |||
96 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | ||
97 | BTRFS_TREE_LOG_OBJECTID, | ||
98 | trans->transid, 0, 0, 0); | ||
99 | if (IS_ERR(leaf)) { | ||
100 | ret = PTR_ERR(leaf); | ||
101 | return ret; | ||
102 | } | ||
103 | |||
104 | btrfs_set_header_nritems(leaf, 0); | ||
105 | btrfs_set_header_level(leaf, 0); | ||
106 | btrfs_set_header_bytenr(leaf, leaf->start); | ||
107 | btrfs_set_header_generation(leaf, trans->transid); | ||
108 | btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); | ||
109 | |||
110 | write_extent_buffer(leaf, root->fs_info->fsid, | ||
111 | (unsigned long)btrfs_header_fsid(leaf), | ||
112 | BTRFS_FSID_SIZE); | ||
113 | btrfs_mark_buffer_dirty(leaf); | ||
114 | |||
115 | inode_item = &root_item.inode; | ||
116 | memset(inode_item, 0, sizeof(*inode_item)); | ||
117 | inode_item->generation = cpu_to_le64(1); | ||
118 | inode_item->size = cpu_to_le64(3); | ||
119 | inode_item->nlink = cpu_to_le32(1); | ||
120 | inode_item->nbytes = cpu_to_le64(root->leafsize); | ||
121 | inode_item->mode = cpu_to_le32(S_IFDIR | 0755); | ||
122 | |||
123 | btrfs_set_root_bytenr(&root_item, leaf->start); | ||
124 | btrfs_set_root_generation(&root_item, trans->transid); | ||
125 | btrfs_set_root_level(&root_item, 0); | ||
126 | btrfs_set_root_refs(&root_item, 0); | ||
127 | btrfs_set_root_used(&root_item, 0); | ||
128 | |||
129 | memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); | ||
130 | root_item.drop_level = 0; | ||
131 | |||
132 | btrfs_tree_unlock(leaf); | ||
133 | free_extent_buffer(leaf); | ||
134 | leaf = NULL; | ||
135 | |||
136 | btrfs_set_root_dirid(&root_item, 0); | ||
137 | |||
138 | key.objectid = BTRFS_TREE_LOG_OBJECTID; | ||
139 | key.offset = objectid; | ||
140 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
141 | ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, | ||
142 | &root_item); | ||
143 | if (ret) | ||
144 | goto fail; | ||
145 | |||
146 | new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, | ||
147 | &key); | ||
148 | BUG_ON(!new_root); | ||
149 | |||
150 | WARN_ON(root->log_root); | ||
151 | root->log_root = new_root; | ||
152 | |||
153 | /* | ||
154 | * log trees do not get reference counted because they go away | ||
155 | * before a real commit is actually done. They do store pointers | ||
156 | * to file data extents, and those reference counts still get | ||
157 | * updated (along with back refs to the log tree). | ||
158 | */ | ||
159 | new_root->ref_cows = 0; | ||
160 | new_root->last_trans = trans->transid; | ||
161 | |||
162 | /* | ||
163 | * we need to make sure the root block for this new tree | ||
164 | * is marked as dirty in the dirty_log_pages tree. This | ||
165 | * is how it gets flushed down to disk at tree log commit time. | ||
166 | * | ||
167 | * the tree logging mutex keeps others from coming in and changing | ||
168 | * the new_root->node, so we can safely access it here | ||
169 | */ | ||
170 | set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, | ||
171 | new_root->node->start + new_root->node->len - 1, | ||
172 | GFP_NOFS); | ||
173 | |||
174 | fail: | ||
175 | return ret; | ||
176 | } | ||
177 | |||
178 | /* | ||
179 | * start a sub transaction and set up the log tree. | ||
180 | * this increments the log tree writer count to make the people | ||
181 | * syncing the tree wait for us to finish | ||
182 | */ | ||
183 | static int start_log_trans(struct btrfs_trans_handle *trans, | ||
184 | struct btrfs_root *root) | ||
185 | { | ||
186 | int ret; | ||
187 | mutex_lock(&root->fs_info->tree_log_mutex); | ||
188 | if (!root->fs_info->log_root_tree) { | ||
189 | ret = btrfs_init_log_root_tree(trans, root->fs_info); | ||
190 | BUG_ON(ret); | ||
191 | } | ||
192 | if (!root->log_root) { | ||
193 | ret = btrfs_add_log_tree(trans, root); | ||
194 | BUG_ON(ret); | ||
195 | } | ||
196 | atomic_inc(&root->fs_info->tree_log_writers); | ||
197 | root->fs_info->tree_log_batch++; | ||
198 | mutex_unlock(&root->fs_info->tree_log_mutex); | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * returns 0 if there was a log transaction running and we were able | ||
204 | * to join, or returns -ENOENT if there were no transactions | ||
205 | * in progress | ||
206 | */ | ||
207 | static int join_running_log_trans(struct btrfs_root *root) | ||
208 | { | ||
209 | int ret = -ENOENT; | ||
210 | |||
211 | smp_mb(); | ||
212 | if (!root->log_root) | ||
213 | return -ENOENT; | ||
214 | |||
215 | mutex_lock(&root->fs_info->tree_log_mutex); | ||
216 | if (root->log_root) { | ||
217 | ret = 0; | ||
218 | atomic_inc(&root->fs_info->tree_log_writers); | ||
219 | root->fs_info->tree_log_batch++; | ||
220 | } | ||
221 | mutex_unlock(&root->fs_info->tree_log_mutex); | ||
222 | return ret; | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * indicate we're done making changes to the log tree | ||
227 | * and wake up anyone waiting to do a sync | ||
228 | */ | ||
229 | static int end_log_trans(struct btrfs_root *root) | ||
230 | { | ||
231 | atomic_dec(&root->fs_info->tree_log_writers); | ||
232 | smp_mb(); | ||
233 | if (waitqueue_active(&root->fs_info->tree_log_wait)) | ||
234 | wake_up(&root->fs_info->tree_log_wait); | ||
235 | return 0; | ||
236 | } | ||
237 | |||
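[editor's note] The counter these three helpers maintain is drained by the sync side: before the log is written, the syncer must wait for tree_log_writers to reach zero. A sketch of that drain, assuming the standard wait-queue idiom (the real wait lives in the sync path later in this file):

	/* Sketch of the drain side of the tree_log_writers count, using
	 * the standard wait-queue idiom; illustrative only. */
	static void wait_for_log_writers(struct btrfs_root *root)
	{
		DEFINE_WAIT(wait);

		while (atomic_read(&root->fs_info->tree_log_writers)) {
			prepare_to_wait(&root->fs_info->tree_log_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&root->fs_info->tree_log_writers))
				schedule();
			finish_wait(&root->fs_info->tree_log_wait, &wait);
		}
	}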
238 | |||
239 | /* | ||
240 | * the walk control struct is used to pass state down the chain when | ||
241 | * processing the log tree. The stage field tells us which part | ||
242 | * of the log tree processing we are currently doing. The others | ||
243 | * are state fields used for that specific part | ||
244 | */ | ||
245 | struct walk_control { | ||
246 | /* should we free the extent on disk when done? This is used | ||
247 | * at transaction commit time while freeing a log tree | ||
248 | */ | ||
249 | int free; | ||
250 | |||
251 | /* should we write out the extent buffer? This is used | ||
252 | * while flushing the log tree to disk during a sync | ||
253 | */ | ||
254 | int write; | ||
255 | |||
256 | /* should we wait for the extent buffer io to finish? Also used | ||
257 | * while flushing the log tree to disk for a sync | ||
258 | */ | ||
259 | int wait; | ||
260 | |||
261 | /* pin only walk, we record which extents on disk belong to the | ||
262 | * log trees | ||
263 | */ | ||
264 | int pin; | ||
265 | |||
266 | /* what stage of the replay code we're currently in */ | ||
267 | int stage; | ||
268 | |||
269 | /* the root we are currently replaying */ | ||
270 | struct btrfs_root *replay_dest; | ||
271 | |||
272 | /* the trans handle for the current replay */ | ||
273 | struct btrfs_trans_handle *trans; | ||
274 | |||
275 | /* the function that gets used to process blocks we find in the | ||
276 | * tree. Note the extent_buffer might not be up to date when it is | ||
277 | * passed in, and it must be checked or read if you need the data | ||
278 | * inside it | ||
279 | */ | ||
280 | int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, | ||
281 | struct walk_control *wc, u64 gen); | ||
282 | }; | ||
283 | |||
284 | /* | ||
285 | * process_func used to pin down extents, write them or wait on them | ||
286 | */ | ||
287 | static int process_one_buffer(struct btrfs_root *log, | ||
288 | struct extent_buffer *eb, | ||
289 | struct walk_control *wc, u64 gen) | ||
290 | { | ||
291 | if (wc->pin) { | ||
292 | mutex_lock(&log->fs_info->pinned_mutex); | ||
293 | btrfs_update_pinned_extents(log->fs_info->extent_root, | ||
294 | eb->start, eb->len, 1); | ||
295 | mutex_unlock(&log->fs_info->pinned_mutex); | ||
296 | } | ||
297 | |||
298 | if (btrfs_buffer_uptodate(eb, gen)) { | ||
299 | if (wc->write) | ||
300 | btrfs_write_tree_block(eb); | ||
301 | if (wc->wait) | ||
302 | btrfs_wait_tree_block_writeback(eb); | ||
303 | } | ||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * Item overwrite used by replay and tree logging. eb, slot and key all refer | ||
309 | * to the src data we are copying out. | ||
310 | * | ||
311 | * root is the tree we are copying into, and path is a scratch | ||
312 | * path for use in this function (it should be released on entry and | ||
313 | * will be released on exit). | ||
314 | * | ||
315 | * If the key is already in the destination tree the existing item is | ||
316 | * overwritten. If the existing item isn't big enough, it is extended. | ||
317 | * If it is too large, it is truncated. | ||
318 | * | ||
319 | * If the key isn't in the destination yet, a new item is inserted. | ||
320 | */ | ||
321 | static noinline int overwrite_item(struct btrfs_trans_handle *trans, | ||
322 | struct btrfs_root *root, | ||
323 | struct btrfs_path *path, | ||
324 | struct extent_buffer *eb, int slot, | ||
325 | struct btrfs_key *key) | ||
326 | { | ||
327 | int ret; | ||
328 | u32 item_size; | ||
329 | u64 saved_i_size = 0; | ||
330 | int save_old_i_size = 0; | ||
331 | unsigned long src_ptr; | ||
332 | unsigned long dst_ptr; | ||
333 | int overwrite_root = 0; | ||
334 | |||
335 | if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | ||
336 | overwrite_root = 1; | ||
337 | |||
338 | item_size = btrfs_item_size_nr(eb, slot); | ||
339 | src_ptr = btrfs_item_ptr_offset(eb, slot); | ||
340 | |||
341 | /* look for the key in the destination tree */ | ||
342 | ret = btrfs_search_slot(NULL, root, key, path, 0, 0); | ||
343 | if (ret == 0) { | ||
344 | char *src_copy; | ||
345 | char *dst_copy; | ||
346 | u32 dst_size = btrfs_item_size_nr(path->nodes[0], | ||
347 | path->slots[0]); | ||
348 | if (dst_size != item_size) | ||
349 | goto insert; | ||
350 | |||
351 | if (item_size == 0) { | ||
352 | btrfs_release_path(root, path); | ||
353 | return 0; | ||
354 | } | ||
355 | dst_copy = kmalloc(item_size, GFP_NOFS); | ||
356 | src_copy = kmalloc(item_size, GFP_NOFS); | ||
357 | |||
358 | read_extent_buffer(eb, src_copy, src_ptr, item_size); | ||
359 | |||
360 | dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); | ||
361 | read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, | ||
362 | item_size); | ||
363 | ret = memcmp(dst_copy, src_copy, item_size); | ||
364 | |||
365 | kfree(dst_copy); | ||
366 | kfree(src_copy); | ||
367 | /* | ||
368 | * they have the same contents, so just return; this saves | ||
369 | * us from cowing blocks in the destination tree and doing | ||
370 | * extra writes that may not have been done by a previous | ||
371 | * sync | ||
372 | */ | ||
373 | if (ret == 0) { | ||
374 | btrfs_release_path(root, path); | ||
375 | return 0; | ||
376 | } | ||
377 | |||
378 | } | ||
379 | insert: | ||
380 | btrfs_release_path(root, path); | ||
381 | /* try to insert the key into the destination tree */ | ||
382 | ret = btrfs_insert_empty_item(trans, root, path, | ||
383 | key, item_size); | ||
384 | |||
385 | /* make sure any existing item is the correct size */ | ||
386 | if (ret == -EEXIST) { | ||
387 | u32 found_size; | ||
388 | found_size = btrfs_item_size_nr(path->nodes[0], | ||
389 | path->slots[0]); | ||
390 | if (found_size > item_size) { | ||
391 | btrfs_truncate_item(trans, root, path, item_size, 1); | ||
392 | } else if (found_size < item_size) { | ||
393 | ret = btrfs_extend_item(trans, root, path, | ||
394 | item_size - found_size); | ||
395 | BUG_ON(ret); | ||
396 | } | ||
397 | } else if (ret) { | ||
398 | BUG(); | ||
399 | } | ||
400 | dst_ptr = btrfs_item_ptr_offset(path->nodes[0], | ||
401 | path->slots[0]); | ||
402 | |||
403 | /* don't overwrite an existing inode if the generation number | ||
404 | * was logged as zero. This is done when the tree logging code | ||
405 | * is just logging an inode to make sure it exists after recovery. | ||
406 | * | ||
407 | * Also, don't overwrite i_size on directories during replay. | ||
408 | * log replay inserts and removes directory items based on the | ||
409 | * state of the tree found in the subvolume, and i_size is modified | ||
410 | * as it goes | ||
411 | */ | ||
412 | if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { | ||
413 | struct btrfs_inode_item *src_item; | ||
414 | struct btrfs_inode_item *dst_item; | ||
415 | |||
416 | src_item = (struct btrfs_inode_item *)src_ptr; | ||
417 | dst_item = (struct btrfs_inode_item *)dst_ptr; | ||
418 | |||
419 | if (btrfs_inode_generation(eb, src_item) == 0) | ||
420 | goto no_copy; | ||
421 | |||
422 | if (overwrite_root && | ||
423 | S_ISDIR(btrfs_inode_mode(eb, src_item)) && | ||
424 | S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { | ||
425 | save_old_i_size = 1; | ||
426 | saved_i_size = btrfs_inode_size(path->nodes[0], | ||
427 | dst_item); | ||
428 | } | ||
429 | } | ||
430 | |||
431 | copy_extent_buffer(path->nodes[0], eb, dst_ptr, | ||
432 | src_ptr, item_size); | ||
433 | |||
434 | if (save_old_i_size) { | ||
435 | struct btrfs_inode_item *dst_item; | ||
436 | dst_item = (struct btrfs_inode_item *)dst_ptr; | ||
437 | btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); | ||
438 | } | ||
439 | |||
440 | /* make sure the generation is filled in */ | ||
441 | if (key->type == BTRFS_INODE_ITEM_KEY) { | ||
442 | struct btrfs_inode_item *dst_item; | ||
443 | dst_item = (struct btrfs_inode_item *)dst_ptr; | ||
444 | if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { | ||
445 | btrfs_set_inode_generation(path->nodes[0], dst_item, | ||
446 | trans->transid); | ||
447 | } | ||
448 | } | ||
449 | no_copy: | ||
450 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
451 | btrfs_release_path(root, path); | ||
452 | return 0; | ||
453 | } | ||
454 | |||
455 | /* | ||
456 | * simple helper to read an inode off the disk from a given root. | ||
457 | * This can only be called for subvolume roots and not for the log | ||
458 | */ | ||
459 | static noinline struct inode *read_one_inode(struct btrfs_root *root, | ||
460 | u64 objectid) | ||
461 | { | ||
462 | struct inode *inode; | ||
463 | inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); | ||
464 | if (inode->i_state & I_NEW) { | ||
465 | BTRFS_I(inode)->root = root; | ||
466 | BTRFS_I(inode)->location.objectid = objectid; | ||
467 | BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; | ||
468 | BTRFS_I(inode)->location.offset = 0; | ||
469 | btrfs_read_locked_inode(inode); | ||
470 | unlock_new_inode(inode); | ||
471 | |||
472 | } | ||
473 | if (is_bad_inode(inode)) { | ||
474 | iput(inode); | ||
475 | inode = NULL; | ||
476 | } | ||
477 | return inode; | ||
478 | } | ||
479 | |||
480 | /* replays a single extent in 'eb' at 'slot' with 'key' into the | ||
481 | * subvolume 'root'. path is released on entry and should be released | ||
482 | * on exit. | ||
483 | * | ||
484 | * extents in the log tree have not been allocated out of the extent | ||
485 | * tree yet. So, this completes the allocation, taking a reference | ||
486 | * as required if the extent already exists or creating a new extent | ||
487 | * if it isn't in the extent allocation tree yet. | ||
488 | * | ||
489 | * The extent is inserted into the file, dropping any existing extents | ||
490 | * from the file that overlap the new one. | ||
491 | */ | ||
492 | static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | ||
493 | struct btrfs_root *root, | ||
494 | struct btrfs_path *path, | ||
495 | struct extent_buffer *eb, int slot, | ||
496 | struct btrfs_key *key) | ||
497 | { | ||
498 | int found_type; | ||
499 | u64 mask = root->sectorsize - 1; | ||
500 | u64 extent_end; | ||
501 | u64 alloc_hint; | ||
502 | u64 start = key->offset; | ||
503 | u64 saved_nbytes; | ||
504 | struct btrfs_file_extent_item *item; | ||
505 | struct inode *inode = NULL; | ||
506 | unsigned long size; | ||
507 | int ret = 0; | ||
508 | |||
509 | item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); | ||
510 | found_type = btrfs_file_extent_type(eb, item); | ||
511 | |||
512 | if (found_type == BTRFS_FILE_EXTENT_REG || | ||
513 | found_type == BTRFS_FILE_EXTENT_PREALLOC) | ||
514 | extent_end = start + btrfs_file_extent_num_bytes(eb, item); | ||
515 | else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | ||
516 | size = btrfs_file_extent_inline_len(eb, item); | ||
517 | extent_end = (start + size + mask) & ~mask; | ||
518 | } else { | ||
519 | ret = 0; | ||
520 | goto out; | ||
521 | } | ||
522 | |||
523 | inode = read_one_inode(root, key->objectid); | ||
524 | if (!inode) { | ||
525 | ret = -EIO; | ||
526 | goto out; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * first check to see if we already have this extent in the | ||
531 | * file. This must be done before the btrfs_drop_extents run | ||
532 | * so we don't try to drop this extent. | ||
533 | */ | ||
534 | ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, | ||
535 | start, 0); | ||
536 | |||
537 | if (ret == 0 && | ||
538 | (found_type == BTRFS_FILE_EXTENT_REG || | ||
539 | found_type == BTRFS_FILE_EXTENT_PREALLOC)) { | ||
540 | struct btrfs_file_extent_item cmp1; | ||
541 | struct btrfs_file_extent_item cmp2; | ||
542 | struct btrfs_file_extent_item *existing; | ||
543 | struct extent_buffer *leaf; | ||
544 | |||
545 | leaf = path->nodes[0]; | ||
546 | existing = btrfs_item_ptr(leaf, path->slots[0], | ||
547 | struct btrfs_file_extent_item); | ||
548 | |||
549 | read_extent_buffer(eb, &cmp1, (unsigned long)item, | ||
550 | sizeof(cmp1)); | ||
551 | read_extent_buffer(leaf, &cmp2, (unsigned long)existing, | ||
552 | sizeof(cmp2)); | ||
553 | |||
554 | /* | ||
555 | * we already have a pointer to this exact extent, | ||
556 | * we don't have to do anything | ||
557 | */ | ||
558 | if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { | ||
559 | btrfs_release_path(root, path); | ||
560 | goto out; | ||
561 | } | ||
562 | } | ||
563 | btrfs_release_path(root, path); | ||
564 | |||
565 | saved_nbytes = inode_get_bytes(inode); | ||
566 | /* drop any overlapping extents */ | ||
567 | ret = btrfs_drop_extents(trans, root, inode, | ||
568 | start, extent_end, start, &alloc_hint); | ||
569 | BUG_ON(ret); | ||
570 | |||
571 | if (found_type == BTRFS_FILE_EXTENT_REG || | ||
572 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
573 | unsigned long dest_offset; | ||
574 | struct btrfs_key ins; | ||
575 | |||
576 | ret = btrfs_insert_empty_item(trans, root, path, key, | ||
577 | sizeof(*item)); | ||
578 | BUG_ON(ret); | ||
579 | dest_offset = btrfs_item_ptr_offset(path->nodes[0], | ||
580 | path->slots[0]); | ||
581 | copy_extent_buffer(path->nodes[0], eb, dest_offset, | ||
582 | (unsigned long)item, sizeof(*item)); | ||
583 | |||
584 | ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); | ||
585 | ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); | ||
586 | ins.type = BTRFS_EXTENT_ITEM_KEY; | ||
587 | |||
588 | if (ins.objectid > 0) { | ||
589 | u64 csum_start; | ||
590 | u64 csum_end; | ||
591 | LIST_HEAD(ordered_sums); | ||
592 | /* | ||
593 | * is this extent already allocated in the extent | ||
594 | * allocation tree? If so, just add a reference | ||
595 | */ | ||
596 | ret = btrfs_lookup_extent(root, ins.objectid, | ||
597 | ins.offset); | ||
598 | if (ret == 0) { | ||
599 | ret = btrfs_inc_extent_ref(trans, root, | ||
600 | ins.objectid, ins.offset, | ||
601 | path->nodes[0]->start, | ||
602 | root->root_key.objectid, | ||
603 | trans->transid, key->objectid); | ||
604 | } else { | ||
605 | /* | ||
606 | * insert the extent pointer in the extent | ||
607 | * allocation tree | ||
608 | */ | ||
609 | ret = btrfs_alloc_logged_extent(trans, root, | ||
610 | path->nodes[0]->start, | ||
611 | root->root_key.objectid, | ||
612 | trans->transid, key->objectid, | ||
613 | &ins); | ||
614 | BUG_ON(ret); | ||
615 | } | ||
616 | btrfs_release_path(root, path); | ||
617 | |||
618 | if (btrfs_file_extent_compression(eb, item)) { | ||
619 | csum_start = ins.objectid; | ||
620 | csum_end = csum_start + ins.offset; | ||
621 | } else { | ||
622 | csum_start = ins.objectid + | ||
623 | btrfs_file_extent_offset(eb, item); | ||
624 | csum_end = csum_start + | ||
625 | btrfs_file_extent_num_bytes(eb, item); | ||
626 | } | ||
627 | |||
628 | ret = btrfs_lookup_csums_range(root->log_root, | ||
629 | csum_start, csum_end - 1, | ||
630 | &ordered_sums); | ||
631 | BUG_ON(ret); | ||
632 | while (!list_empty(&ordered_sums)) { | ||
633 | struct btrfs_ordered_sum *sums; | ||
634 | sums = list_entry(ordered_sums.next, | ||
635 | struct btrfs_ordered_sum, | ||
636 | list); | ||
637 | ret = btrfs_csum_file_blocks(trans, | ||
638 | root->fs_info->csum_root, | ||
639 | sums); | ||
640 | BUG_ON(ret); | ||
641 | list_del(&sums->list); | ||
642 | kfree(sums); | ||
643 | } | ||
644 | } else { | ||
645 | btrfs_release_path(root, path); | ||
646 | } | ||
647 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | ||
648 | /* inline extents are easy, we just overwrite them */ | ||
649 | ret = overwrite_item(trans, root, path, eb, slot, key); | ||
650 | BUG_ON(ret); | ||
651 | } | ||
652 | |||
653 | inode_set_bytes(inode, saved_nbytes); | ||
654 | btrfs_update_inode(trans, root, inode); | ||
655 | out: | ||
656 | if (inode) | ||
657 | iput(inode); | ||
658 | return ret; | ||
659 | } | ||
660 | |||
661 | /* | ||
662 | * when cleaning up conflicts between the directory names in the | ||
663 | * subvolume, directory names in the log and directory names in the | ||
664 | * inode back references, we may have to unlink inodes from directories. | ||
665 | * | ||
666 | * This is a helper function to do the unlink of a specific directory | ||
667 | * item | ||
668 | */ | ||
669 | static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, | ||
670 | struct btrfs_root *root, | ||
671 | struct btrfs_path *path, | ||
672 | struct inode *dir, | ||
673 | struct btrfs_dir_item *di) | ||
674 | { | ||
675 | struct inode *inode; | ||
676 | char *name; | ||
677 | int name_len; | ||
678 | struct extent_buffer *leaf; | ||
679 | struct btrfs_key location; | ||
680 | int ret; | ||
681 | |||
682 | leaf = path->nodes[0]; | ||
683 | |||
684 | btrfs_dir_item_key_to_cpu(leaf, di, &location); | ||
685 | name_len = btrfs_dir_name_len(leaf, di); | ||
686 | name = kmalloc(name_len, GFP_NOFS); | ||
687 | read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); | ||
688 | btrfs_release_path(root, path); | ||
689 | |||
690 | inode = read_one_inode(root, location.objectid); | ||
691 | BUG_ON(!inode); | ||
692 | |||
693 | ret = link_to_fixup_dir(trans, root, path, location.objectid); | ||
694 | BUG_ON(ret); | ||
695 | ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); | ||
696 | BUG_ON(ret); | ||
697 | kfree(name); | ||
698 | |||
699 | iput(inode); | ||
700 | return ret; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * helper function to see if a given name and sequence number found | ||
705 | * in an inode back reference are already in a directory and correctly | ||
706 | * point to this inode | ||
707 | */ | ||
708 | static noinline int inode_in_dir(struct btrfs_root *root, | ||
709 | struct btrfs_path *path, | ||
710 | u64 dirid, u64 objectid, u64 index, | ||
711 | const char *name, int name_len) | ||
712 | { | ||
713 | struct btrfs_dir_item *di; | ||
714 | struct btrfs_key location; | ||
715 | int match = 0; | ||
716 | |||
717 | di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, | ||
718 | index, name, name_len, 0); | ||
719 | if (di && !IS_ERR(di)) { | ||
720 | btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); | ||
721 | if (location.objectid != objectid) | ||
722 | goto out; | ||
723 | } else | ||
724 | goto out; | ||
725 | btrfs_release_path(root, path); | ||
726 | |||
727 | di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); | ||
728 | if (di && !IS_ERR(di)) { | ||
729 | btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); | ||
730 | if (location.objectid != objectid) | ||
731 | goto out; | ||
732 | } else | ||
733 | goto out; | ||
734 | match = 1; | ||
735 | out: | ||
736 | btrfs_release_path(root, path); | ||
737 | return match; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * helper function to check a log tree for a named back reference in | ||
742 | * an inode. This is used to decide if a back reference that is | ||
743 | * found in the subvolume conflicts with what we find in the log. | ||
744 | * | ||
745 | * inode backreferences may have multiple refs in a single item, | ||
746 | * during replay we process one reference at a time, and we don't | ||
747 | * want to delete valid links to a file from the subvolume if that | ||
748 | * link is also in the log. | ||
749 | */ | ||
750 | static noinline int backref_in_log(struct btrfs_root *log, | ||
751 | struct btrfs_key *key, | ||
752 | char *name, int namelen) | ||
753 | { | ||
754 | struct btrfs_path *path; | ||
755 | struct btrfs_inode_ref *ref; | ||
756 | unsigned long ptr; | ||
757 | unsigned long ptr_end; | ||
758 | unsigned long name_ptr; | ||
759 | int found_name_len; | ||
760 | int item_size; | ||
761 | int ret; | ||
762 | int match = 0; | ||
763 | |||
764 | path = btrfs_alloc_path(); | ||
765 | ret = btrfs_search_slot(NULL, log, key, path, 0, 0); | ||
766 | if (ret != 0) | ||
767 | goto out; | ||
768 | |||
769 | item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); | ||
770 | ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); | ||
771 | ptr_end = ptr + item_size; | ||
772 | while (ptr < ptr_end) { | ||
773 | ref = (struct btrfs_inode_ref *)ptr; | ||
774 | found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); | ||
775 | if (found_name_len == namelen) { | ||
776 | name_ptr = (unsigned long)(ref + 1); | ||
777 | ret = memcmp_extent_buffer(path->nodes[0], name, | ||
778 | name_ptr, namelen); | ||
779 | if (ret == 0) { | ||
780 | match = 1; | ||
781 | goto out; | ||
782 | } | ||
783 | } | ||
784 | ptr = (unsigned long)(ref + 1) + found_name_len; | ||
785 | } | ||
786 | out: | ||
787 | btrfs_free_path(path); | ||
788 | return match; | ||
789 | } | ||
790 | |||
791 | |||
792 | /* | ||
793 | * replay one inode back reference item found in the log tree. | ||
794 | * eb, slot and key refer to the buffer and key found in the log tree. | ||
795 | * root is the destination we are replaying into, and path is for temp | ||
796 | * use by this function. (it should be released on return). | ||
797 | */ | ||
798 | static noinline int add_inode_ref(struct btrfs_trans_handle *trans, | ||
799 | struct btrfs_root *root, | ||
800 | struct btrfs_root *log, | ||
801 | struct btrfs_path *path, | ||
802 | struct extent_buffer *eb, int slot, | ||
803 | struct btrfs_key *key) | ||
804 | { | ||
805 | struct inode *dir; | ||
806 | int ret; | ||
807 | struct btrfs_key location; | ||
808 | struct btrfs_inode_ref *ref; | ||
809 | struct btrfs_dir_item *di; | ||
810 | struct inode *inode; | ||
811 | char *name; | ||
812 | int namelen; | ||
813 | unsigned long ref_ptr; | ||
814 | unsigned long ref_end; | ||
815 | |||
816 | location.objectid = key->objectid; | ||
817 | location.type = BTRFS_INODE_ITEM_KEY; | ||
818 | location.offset = 0; | ||
819 | |||
820 | /* | ||
821 | * it is possible that we didn't log all the parent directories | ||
822 | * for a given inode. If we don't find the dir, just don't | ||
823 | * copy the back ref in. The link count fixup code will take | ||
824 | * care of the rest | ||
825 | */ | ||
826 | dir = read_one_inode(root, key->offset); | ||
827 | if (!dir) | ||
828 | return -ENOENT; | ||
829 | |||
830 | inode = read_one_inode(root, key->objectid); | ||
831 | BUG_ON(!inode); | ||
832 | |||
833 | ref_ptr = btrfs_item_ptr_offset(eb, slot); | ||
834 | ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); | ||
835 | |||
836 | again: | ||
837 | ref = (struct btrfs_inode_ref *)ref_ptr; | ||
838 | |||
839 | namelen = btrfs_inode_ref_name_len(eb, ref); | ||
840 | name = kmalloc(namelen, GFP_NOFS); | ||
841 | BUG_ON(!name); | ||
842 | |||
843 | read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); | ||
844 | |||
845 | /* if we already have a perfect match, we're done */ | ||
846 | if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, | ||
847 | btrfs_inode_ref_index(eb, ref), | ||
848 | name, namelen)) { | ||
849 | goto out; | ||
850 | } | ||
851 | |||
852 | /* | ||
853 | * look for a conflicting back reference in the metadata. | ||
854 | * if we find one we have to unlink that name of the file | ||
855 | * before we add our new link. Later on, we overwrite any | ||
856 | * existing back reference, and we don't want to create | ||
857 | * dangling pointers in the directory. | ||
858 | */ | ||
859 | conflict_again: | ||
860 | ret = btrfs_search_slot(NULL, root, key, path, 0, 0); | ||
861 | if (ret == 0) { | ||
862 | char *victim_name; | ||
863 | int victim_name_len; | ||
864 | struct btrfs_inode_ref *victim_ref; | ||
865 | unsigned long ptr; | ||
866 | unsigned long ptr_end; | ||
867 | struct extent_buffer *leaf = path->nodes[0]; | ||
868 | |||
869 | /* are we trying to overwrite a back ref for the root directory? | ||
870 | * if so, just jump out, we're done | ||
871 | */ | ||
872 | if (key->objectid == key->offset) | ||
873 | goto out_nowrite; | ||
874 | |||
875 | /* check all the names in this back reference to see | ||
876 | * if they are in the log. if so, we allow them to stay | ||
877 | * otherwise they must be unlinked as a conflict | ||
878 | */ | ||
879 | ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); | ||
880 | ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); | ||
881 | while (ptr < ptr_end) { | ||
882 | victim_ref = (struct btrfs_inode_ref *)ptr; | ||
883 | victim_name_len = btrfs_inode_ref_name_len(leaf, | ||
884 | victim_ref); | ||
885 | victim_name = kmalloc(victim_name_len, GFP_NOFS); | ||
886 | BUG_ON(!victim_name); | ||
887 | |||
888 | read_extent_buffer(leaf, victim_name, | ||
889 | (unsigned long)(victim_ref + 1), | ||
890 | victim_name_len); | ||
891 | |||
892 | if (!backref_in_log(log, key, victim_name, | ||
893 | victim_name_len)) { | ||
894 | btrfs_inc_nlink(inode); | ||
895 | btrfs_release_path(root, path); | ||
896 | ret = btrfs_unlink_inode(trans, root, dir, | ||
897 | inode, victim_name, | ||
898 | victim_name_len); | ||
899 | kfree(victim_name); | ||
900 | btrfs_release_path(root, path); | ||
901 | goto conflict_again; | ||
902 | } | ||
903 | kfree(victim_name); | ||
904 | ptr = (unsigned long)(victim_ref + 1) + victim_name_len; | ||
905 | } | ||
906 | BUG_ON(ret); | ||
907 | } | ||
908 | btrfs_release_path(root, path); | ||
909 | |||
910 | /* look for a conflicting sequence number */ | ||
911 | di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, | ||
912 | btrfs_inode_ref_index(eb, ref), | ||
913 | name, namelen, 0); | ||
914 | if (di && !IS_ERR(di)) { | ||
915 | ret = drop_one_dir_item(trans, root, path, dir, di); | ||
916 | BUG_ON(ret); | ||
917 | } | ||
918 | btrfs_release_path(root, path); | ||
919 | |||
920 | |||
921 | /* look for a conflicting name */ | ||
922 | di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, | ||
923 | name, namelen, 0); | ||
924 | if (di && !IS_ERR(di)) { | ||
925 | ret = drop_one_dir_item(trans, root, path, dir, di); | ||
926 | BUG_ON(ret); | ||
927 | } | ||
928 | btrfs_release_path(root, path); | ||
929 | |||
930 | /* insert our name */ | ||
931 | ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, | ||
932 | btrfs_inode_ref_index(eb, ref)); | ||
933 | BUG_ON(ret); | ||
934 | |||
935 | btrfs_update_inode(trans, root, inode); | ||
936 | |||
937 | out: | ||
938 | ref_ptr = (unsigned long)(ref + 1) + namelen; | ||
939 | kfree(name); | ||
940 | if (ref_ptr < ref_end) | ||
941 | goto again; | ||
942 | |||
943 | /* finally write the back reference in the inode */ | ||
944 | ret = overwrite_item(trans, root, path, eb, slot, key); | ||
945 | BUG_ON(ret); | ||
946 | |||
947 | out_nowrite: | ||
948 | btrfs_release_path(root, path); | ||
949 | iput(dir); | ||
950 | iput(inode); | ||
951 | return 0; | ||
952 | } | ||
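
/*
 * Illustrative sketch (not kernel code): the conflict policy above,
 * reduced to plain string sets.  An existing name that is not present
 * in the logged set is a conflict and is dropped before the logged
 * names are (re)added.  All names here (in_set, drop_conflicts) are
 * hypothetical.
 */
#include <string.h>

static int in_set(const char *const *set, int n, const char *name)
{
	for (int i = 0; i < n; i++)
		if (strcmp(set[i], name) == 0)
			return 1;
	return 0;
}

/* compact existing[] in place, keeping only non-conflicting names;
 * returns the number of names kept */
static int drop_conflicts(const char **existing, int n_existing,
			  const char *const *logged, int n_logged)
{
	int kept = 0;

	for (int i = 0; i < n_existing; i++) {
		if (in_set(logged, n_logged, existing[i]))
			existing[kept++] = existing[i];	/* keep: in the log */
		/* else: conflicting name, the replay code unlinks it */
	}
	return kept;
}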
953 | |||
954 | /* | ||
955 | * There are a few corners where the link count of the file can't | ||
956 | * be properly maintained during replay. So, instead of adding | ||
957 | * lots of complexity to the log code, we just scan the backrefs | ||
958 | * for any file that has been through replay. | ||
959 | * | ||
960 | * The scan will update the link count on the inode to reflect the | ||
961 | * number of back refs found. If it goes down to zero, the iput | ||
962 | * will free the inode. | ||
963 | */ | ||
964 | static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, | ||
965 | struct btrfs_root *root, | ||
966 | struct inode *inode) | ||
967 | { | ||
968 | struct btrfs_path *path; | ||
969 | int ret; | ||
970 | struct btrfs_key key; | ||
971 | u64 nlink = 0; | ||
972 | unsigned long ptr; | ||
973 | unsigned long ptr_end; | ||
974 | int name_len; | ||
975 | |||
976 | key.objectid = inode->i_ino; | ||
977 | key.type = BTRFS_INODE_REF_KEY; | ||
978 | key.offset = (u64)-1; | ||
979 | |||
980 | path = btrfs_alloc_path(); | ||
981 | |||
982 | while (1) { | ||
983 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
984 | if (ret < 0) | ||
985 | break; | ||
986 | if (ret > 0) { | ||
987 | if (path->slots[0] == 0) | ||
988 | break; | ||
989 | path->slots[0]--; | ||
990 | } | ||
991 | btrfs_item_key_to_cpu(path->nodes[0], &key, | ||
992 | path->slots[0]); | ||
993 | if (key.objectid != inode->i_ino || | ||
994 | key.type != BTRFS_INODE_REF_KEY) | ||
995 | break; | ||
996 | ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); | ||
997 | ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], | ||
998 | path->slots[0]); | ||
999 | while (ptr < ptr_end) { | ||
1000 | struct btrfs_inode_ref *ref; | ||
1001 | |||
1002 | ref = (struct btrfs_inode_ref *)ptr; | ||
1003 | name_len = btrfs_inode_ref_name_len(path->nodes[0], | ||
1004 | ref); | ||
1005 | ptr = (unsigned long)(ref + 1) + name_len; | ||
1006 | nlink++; | ||
1007 | } | ||
1008 | |||
1009 | if (key.offset == 0) | ||
1010 | break; | ||
1011 | key.offset--; | ||
1012 | btrfs_release_path(root, path); | ||
1013 | } | ||
1014 | btrfs_free_path(path); | ||
1015 | if (nlink != inode->i_nlink) { | ||
1016 | inode->i_nlink = nlink; | ||
1017 | btrfs_update_inode(trans, root, inode); | ||
1018 | } | ||
1019 | BTRFS_I(inode)->index_cnt = (u64)-1; | ||
1020 | |||
1021 | return 0; | ||
1022 | } | ||
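
/*
 * Illustrative sketch (not kernel code): the nlink computation above
 * walks packed, variable-length back reference records inside an item.
 * The same walk over a plain byte buffer, with a hypothetical record
 * header (toy_inode_ref) standing in for btrfs_inode_ref:
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_inode_ref {
	uint16_t name_len;		/* length of the name that follows */
};

static unsigned int count_refs(const unsigned char *item, size_t size)
{
	const unsigned char *ptr = item;
	const unsigned char *end = item + size;
	unsigned int nlink = 0;

	while (ptr < end) {
		struct toy_inode_ref ref;

		memcpy(&ref, ptr, sizeof(ref));	   /* header may be unaligned */
		ptr += sizeof(ref) + ref.name_len; /* skip header and name */
		nlink++;
	}
	return nlink;
}

int main(void)
{
	/* two packed records: "a" then "bc" -> nlink is 2 */
	unsigned char item[2 * sizeof(struct toy_inode_ref) + 3];
	struct toy_inode_ref r1 = { 1 }, r2 = { 2 };
	unsigned char *p = item;

	memcpy(p, &r1, sizeof(r1)); p += sizeof(r1);
	memcpy(p, "a", 1);          p += 1;
	memcpy(p, &r2, sizeof(r2)); p += sizeof(r2);
	memcpy(p, "bc", 2);

	printf("nlink = %u\n", count_refs(item, sizeof(item)));
	return 0;
}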
1023 | |||
1024 | static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, | ||
1025 | struct btrfs_root *root, | ||
1026 | struct btrfs_path *path) | ||
1027 | { | ||
1028 | int ret; | ||
1029 | struct btrfs_key key; | ||
1030 | struct inode *inode; | ||
1031 | |||
1032 | key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; | ||
1033 | key.type = BTRFS_ORPHAN_ITEM_KEY; | ||
1034 | key.offset = (u64)-1; | ||
1035 | while (1) { | ||
1036 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
1037 | if (ret < 0) | ||
1038 | break; | ||
1039 | |||
1040 | if (ret == 1) { | ||
1041 | if (path->slots[0] == 0) | ||
1042 | break; | ||
1043 | path->slots[0]--; | ||
1044 | } | ||
1045 | |||
1046 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
1047 | if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || | ||
1048 | key.type != BTRFS_ORPHAN_ITEM_KEY) | ||
1049 | break; | ||
1050 | |||
1051 | ret = btrfs_del_item(trans, root, path); | ||
1052 | BUG_ON(ret); | ||
1053 | |||
1054 | btrfs_release_path(root, path); | ||
1055 | inode = read_one_inode(root, key.offset); | ||
1056 | BUG_ON(!inode); | ||
1057 | |||
1058 | ret = fixup_inode_link_count(trans, root, inode); | ||
1059 | BUG_ON(ret); | ||
1060 | |||
1061 | iput(inode); | ||
1062 | |||
1063 | if (key.offset == 0) | ||
1064 | break; | ||
1065 | key.offset--; | ||
1066 | } | ||
1067 | btrfs_release_path(root, path); | ||
1068 | return 0; | ||
1069 | } | ||
1070 | |||
1071 | |||
1072 | /* | ||
1073 | * record a given inode in the fixup dir so we can check its link | ||
1074 | * count when replay is done. The link count is incremented here | ||
1075 | * so the inode won't go away until we check it | ||
1076 | */ | ||
1077 | static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, | ||
1078 | struct btrfs_root *root, | ||
1079 | struct btrfs_path *path, | ||
1080 | u64 objectid) | ||
1081 | { | ||
1082 | struct btrfs_key key; | ||
1083 | int ret = 0; | ||
1084 | struct inode *inode; | ||
1085 | |||
1086 | inode = read_one_inode(root, objectid); | ||
1087 | BUG_ON(!inode); | ||
1088 | |||
1089 | key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; | ||
1090 | btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); | ||
1091 | key.offset = objectid; | ||
1092 | |||
1093 | ret = btrfs_insert_empty_item(trans, root, path, &key, 0); | ||
1094 | |||
1095 | btrfs_release_path(root, path); | ||
1096 | if (ret == 0) { | ||
1097 | btrfs_inc_nlink(inode); | ||
1098 | btrfs_update_inode(trans, root, inode); | ||
1099 | } else if (ret == -EEXIST) { | ||
1100 | ret = 0; | ||
1101 | } else { | ||
1102 | BUG(); | ||
1103 | } | ||
1104 | iput(inode); | ||
1105 | |||
1106 | return ret; | ||
1107 | } | ||
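
/*
 * Illustrative sketch (not kernel code): link_to_fixup_dir() treats
 * -EEXIST from the insert as success, which makes the fixup record
 * idempotent across repeated replays of the same inode.  The idiom in
 * isolation (insert_fn and insert_once are hypothetical names):
 */
#include <errno.h>
#include <stdint.h>

/* 0 on success, -EEXIST if the key is already recorded */
typedef int (*insert_fn)(void *ctx, uint64_t key);

static int insert_once(insert_fn insert, void *ctx, uint64_t key)
{
	int ret = insert(ctx, key);

	if (ret == -EEXIST)
		return 0;	/* already recorded: not an error */
	return ret;		/* success or a real failure */
}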
1108 | |||
1109 | /* | ||
1110 | * when replaying the log for a directory, we only insert names | ||
1111 | * for inodes that actually exist. This means an fsync on a directory | ||
1112 | * does not implicitly fsync all the new files in it | ||
1113 | */ | ||
1114 | static noinline int insert_one_name(struct btrfs_trans_handle *trans, | ||
1115 | struct btrfs_root *root, | ||
1116 | struct btrfs_path *path, | ||
1117 | u64 dirid, u64 index, | ||
1118 | char *name, int name_len, u8 type, | ||
1119 | struct btrfs_key *location) | ||
1120 | { | ||
1121 | struct inode *inode; | ||
1122 | struct inode *dir; | ||
1123 | int ret; | ||
1124 | |||
1125 | inode = read_one_inode(root, location->objectid); | ||
1126 | if (!inode) | ||
1127 | return -ENOENT; | ||
1128 | |||
1129 | dir = read_one_inode(root, dirid); | ||
1130 | if (!dir) { | ||
1131 | iput(inode); | ||
1132 | return -EIO; | ||
1133 | } | ||
1134 | ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); | ||
1135 | |||
1136 | /* FIXME, put inode into FIXUP list */ | ||
1137 | |||
1138 | iput(inode); | ||
1139 | iput(dir); | ||
1140 | return ret; | ||
1141 | } | ||
1142 | |||
1143 | /* | ||
1144 | * take a single entry in a log directory item and replay it into | ||
1145 | * the subvolume. | ||
1146 | * | ||
1147 | * if a conflicting item exists in the subdirectory already, | ||
1148 | * the inode it points to is unlinked and put into the link count | ||
1149 | * fix up tree. | ||
1150 | * | ||
1151 | * If a name from the log points to a file or directory that does | ||
1152 | * not exist in the FS, it is skipped. fsyncs on directories | ||
1153 | * do not force down inodes inside that directory, just changes to the | ||
1154 | * names or unlinks in a directory. | ||
1155 | */ | ||
1156 | static noinline int replay_one_name(struct btrfs_trans_handle *trans, | ||
1157 | struct btrfs_root *root, | ||
1158 | struct btrfs_path *path, | ||
1159 | struct extent_buffer *eb, | ||
1160 | struct btrfs_dir_item *di, | ||
1161 | struct btrfs_key *key) | ||
1162 | { | ||
1163 | char *name; | ||
1164 | int name_len; | ||
1165 | struct btrfs_dir_item *dst_di; | ||
1166 | struct btrfs_key found_key; | ||
1167 | struct btrfs_key log_key; | ||
1168 | struct inode *dir; | ||
1169 | u8 log_type; | ||
1170 | int exists; | ||
1171 | int ret; | ||
1172 | |||
1173 | dir = read_one_inode(root, key->objectid); | ||
1174 | BUG_ON(!dir); | ||
1175 | |||
1176 | name_len = btrfs_dir_name_len(eb, di); | ||
1177 | name = kmalloc(name_len, GFP_NOFS); | ||
1178 | BUG_ON(!name); | ||
1178 | log_type = btrfs_dir_type(eb, di); | ||
1179 | read_extent_buffer(eb, name, (unsigned long)(di + 1), | ||
1180 | name_len); | ||
1181 | |||
1182 | btrfs_dir_item_key_to_cpu(eb, di, &log_key); | ||
1183 | ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); | ||
1184 | exists = (ret == 0); | ||
1188 | btrfs_release_path(root, path); | ||
1189 | |||
1190 | if (key->type == BTRFS_DIR_ITEM_KEY) { | ||
1191 | dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, | ||
1192 | name, name_len, 1); | ||
1193 | } else if (key->type == BTRFS_DIR_INDEX_KEY) { | ||
1194 | dst_di = btrfs_lookup_dir_index_item(trans, root, path, | ||
1195 | key->objectid, | ||
1196 | key->offset, name, | ||
1197 | name_len, 1); | ||
1198 | } else { | ||
1199 | BUG(); | ||
1200 | } | ||
1201 | if (!dst_di || IS_ERR(dst_di)) { | ||
1202 | /* we need a sequence number to insert, so we only | ||
1203 | * do inserts for the BTRFS_DIR_INDEX_KEY types | ||
1204 | */ | ||
1205 | if (key->type != BTRFS_DIR_INDEX_KEY) | ||
1206 | goto out; | ||
1207 | goto insert; | ||
1208 | } | ||
1209 | |||
1210 | btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); | ||
1211 | /* the existing item matches the logged item */ | ||
1212 | if (found_key.objectid == log_key.objectid && | ||
1213 | found_key.type == log_key.type && | ||
1214 | found_key.offset == log_key.offset && | ||
1215 | btrfs_dir_type(path->nodes[0], dst_di) == log_type) { | ||
1216 | goto out; | ||
1217 | } | ||
1218 | |||
1219 | /* | ||
1220 | * don't drop the conflicting directory entry if the inode | ||
1221 | * for the new entry doesn't exist | ||
1222 | */ | ||
1223 | if (!exists) | ||
1224 | goto out; | ||
1225 | |||
1226 | ret = drop_one_dir_item(trans, root, path, dir, dst_di); | ||
1227 | BUG_ON(ret); | ||
1228 | |||
1229 | if (key->type == BTRFS_DIR_INDEX_KEY) | ||
1230 | goto insert; | ||
1231 | out: | ||
1232 | btrfs_release_path(root, path); | ||
1233 | kfree(name); | ||
1234 | iput(dir); | ||
1235 | return 0; | ||
1236 | |||
1237 | insert: | ||
1238 | btrfs_release_path(root, path); | ||
1239 | ret = insert_one_name(trans, root, path, key->objectid, key->offset, | ||
1240 | name, name_len, log_type, &log_key); | ||
1241 | |||
1242 | if (ret && ret != -ENOENT) | ||
1243 | BUG(); | ||
1244 | goto out; | ||
1245 | } | ||
1246 | |||
1247 | /* | ||
1248 | * find all the names in a directory item and reconcile them into | ||
1249 | * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than | ||
1250 | * one name in a directory item, but the same code gets used for | ||
1251 | * both directory index types | ||
1252 | */ | ||
1253 | static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, | ||
1254 | struct btrfs_root *root, | ||
1255 | struct btrfs_path *path, | ||
1256 | struct extent_buffer *eb, int slot, | ||
1257 | struct btrfs_key *key) | ||
1258 | { | ||
1259 | int ret; | ||
1260 | u32 item_size = btrfs_item_size_nr(eb, slot); | ||
1261 | struct btrfs_dir_item *di; | ||
1262 | int name_len; | ||
1263 | unsigned long ptr; | ||
1264 | unsigned long ptr_end; | ||
1265 | |||
1266 | ptr = btrfs_item_ptr_offset(eb, slot); | ||
1267 | ptr_end = ptr + item_size; | ||
1268 | while (ptr < ptr_end) { | ||
1269 | di = (struct btrfs_dir_item *)ptr; | ||
1270 | name_len = btrfs_dir_name_len(eb, di); | ||
1271 | ret = replay_one_name(trans, root, path, eb, di, key); | ||
1272 | BUG_ON(ret); | ||
1273 | ptr = (unsigned long)(di + 1); | ||
1274 | ptr += name_len; | ||
1275 | } | ||
1276 | return 0; | ||
1277 | } | ||
1278 | |||
1279 | /* | ||
1280 | * directory replay has two parts. There are the standard directory | ||
1281 | * items in the log copied from the subvolume, and range items | ||
1282 | * created in the log while the subvolume was logged. | ||
1283 | * | ||
1284 | * The range items tell us which parts of the key space the log | ||
1285 | * is authoritative for. During replay, if a key in the subvolume | ||
1286 | * directory is in a logged range item, but not actually in the log, | ||
1287 | * that means it was deleted from the directory before the fsync | ||
1288 | * and should be removed. | ||
1289 | */ | ||
1290 | static noinline int find_dir_range(struct btrfs_root *root, | ||
1291 | struct btrfs_path *path, | ||
1292 | u64 dirid, int key_type, | ||
1293 | u64 *start_ret, u64 *end_ret) | ||
1294 | { | ||
1295 | struct btrfs_key key; | ||
1296 | u64 found_end; | ||
1297 | struct btrfs_dir_log_item *item; | ||
1298 | int ret; | ||
1299 | int nritems; | ||
1300 | |||
1301 | if (*start_ret == (u64)-1) | ||
1302 | return 1; | ||
1303 | |||
1304 | key.objectid = dirid; | ||
1305 | key.type = key_type; | ||
1306 | key.offset = *start_ret; | ||
1307 | |||
1308 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
1309 | if (ret < 0) | ||
1310 | goto out; | ||
1311 | if (ret > 0) { | ||
1312 | if (path->slots[0] == 0) | ||
1313 | goto out; | ||
1314 | path->slots[0]--; | ||
1315 | } | ||
1316 | if (ret != 0) | ||
1317 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
1318 | |||
1319 | if (key.type != key_type || key.objectid != dirid) { | ||
1320 | ret = 1; | ||
1321 | goto next; | ||
1322 | } | ||
1323 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
1324 | struct btrfs_dir_log_item); | ||
1325 | found_end = btrfs_dir_log_end(path->nodes[0], item); | ||
1326 | |||
1327 | if (*start_ret >= key.offset && *start_ret <= found_end) { | ||
1328 | ret = 0; | ||
1329 | *start_ret = key.offset; | ||
1330 | *end_ret = found_end; | ||
1331 | goto out; | ||
1332 | } | ||
1333 | ret = 1; | ||
1334 | next: | ||
1335 | /* check the next slot in the tree to see if it is a valid item */ | ||
1336 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
1337 | if (path->slots[0] >= nritems) { | ||
1338 | ret = btrfs_next_leaf(root, path); | ||
1339 | if (ret) | ||
1340 | goto out; | ||
1341 | } else { | ||
1342 | path->slots[0]++; | ||
1343 | } | ||
1344 | |||
1345 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
1346 | |||
1347 | if (key.type != key_type || key.objectid != dirid) { | ||
1348 | ret = 1; | ||
1349 | goto out; | ||
1350 | } | ||
1351 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
1352 | struct btrfs_dir_log_item); | ||
1353 | found_end = btrfs_dir_log_end(path->nodes[0], item); | ||
1354 | *start_ret = key.offset; | ||
1355 | *end_ret = found_end; | ||
1356 | ret = 0; | ||
1357 | out: | ||
1358 | btrfs_release_path(root, path); | ||
1359 | return ret; | ||
1360 | } | ||
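
/*
 * Illustrative sketch (not kernel code): the lookup above, reduced to a
 * sorted array of [start, end] ranges.  It returns 0 with *start_ret
 * and *end_ret set to the covering range (or the next range ahead of
 * *start_ret), and 1 when no logged range remains -- a simplification
 * of the slot walk, but the same contract the replay loop relies on.
 */
#include <stdint.h>

struct toy_dir_range {
	uint64_t start;
	uint64_t end;
};

static int toy_find_dir_range(const struct toy_dir_range *ranges, int nr,
			      uint64_t *start_ret, uint64_t *end_ret)
{
	for (int i = 0; i < nr; i++) {
		/* ranges are sorted: the first one ending at or after
		 * *start_ret either covers it or is the next one ahead */
		if (ranges[i].end >= *start_ret) {
			*start_ret = ranges[i].start;
			*end_ret = ranges[i].end;
			return 0;
		}
	}
	return 1;
}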
1361 | |||
1362 | /* | ||
1363 | * this looks for a given directory item in the log. If the directory | ||
1364 | * item is not in the log, the item is removed and the inode it points | ||
1365 | * to is unlinked | ||
1366 | */ | ||
1367 | static noinline int check_item_in_log(struct btrfs_trans_handle *trans, | ||
1368 | struct btrfs_root *root, | ||
1369 | struct btrfs_root *log, | ||
1370 | struct btrfs_path *path, | ||
1371 | struct btrfs_path *log_path, | ||
1372 | struct inode *dir, | ||
1373 | struct btrfs_key *dir_key) | ||
1374 | { | ||
1375 | int ret; | ||
1376 | struct extent_buffer *eb; | ||
1377 | int slot; | ||
1378 | u32 item_size; | ||
1379 | struct btrfs_dir_item *di; | ||
1380 | struct btrfs_dir_item *log_di; | ||
1381 | int name_len; | ||
1382 | unsigned long ptr; | ||
1383 | unsigned long ptr_end; | ||
1384 | char *name; | ||
1385 | struct inode *inode; | ||
1386 | struct btrfs_key location; | ||
1387 | |||
1388 | again: | ||
1389 | eb = path->nodes[0]; | ||
1390 | slot = path->slots[0]; | ||
1391 | item_size = btrfs_item_size_nr(eb, slot); | ||
1392 | ptr = btrfs_item_ptr_offset(eb, slot); | ||
1393 | ptr_end = ptr + item_size; | ||
1394 | while (ptr < ptr_end) { | ||
1395 | di = (struct btrfs_dir_item *)ptr; | ||
1396 | name_len = btrfs_dir_name_len(eb, di); | ||
1397 | name = kmalloc(name_len, GFP_NOFS); | ||
1398 | if (!name) { | ||
1399 | ret = -ENOMEM; | ||
1400 | goto out; | ||
1401 | } | ||
1402 | read_extent_buffer(eb, name, (unsigned long)(di + 1), | ||
1403 | name_len); | ||
1404 | log_di = NULL; | ||
1405 | if (dir_key->type == BTRFS_DIR_ITEM_KEY) { | ||
1406 | log_di = btrfs_lookup_dir_item(trans, log, log_path, | ||
1407 | dir_key->objectid, | ||
1408 | name, name_len, 0); | ||
1409 | } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { | ||
1410 | log_di = btrfs_lookup_dir_index_item(trans, log, | ||
1411 | log_path, | ||
1412 | dir_key->objectid, | ||
1413 | dir_key->offset, | ||
1414 | name, name_len, 0); | ||
1415 | } | ||
1416 | if (!log_di || IS_ERR(log_di)) { | ||
1417 | btrfs_dir_item_key_to_cpu(eb, di, &location); | ||
1418 | btrfs_release_path(root, path); | ||
1419 | btrfs_release_path(log, log_path); | ||
1420 | inode = read_one_inode(root, location.objectid); | ||
1421 | BUG_ON(!inode); | ||
1422 | |||
1423 | ret = link_to_fixup_dir(trans, root, | ||
1424 | path, location.objectid); | ||
1425 | BUG_ON(ret); | ||
1426 | btrfs_inc_nlink(inode); | ||
1427 | ret = btrfs_unlink_inode(trans, root, dir, inode, | ||
1428 | name, name_len); | ||
1429 | BUG_ON(ret); | ||
1430 | kfree(name); | ||
1431 | iput(inode); | ||
1432 | |||
1433 | /* there might still be more names under this key | ||
1434 | * check and repeat if required | ||
1435 | */ | ||
1436 | ret = btrfs_search_slot(NULL, root, dir_key, path, | ||
1437 | 0, 0); | ||
1438 | if (ret == 0) | ||
1439 | goto again; | ||
1440 | ret = 0; | ||
1441 | goto out; | ||
1442 | } | ||
1443 | btrfs_release_path(log, log_path); | ||
1444 | kfree(name); | ||
1445 | |||
1446 | ptr = (unsigned long)(di + 1); | ||
1447 | ptr += name_len; | ||
1448 | } | ||
1449 | ret = 0; | ||
1450 | out: | ||
1451 | btrfs_release_path(root, path); | ||
1452 | btrfs_release_path(log, log_path); | ||
1453 | return ret; | ||
1454 | } | ||
1455 | |||
1456 | /* | ||
1457 | * deletion replay happens before we copy any new directory items | ||
1458 | * out of the log or out of backreferences from inodes. It | ||
1459 | * scans the log to find ranges of keys that the log is authoritative for, | ||
1460 | * and then scans the directory to find items in those ranges that are | ||
1461 | * not present in the log. | ||
1462 | * | ||
1463 | * Anything we don't find in the log is unlinked and removed from the | ||
1464 | * directory. | ||
1465 | */ | ||
1466 | static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, | ||
1467 | struct btrfs_root *root, | ||
1468 | struct btrfs_root *log, | ||
1469 | struct btrfs_path *path, | ||
1470 | u64 dirid) | ||
1471 | { | ||
1472 | u64 range_start; | ||
1473 | u64 range_end; | ||
1474 | int key_type = BTRFS_DIR_LOG_ITEM_KEY; | ||
1475 | int ret = 0; | ||
1476 | struct btrfs_key dir_key; | ||
1477 | struct btrfs_key found_key; | ||
1478 | struct btrfs_path *log_path; | ||
1479 | struct inode *dir; | ||
1480 | |||
1481 | dir_key.objectid = dirid; | ||
1482 | dir_key.type = BTRFS_DIR_ITEM_KEY; | ||
1483 | log_path = btrfs_alloc_path(); | ||
1484 | if (!log_path) | ||
1485 | return -ENOMEM; | ||
1486 | |||
1487 | dir = read_one_inode(root, dirid); | ||
1488 | /* it isn't an error if the inode isn't there, that can happen | ||
1489 | * because we replay the deletes before we copy in the inode item | ||
1490 | * from the log | ||
1491 | */ | ||
1492 | if (!dir) { | ||
1493 | btrfs_free_path(log_path); | ||
1494 | return 0; | ||
1495 | } | ||
1496 | again: | ||
1497 | range_start = 0; | ||
1498 | range_end = 0; | ||
1499 | while (1) { | ||
1500 | ret = find_dir_range(log, path, dirid, key_type, | ||
1501 | &range_start, &range_end); | ||
1502 | if (ret != 0) | ||
1503 | break; | ||
1504 | |||
1505 | dir_key.offset = range_start; | ||
1506 | while (1) { | ||
1507 | int nritems; | ||
1508 | ret = btrfs_search_slot(NULL, root, &dir_key, path, | ||
1509 | 0, 0); | ||
1510 | if (ret < 0) | ||
1511 | goto out; | ||
1512 | |||
1513 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
1514 | if (path->slots[0] >= nritems) { | ||
1515 | ret = btrfs_next_leaf(root, path); | ||
1516 | if (ret) | ||
1517 | break; | ||
1518 | } | ||
1519 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
1520 | path->slots[0]); | ||
1521 | if (found_key.objectid != dirid || | ||
1522 | found_key.type != dir_key.type) | ||
1523 | goto next_type; | ||
1524 | |||
1525 | if (found_key.offset > range_end) | ||
1526 | break; | ||
1527 | |||
1528 | ret = check_item_in_log(trans, root, log, path, | ||
1529 | log_path, dir, &found_key); | ||
1530 | BUG_ON(ret); | ||
1531 | if (found_key.offset == (u64)-1) | ||
1532 | break; | ||
1533 | dir_key.offset = found_key.offset + 1; | ||
1534 | } | ||
1535 | btrfs_release_path(root, path); | ||
1536 | if (range_end == (u64)-1) | ||
1537 | break; | ||
1538 | range_start = range_end + 1; | ||
1539 | } | ||
1540 | |||
1541 | next_type: | ||
1542 | ret = 0; | ||
1543 | if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { | ||
1544 | key_type = BTRFS_DIR_LOG_INDEX_KEY; | ||
1545 | dir_key.type = BTRFS_DIR_INDEX_KEY; | ||
1546 | btrfs_release_path(root, path); | ||
1547 | goto again; | ||
1548 | } | ||
1549 | out: | ||
1550 | btrfs_release_path(root, path); | ||
1551 | btrfs_free_path(log_path); | ||
1552 | iput(dir); | ||
1553 | return ret; | ||
1554 | } | ||
1555 | |||
1556 | /* | ||
1557 | * the process_func used to replay items from the log tree. This | ||
1558 | * gets called in two different stages. The first stage just looks | ||
1559 | * for inodes and makes sure they are all copied into the subvolume. | ||
1560 | * | ||
1561 | * The second stage copies all the other item types from the log into | ||
1562 | * the subvolume. The two stage approach is slower, but gets rid of | ||
1563 | * lots of complexity around inodes referencing other inodes that exist | ||
1564 | * only in the log (references come from either directory items or inode | ||
1565 | * back refs). | ||
1566 | */ | ||
1567 | static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, | ||
1568 | struct walk_control *wc, u64 gen) | ||
1569 | { | ||
1570 | int nritems; | ||
1571 | struct btrfs_path *path; | ||
1572 | struct btrfs_root *root = wc->replay_dest; | ||
1573 | struct btrfs_key key; | ||
1574 | u32 item_size; | ||
1575 | int level; | ||
1576 | int i; | ||
1577 | int ret; | ||
1578 | |||
1579 | btrfs_read_buffer(eb, gen); | ||
1580 | |||
1581 | level = btrfs_header_level(eb); | ||
1582 | |||
1583 | if (level != 0) | ||
1584 | return 0; | ||
1585 | |||
1586 | path = btrfs_alloc_path(); | ||
1587 | BUG_ON(!path); | ||
1588 | |||
1589 | nritems = btrfs_header_nritems(eb); | ||
1590 | for (i = 0; i < nritems; i++) { | ||
1591 | btrfs_item_key_to_cpu(eb, &key, i); | ||
1592 | item_size = btrfs_item_size_nr(eb, i); | ||
1593 | |||
1594 | /* inode keys are done during the first stage */ | ||
1595 | if (key.type == BTRFS_INODE_ITEM_KEY && | ||
1596 | wc->stage == LOG_WALK_REPLAY_INODES) { | ||
1597 | struct inode *inode; | ||
1598 | struct btrfs_inode_item *inode_item; | ||
1599 | u32 mode; | ||
1600 | |||
1601 | inode_item = btrfs_item_ptr(eb, i, | ||
1602 | struct btrfs_inode_item); | ||
1603 | mode = btrfs_inode_mode(eb, inode_item); | ||
1604 | if (S_ISDIR(mode)) { | ||
1605 | ret = replay_dir_deletes(wc->trans, | ||
1606 | root, log, path, key.objectid); | ||
1607 | BUG_ON(ret); | ||
1608 | } | ||
1609 | ret = overwrite_item(wc->trans, root, path, | ||
1610 | eb, i, &key); | ||
1611 | BUG_ON(ret); | ||
1612 | |||
1613 | /* for regular files, truncate away | ||
1614 | * extents past the new EOF | ||
1615 | */ | ||
1616 | if (S_ISREG(mode)) { | ||
1617 | inode = read_one_inode(root, | ||
1618 | key.objectid); | ||
1619 | BUG_ON(!inode); | ||
1620 | |||
1621 | ret = btrfs_truncate_inode_items(wc->trans, | ||
1622 | root, inode, inode->i_size, | ||
1623 | BTRFS_EXTENT_DATA_KEY); | ||
1624 | BUG_ON(ret); | ||
1625 | iput(inode); | ||
1626 | } | ||
1627 | ret = link_to_fixup_dir(wc->trans, root, | ||
1628 | path, key.objectid); | ||
1629 | BUG_ON(ret); | ||
1630 | } | ||
1631 | if (wc->stage < LOG_WALK_REPLAY_ALL) | ||
1632 | continue; | ||
1633 | |||
1634 | /* these keys are simply copied */ | ||
1635 | if (key.type == BTRFS_XATTR_ITEM_KEY) { | ||
1636 | ret = overwrite_item(wc->trans, root, path, | ||
1637 | eb, i, &key); | ||
1638 | BUG_ON(ret); | ||
1639 | } else if (key.type == BTRFS_INODE_REF_KEY) { | ||
1640 | ret = add_inode_ref(wc->trans, root, log, path, | ||
1641 | eb, i, &key); | ||
1642 | BUG_ON(ret && ret != -ENOENT); | ||
1643 | } else if (key.type == BTRFS_EXTENT_DATA_KEY) { | ||
1644 | ret = replay_one_extent(wc->trans, root, path, | ||
1645 | eb, i, &key); | ||
1646 | BUG_ON(ret); | ||
1647 | } else if (key.type == BTRFS_DIR_ITEM_KEY || | ||
1648 | key.type == BTRFS_DIR_INDEX_KEY) { | ||
1649 | ret = replay_one_dir_item(wc->trans, root, path, | ||
1650 | eb, i, &key); | ||
1651 | BUG_ON(ret); | ||
1652 | } | ||
1653 | } | ||
1654 | btrfs_free_path(path); | ||
1655 | return 0; | ||
1656 | } | ||
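
/*
 * Illustrative sketch (not kernel code): the two-stage dispatch above,
 * isolated.  Stage one replays only inode items; everything else waits
 * for stage two, so that by the time a directory entry or back
 * reference is replayed, the inode it points at already exists.  All
 * names here (toy_*) are hypothetical.
 */
enum toy_stage { TOY_REPLAY_INODES, TOY_REPLAY_ALL };
enum toy_type { TOY_INODE_ITEM, TOY_INODE_REF, TOY_DIR_ITEM };

static int toy_replay_item(enum toy_stage stage, enum toy_type type,
			   int (*replay)(enum toy_type))
{
	if (type == TOY_INODE_ITEM && stage == TOY_REPLAY_INODES)
		return replay(type);		/* stage one: inodes only */
	if (stage < TOY_REPLAY_ALL)
		return 0;			/* defer to stage two */
	if (type != TOY_INODE_ITEM)
		return replay(type);		/* stage two: the rest */
	return 0;				/* inodes already done */
}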
1657 | |||
1658 | static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | ||
1659 | struct btrfs_root *root, | ||
1660 | struct btrfs_path *path, int *level, | ||
1661 | struct walk_control *wc) | ||
1662 | { | ||
1663 | u64 root_owner; | ||
1664 | u64 root_gen; | ||
1665 | u64 bytenr; | ||
1666 | u64 ptr_gen; | ||
1667 | struct extent_buffer *next; | ||
1668 | struct extent_buffer *cur; | ||
1669 | struct extent_buffer *parent; | ||
1670 | u32 blocksize; | ||
1671 | int ret = 0; | ||
1672 | |||
1673 | WARN_ON(*level < 0); | ||
1674 | WARN_ON(*level >= BTRFS_MAX_LEVEL); | ||
1675 | |||
1676 | while (*level > 0) { | ||
1677 | WARN_ON(*level < 0); | ||
1678 | WARN_ON(*level >= BTRFS_MAX_LEVEL); | ||
1679 | cur = path->nodes[*level]; | ||
1680 | |||
1681 | if (btrfs_header_level(cur) != *level) | ||
1682 | WARN_ON(1); | ||
1683 | |||
1684 | if (path->slots[*level] >= | ||
1685 | btrfs_header_nritems(cur)) | ||
1686 | break; | ||
1687 | |||
1688 | bytenr = btrfs_node_blockptr(cur, path->slots[*level]); | ||
1689 | ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); | ||
1690 | blocksize = btrfs_level_size(root, *level - 1); | ||
1691 | |||
1692 | parent = path->nodes[*level]; | ||
1693 | root_owner = btrfs_header_owner(parent); | ||
1694 | root_gen = btrfs_header_generation(parent); | ||
1695 | |||
1696 | next = btrfs_find_create_tree_block(root, bytenr, blocksize); | ||
1697 | |||
1698 | wc->process_func(root, next, wc, ptr_gen); | ||
1699 | |||
1700 | if (*level == 1) { | ||
1701 | path->slots[*level]++; | ||
1702 | if (wc->free) { | ||
1703 | btrfs_read_buffer(next, ptr_gen); | ||
1704 | |||
1705 | btrfs_tree_lock(next); | ||
1706 | clean_tree_block(trans, root, next); | ||
1707 | btrfs_wait_tree_block_writeback(next); | ||
1708 | btrfs_tree_unlock(next); | ||
1709 | |||
1710 | ret = btrfs_drop_leaf_ref(trans, root, next); | ||
1711 | BUG_ON(ret); | ||
1712 | |||
1713 | WARN_ON(root_owner != | ||
1714 | BTRFS_TREE_LOG_OBJECTID); | ||
1715 | ret = btrfs_free_reserved_extent(root, | ||
1716 | bytenr, blocksize); | ||
1717 | BUG_ON(ret); | ||
1718 | } | ||
1719 | free_extent_buffer(next); | ||
1720 | continue; | ||
1721 | } | ||
1722 | btrfs_read_buffer(next, ptr_gen); | ||
1723 | |||
1724 | WARN_ON(*level <= 0); | ||
1725 | if (path->nodes[*level-1]) | ||
1726 | free_extent_buffer(path->nodes[*level-1]); | ||
1727 | path->nodes[*level-1] = next; | ||
1728 | *level = btrfs_header_level(next); | ||
1729 | path->slots[*level] = 0; | ||
1730 | cond_resched(); | ||
1731 | } | ||
1732 | WARN_ON(*level < 0); | ||
1733 | WARN_ON(*level >= BTRFS_MAX_LEVEL); | ||
1734 | |||
1735 | if (path->nodes[*level] == root->node) | ||
1736 | parent = path->nodes[*level]; | ||
1737 | else | ||
1738 | parent = path->nodes[*level + 1]; | ||
1739 | |||
1740 | bytenr = path->nodes[*level]->start; | ||
1741 | |||
1742 | blocksize = btrfs_level_size(root, *level); | ||
1743 | root_owner = btrfs_header_owner(parent); | ||
1744 | root_gen = btrfs_header_generation(parent); | ||
1745 | |||
1746 | wc->process_func(root, path->nodes[*level], wc, | ||
1747 | btrfs_header_generation(path->nodes[*level])); | ||
1748 | |||
1749 | if (wc->free) { | ||
1750 | next = path->nodes[*level]; | ||
1751 | btrfs_tree_lock(next); | ||
1752 | clean_tree_block(trans, root, next); | ||
1753 | btrfs_wait_tree_block_writeback(next); | ||
1754 | btrfs_tree_unlock(next); | ||
1755 | |||
1756 | if (*level == 0) { | ||
1757 | ret = btrfs_drop_leaf_ref(trans, root, next); | ||
1758 | BUG_ON(ret); | ||
1759 | } | ||
1760 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); | ||
1761 | ret = btrfs_free_reserved_extent(root, bytenr, blocksize); | ||
1762 | BUG_ON(ret); | ||
1763 | } | ||
1764 | free_extent_buffer(path->nodes[*level]); | ||
1765 | path->nodes[*level] = NULL; | ||
1766 | *level += 1; | ||
1767 | |||
1768 | cond_resched(); | ||
1769 | return 0; | ||
1770 | } | ||
1771 | |||
1772 | static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, | ||
1773 | struct btrfs_root *root, | ||
1774 | struct btrfs_path *path, int *level, | ||
1775 | struct walk_control *wc) | ||
1776 | { | ||
1777 | u64 root_owner; | ||
1778 | u64 root_gen; | ||
1779 | int i; | ||
1780 | int slot; | ||
1781 | int ret; | ||
1782 | |||
1783 | for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { | ||
1784 | slot = path->slots[i]; | ||
1785 | if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { | ||
1786 | struct extent_buffer *node; | ||
1787 | node = path->nodes[i]; | ||
1788 | path->slots[i]++; | ||
1789 | *level = i; | ||
1790 | WARN_ON(*level == 0); | ||
1791 | return 0; | ||
1792 | } else { | ||
1793 | struct extent_buffer *parent; | ||
1794 | if (path->nodes[*level] == root->node) | ||
1795 | parent = path->nodes[*level]; | ||
1796 | else | ||
1797 | parent = path->nodes[*level + 1]; | ||
1798 | |||
1799 | root_owner = btrfs_header_owner(parent); | ||
1800 | root_gen = btrfs_header_generation(parent); | ||
1801 | wc->process_func(root, path->nodes[*level], wc, | ||
1802 | btrfs_header_generation(path->nodes[*level])); | ||
1803 | if (wc->free) { | ||
1804 | struct extent_buffer *next; | ||
1805 | |||
1806 | next = path->nodes[*level]; | ||
1807 | |||
1808 | btrfs_tree_lock(next); | ||
1809 | clean_tree_block(trans, root, next); | ||
1810 | btrfs_wait_tree_block_writeback(next); | ||
1811 | btrfs_tree_unlock(next); | ||
1812 | |||
1813 | if (*level == 0) { | ||
1814 | ret = btrfs_drop_leaf_ref(trans, root, | ||
1815 | next); | ||
1816 | BUG_ON(ret); | ||
1817 | } | ||
1818 | |||
1819 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); | ||
1820 | ret = btrfs_free_reserved_extent(root, | ||
1821 | path->nodes[*level]->start, | ||
1822 | path->nodes[*level]->len); | ||
1823 | BUG_ON(ret); | ||
1824 | } | ||
1825 | free_extent_buffer(path->nodes[*level]); | ||
1826 | path->nodes[*level] = NULL; | ||
1827 | *level = i + 1; | ||
1828 | } | ||
1829 | } | ||
1830 | return 1; | ||
1831 | } | ||
1832 | |||
1833 | /* | ||
1834 | * drop the reference count on the tree rooted at 'log'. This traverses | ||
1835 | * the tree freeing any blocks that have a ref count of zero after being | ||
1836 | * decremented. | ||
1837 | */ | ||
1838 | static int walk_log_tree(struct btrfs_trans_handle *trans, | ||
1839 | struct btrfs_root *log, struct walk_control *wc) | ||
1840 | { | ||
1841 | int ret = 0; | ||
1842 | int wret; | ||
1843 | int level; | ||
1844 | struct btrfs_path *path; | ||
1845 | int i; | ||
1846 | int orig_level; | ||
1847 | |||
1848 | path = btrfs_alloc_path(); | ||
1849 | BUG_ON(!path); | ||
1850 | |||
1851 | level = btrfs_header_level(log->node); | ||
1852 | orig_level = level; | ||
1853 | path->nodes[level] = log->node; | ||
1854 | extent_buffer_get(log->node); | ||
1855 | path->slots[level] = 0; | ||
1856 | |||
1857 | while (1) { | ||
1858 | wret = walk_down_log_tree(trans, log, path, &level, wc); | ||
1859 | if (wret > 0) | ||
1860 | break; | ||
1861 | if (wret < 0) | ||
1862 | ret = wret; | ||
1863 | |||
1864 | wret = walk_up_log_tree(trans, log, path, &level, wc); | ||
1865 | if (wret > 0) | ||
1866 | break; | ||
1867 | if (wret < 0) | ||
1868 | ret = wret; | ||
1869 | } | ||
1870 | |||
1871 | /* was the root node processed? if not, catch it here */ | ||
1872 | if (path->nodes[orig_level]) { | ||
1873 | wc->process_func(log, path->nodes[orig_level], wc, | ||
1874 | btrfs_header_generation(path->nodes[orig_level])); | ||
1875 | if (wc->free) { | ||
1876 | struct extent_buffer *next; | ||
1877 | |||
1878 | next = path->nodes[orig_level]; | ||
1879 | |||
1880 | btrfs_tree_lock(next); | ||
1881 | clean_tree_block(trans, log, next); | ||
1882 | btrfs_wait_tree_block_writeback(next); | ||
1883 | btrfs_tree_unlock(next); | ||
1884 | |||
1885 | if (orig_level == 0) { | ||
1886 | ret = btrfs_drop_leaf_ref(trans, log, | ||
1887 | next); | ||
1888 | BUG_ON(ret); | ||
1889 | } | ||
1890 | WARN_ON(log->root_key.objectid != | ||
1891 | BTRFS_TREE_LOG_OBJECTID); | ||
1892 | ret = btrfs_free_reserved_extent(log, next->start, | ||
1893 | next->len); | ||
1894 | BUG_ON(ret); | ||
1895 | } | ||
1896 | } | ||
1897 | |||
1898 | for (i = 0; i <= orig_level; i++) { | ||
1899 | if (path->nodes[i]) { | ||
1900 | free_extent_buffer(path->nodes[i]); | ||
1901 | path->nodes[i] = NULL; | ||
1902 | } | ||
1903 | } | ||
1904 | btrfs_free_path(path); | ||
1905 | if (wc->free) | ||
1906 | free_extent_buffer(log->node); | ||
1907 | return ret; | ||
1908 | } | ||
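
/*
 * Illustrative sketch (not kernel code): the walk above keeps an
 * explicit stack of (node, slot) pairs instead of recursing --
 * path->nodes[] and path->slots[] in the kernel code.  The same
 * traversal on a toy n-ary tree; every node is visited exactly once,
 * parents before children.  All toy_* names are hypothetical.
 */
#include <stdio.h>

#define TOY_MAX_LEVEL 8

struct toy_node {
	int level;			/* 0 for leaves */
	int nr;				/* number of children */
	struct toy_node *child[4];
	const char *name;
};

static void toy_walk(struct toy_node *root, void (*process)(struct toy_node *))
{
	struct toy_node *nodes[TOY_MAX_LEVEL] = { NULL };
	int slots[TOY_MAX_LEVEL] = { 0 };
	int level = root->level;

	nodes[level] = root;
	process(root);
	while (1) {
		/* walk down while the current node has unvisited children */
		while (level > 0 && slots[level] < nodes[level]->nr) {
			struct toy_node *next = nodes[level]->child[slots[level]];

			process(next);
			if (next->level == 0) {
				slots[level]++;	/* leaf: stay here, advance */
				continue;
			}
			nodes[next->level] = next;
			slots[next->level] = 0;
			level = next->level;
		}
		/* walk up: this node is finished, advance the parent slot */
		if (nodes[level] == root)
			break;
		nodes[level] = NULL;
		level++;
		slots[level]++;
	}
}

static void print_name(struct toy_node *n)
{
	printf("%s ", n->name);
}

int main(void)
{
	struct toy_node a = { 0, 0, { NULL }, "a" };
	struct toy_node b = { 0, 0, { NULL }, "b" };
	struct toy_node c = { 0, 0, { NULL }, "c" };
	struct toy_node m1 = { 1, 2, { &a, &b }, "m1" };
	struct toy_node m2 = { 1, 1, { &c }, "m2" };
	struct toy_node root = { 2, 2, { &m1, &m2 }, "root" };

	toy_walk(&root, print_name);	/* prints: root m1 a b m2 c */
	printf("\n");
	return 0;
}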
1909 | |||
1910 | static int wait_log_commit(struct btrfs_root *log) | ||
1911 | { | ||
1912 | DEFINE_WAIT(wait); | ||
1913 | u64 transid = log->fs_info->tree_log_transid; | ||
1914 | |||
1915 | do { | ||
1916 | prepare_to_wait(&log->fs_info->tree_log_wait, &wait, | ||
1917 | TASK_UNINTERRUPTIBLE); | ||
1918 | mutex_unlock(&log->fs_info->tree_log_mutex); | ||
1919 | if (atomic_read(&log->fs_info->tree_log_commit)) | ||
1920 | schedule(); | ||
1921 | finish_wait(&log->fs_info->tree_log_wait, &wait); | ||
1922 | mutex_lock(&log->fs_info->tree_log_mutex); | ||
1923 | } while (transid == log->fs_info->tree_log_transid && | ||
1924 | atomic_read(&log->fs_info->tree_log_commit)); | ||
1925 | return 0; | ||
1926 | } | ||
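
/*
 * Illustrative userspace analogue (not kernel code): wait_log_commit()
 * is the classic "sleep until the commit in flight finishes" pattern,
 * built from prepare_to_wait/schedule/finish_wait.  With pthreads the
 * same shape is a condition-variable wait that re-checks both the
 * transid and the commit flag.  All toy_* names are hypothetical.
 */
#include <pthread.h>
#include <stdint.h>

struct toy_log_info {
	pthread_mutex_t lock;		/* plays tree_log_mutex */
	pthread_cond_t cond;		/* plays tree_log_wait */
	uint64_t transid;		/* plays tree_log_transid */
	int commit_in_flight;		/* plays tree_log_commit */
};

/* called with info->lock held, just as the kernel function is called
 * with tree_log_mutex held; a committer broadcasts on info->cond after
 * bumping transid and clearing commit_in_flight */
static void toy_wait_log_commit(struct toy_log_info *info)
{
	uint64_t transid = info->transid;

	while (transid == info->transid && info->commit_in_flight)
		pthread_cond_wait(&info->cond, &info->lock);
}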
1927 | |||
1928 | /* | ||
1929 | * btrfs_sync_log sends a given tree log down to the disk and | ||
1930 | * updates the super blocks to record it. When this call is done, | ||
1931 | * you know that any inodes previously logged are safely on disk | ||
1932 | */ | ||
1933 | int btrfs_sync_log(struct btrfs_trans_handle *trans, | ||
1934 | struct btrfs_root *root) | ||
1935 | { | ||
1936 | int ret; | ||
1937 | unsigned long batch; | ||
1938 | struct btrfs_root *log = root->log_root; | ||
1939 | |||
1940 | mutex_lock(&log->fs_info->tree_log_mutex); | ||
1941 | if (atomic_read(&log->fs_info->tree_log_commit)) { | ||
1942 | wait_log_commit(log); | ||
1943 | goto out; | ||
1944 | } | ||
1945 | atomic_set(&log->fs_info->tree_log_commit, 1); | ||
1946 | |||
1947 | while (1) { | ||
1948 | batch = log->fs_info->tree_log_batch; | ||
1949 | mutex_unlock(&log->fs_info->tree_log_mutex); | ||
1950 | schedule_timeout_uninterruptible(1); | ||
1951 | mutex_lock(&log->fs_info->tree_log_mutex); | ||
1952 | |||
1953 | while (atomic_read(&log->fs_info->tree_log_writers)) { | ||
1954 | DEFINE_WAIT(wait); | ||
1955 | prepare_to_wait(&log->fs_info->tree_log_wait, &wait, | ||
1956 | TASK_UNINTERRUPTIBLE); | ||
1957 | mutex_unlock(&log->fs_info->tree_log_mutex); | ||
1958 | if (atomic_read(&log->fs_info->tree_log_writers)) | ||
1959 | schedule(); | ||
1960 | mutex_lock(&log->fs_info->tree_log_mutex); | ||
1961 | finish_wait(&log->fs_info->tree_log_wait, &wait); | ||
1962 | } | ||
1963 | if (batch == log->fs_info->tree_log_batch) | ||
1964 | break; | ||
1965 | } | ||
1966 | |||
1967 | ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); | ||
1968 | BUG_ON(ret); | ||
1969 | ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, | ||
1970 | &root->fs_info->log_root_tree->dirty_log_pages); | ||
1971 | BUG_ON(ret); | ||
1972 | |||
1973 | btrfs_set_super_log_root(&root->fs_info->super_for_commit, | ||
1974 | log->fs_info->log_root_tree->node->start); | ||
1975 | btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, | ||
1976 | btrfs_header_level(log->fs_info->log_root_tree->node)); | ||
1977 | |||
1978 | write_ctree_super(trans, log->fs_info->tree_root, 2); | ||
1979 | log->fs_info->tree_log_transid++; | ||
1980 | log->fs_info->tree_log_batch = 0; | ||
1981 | atomic_set(&log->fs_info->tree_log_commit, 0); | ||
1982 | smp_mb(); | ||
1983 | if (waitqueue_active(&log->fs_info->tree_log_wait)) | ||
1984 | wake_up(&log->fs_info->tree_log_wait); | ||
1985 | out: | ||
1986 | mutex_unlock(&log->fs_info->tree_log_mutex); | ||
1987 | return 0; | ||
1988 | } | ||
1989 | |||
1990 | /* free all the extents used by the tree log. This should be called | ||
1991 | * at commit time of the full transaction | ||
1992 | */ | ||
1993 | int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) | ||
1994 | { | ||
1995 | int ret; | ||
1996 | struct btrfs_root *log; | ||
1998 | u64 start; | ||
1999 | u64 end; | ||
2000 | struct walk_control wc = { | ||
2001 | .free = 1, | ||
2002 | .process_func = process_one_buffer | ||
2003 | }; | ||
2004 | |||
2005 | if (!root->log_root || root->fs_info->log_root_recovering) | ||
2006 | return 0; | ||
2007 | |||
2008 | log = root->log_root; | ||
2009 | ret = walk_log_tree(trans, log, &wc); | ||
2010 | BUG_ON(ret); | ||
2011 | |||
2012 | while (1) { | ||
2013 | ret = find_first_extent_bit(&log->dirty_log_pages, | ||
2014 | 0, &start, &end, EXTENT_DIRTY); | ||
2015 | if (ret) | ||
2016 | break; | ||
2017 | |||
2018 | clear_extent_dirty(&log->dirty_log_pages, | ||
2019 | start, end, GFP_NOFS); | ||
2020 | } | ||
2021 | |||
2023 | ret = btrfs_del_root(trans, root->fs_info->log_root_tree, | ||
2024 | &log->root_key); | ||
2025 | BUG_ON(ret); | ||
2026 | root->log_root = NULL; | ||
2027 | kfree(log); | ||
2028 | return 0; | ||
2029 | } | ||
2030 | |||
2031 | /* | ||
2032 | * helper function to update the item for a given subvolume's log root | ||
2033 | * in the tree of log roots | ||
2034 | */ | ||
2035 | static int update_log_root(struct btrfs_trans_handle *trans, | ||
2036 | struct btrfs_root *log) | ||
2037 | { | ||
2038 | u64 bytenr = btrfs_root_bytenr(&log->root_item); | ||
2039 | int ret; | ||
2040 | |||
2041 | if (log->node->start == bytenr) | ||
2042 | return 0; | ||
2043 | |||
2044 | btrfs_set_root_bytenr(&log->root_item, log->node->start); | ||
2045 | btrfs_set_root_generation(&log->root_item, trans->transid); | ||
2046 | btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); | ||
2047 | ret = btrfs_update_root(trans, log->fs_info->log_root_tree, | ||
2048 | &log->root_key, &log->root_item); | ||
2049 | BUG_ON(ret); | ||
2050 | return ret; | ||
2051 | } | ||
2052 | |||
2053 | /* | ||
2054 | * If both a file and directory are logged, and unlinks or renames are | ||
2055 | * mixed in, we have a few interesting corners: | ||
2056 | * | ||
2057 | * create file X in dir Y | ||
2058 | * link file X to X.link in dir Y | ||
2059 | * fsync file X | ||
2060 | * unlink file X but leave X.link | ||
2061 | * fsync dir Y | ||
2062 | * | ||
2063 | * After a crash we would expect only X.link to exist. But file X | ||
2064 | * didn't get fsync'd again so the log has back refs for X and X.link. | ||
2065 | * | ||
2066 | * We solve this by removing directory entries and inode backrefs from the | ||
2067 | * log when a file that was logged in the current transaction is | ||
2068 | * unlinked. Any later fsync will include the updated log entries, and | ||
2069 | * we'll be able to reconstruct the proper directory items from backrefs. | ||
2070 | * | ||
2071 | * This optimization allows us to avoid relogging the entire inode | ||
2072 | * or the entire directory. | ||
2073 | */ | ||
2074 | int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, | ||
2075 | struct btrfs_root *root, | ||
2076 | const char *name, int name_len, | ||
2077 | struct inode *dir, u64 index) | ||
2078 | { | ||
2079 | struct btrfs_root *log; | ||
2080 | struct btrfs_dir_item *di; | ||
2081 | struct btrfs_path *path; | ||
2082 | int ret; | ||
2083 | int bytes_del = 0; | ||
2084 | |||
2085 | if (BTRFS_I(dir)->logged_trans < trans->transid) | ||
2086 | return 0; | ||
2087 | |||
2088 | ret = join_running_log_trans(root); | ||
2089 | if (ret) | ||
2090 | return 0; | ||
2091 | |||
2092 | mutex_lock(&BTRFS_I(dir)->log_mutex); | ||
2093 | |||
2094 | log = root->log_root; | ||
2095 | path = btrfs_alloc_path(); | ||
2096 | di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, | ||
2097 | name, name_len, -1); | ||
2098 | if (di && !IS_ERR(di)) { | ||
2099 | ret = btrfs_delete_one_dir_name(trans, log, path, di); | ||
2100 | bytes_del += name_len; | ||
2101 | BUG_ON(ret); | ||
2102 | } | ||
2103 | btrfs_release_path(log, path); | ||
2104 | di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, | ||
2105 | index, name, name_len, -1); | ||
2106 | if (di && !IS_ERR(di)) { | ||
2107 | ret = btrfs_delete_one_dir_name(trans, log, path, di); | ||
2108 | bytes_del += name_len; | ||
2109 | BUG_ON(ret); | ||
2110 | } | ||
2111 | |||
2112 | /* update the directory size in the log to reflect the names | ||
2113 | * we have removed | ||
2114 | */ | ||
2115 | if (bytes_del) { | ||
2116 | struct btrfs_key key; | ||
2117 | |||
2118 | key.objectid = dir->i_ino; | ||
2119 | key.offset = 0; | ||
2120 | key.type = BTRFS_INODE_ITEM_KEY; | ||
2121 | btrfs_release_path(log, path); | ||
2122 | |||
2123 | ret = btrfs_search_slot(trans, log, &key, path, 0, 1); | ||
2124 | if (ret == 0) { | ||
2125 | struct btrfs_inode_item *item; | ||
2126 | u64 i_size; | ||
2127 | |||
2128 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
2129 | struct btrfs_inode_item); | ||
2130 | i_size = btrfs_inode_size(path->nodes[0], item); | ||
2131 | if (i_size > bytes_del) | ||
2132 | i_size -= bytes_del; | ||
2133 | else | ||
2134 | i_size = 0; | ||
2135 | btrfs_set_inode_size(path->nodes[0], item, i_size); | ||
2136 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
2137 | } else | ||
2138 | ret = 0; | ||
2139 | btrfs_release_path(log, path); | ||
2140 | } | ||
2141 | |||
2142 | btrfs_free_path(path); | ||
2143 | mutex_unlock(&BTRFS_I(dir)->log_mutex); | ||
2144 | end_log_trans(root); | ||
2145 | |||
2146 | return 0; | ||
2147 | } | ||
2148 | |||
2149 | /* see comments for btrfs_del_dir_entries_in_log */ | ||
2150 | int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, | ||
2151 | struct btrfs_root *root, | ||
2152 | const char *name, int name_len, | ||
2153 | struct inode *inode, u64 dirid) | ||
2154 | { | ||
2155 | struct btrfs_root *log; | ||
2156 | u64 index; | ||
2157 | int ret; | ||
2158 | |||
2159 | if (BTRFS_I(inode)->logged_trans < trans->transid) | ||
2160 | return 0; | ||
2161 | |||
2162 | ret = join_running_log_trans(root); | ||
2163 | if (ret) | ||
2164 | return 0; | ||
2165 | log = root->log_root; | ||
2166 | mutex_lock(&BTRFS_I(inode)->log_mutex); | ||
2167 | |||
2168 | ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, | ||
2169 | dirid, &index); | ||
2170 | mutex_unlock(&BTRFS_I(inode)->log_mutex); | ||
2171 | end_log_trans(root); | ||
2172 | |||
2173 | return ret; | ||
2174 | } | ||
2175 | |||
2176 | /* | ||
2177 | * creates a range item in the log for 'dirid'. first_offset and | ||
2178 | * last_offset tell us which parts of the key space the log should | ||
2179 | * be considered authoritative for. | ||
2180 | */ | ||
2181 | static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, | ||
2182 | struct btrfs_root *log, | ||
2183 | struct btrfs_path *path, | ||
2184 | int key_type, u64 dirid, | ||
2185 | u64 first_offset, u64 last_offset) | ||
2186 | { | ||
2187 | int ret; | ||
2188 | struct btrfs_key key; | ||
2189 | struct btrfs_dir_log_item *item; | ||
2190 | |||
2191 | key.objectid = dirid; | ||
2192 | key.offset = first_offset; | ||
2193 | if (key_type == BTRFS_DIR_ITEM_KEY) | ||
2194 | key.type = BTRFS_DIR_LOG_ITEM_KEY; | ||
2195 | else | ||
2196 | key.type = BTRFS_DIR_LOG_INDEX_KEY; | ||
2197 | ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); | ||
2198 | BUG_ON(ret); | ||
2199 | |||
2200 | item = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
2201 | struct btrfs_dir_log_item); | ||
2202 | btrfs_set_dir_log_end(path->nodes[0], item, last_offset); | ||
2203 | btrfs_mark_buffer_dirty(path->nodes[0]); | ||
2204 | btrfs_release_path(log, path); | ||
2205 | return 0; | ||
2206 | } | ||
2207 | |||
2208 | /* | ||
2209 | * log all the items included in the current transaction for a given | ||
2210 | * directory. This also creates the range items in the log tree required | ||
2211 | * to replay anything deleted before the fsync | ||
2212 | */ | ||
2213 | static noinline int log_dir_items(struct btrfs_trans_handle *trans, | ||
2214 | struct btrfs_root *root, struct inode *inode, | ||
2215 | struct btrfs_path *path, | ||
2216 | struct btrfs_path *dst_path, int key_type, | ||
2217 | u64 min_offset, u64 *last_offset_ret) | ||
2218 | { | ||
2219 | struct btrfs_key min_key; | ||
2220 | struct btrfs_key max_key; | ||
2221 | struct btrfs_root *log = root->log_root; | ||
2222 | struct extent_buffer *src; | ||
2223 | int ret; | ||
2224 | int i; | ||
2225 | int nritems; | ||
2226 | u64 first_offset = min_offset; | ||
2227 | u64 last_offset = (u64)-1; | ||
2228 | |||
2230 | max_key.objectid = inode->i_ino; | ||
2231 | max_key.offset = (u64)-1; | ||
2232 | max_key.type = key_type; | ||
2233 | |||
2234 | min_key.objectid = inode->i_ino; | ||
2235 | min_key.type = key_type; | ||
2236 | min_key.offset = min_offset; | ||
2237 | |||
2238 | path->keep_locks = 1; | ||
2239 | |||
2240 | ret = btrfs_search_forward(root, &min_key, &max_key, | ||
2241 | path, 0, trans->transid); | ||
2242 | |||
2243 | /* | ||
2244 | * we didn't find anything from this transaction, see if there | ||
2245 | * is anything at all | ||
2246 | */ | ||
2247 | if (ret != 0 || min_key.objectid != inode->i_ino || | ||
2248 | min_key.type != key_type) { | ||
2249 | min_key.objectid = inode->i_ino; | ||
2250 | min_key.type = key_type; | ||
2251 | min_key.offset = (u64)-1; | ||
2252 | btrfs_release_path(root, path); | ||
2253 | ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); | ||
2254 | if (ret < 0) { | ||
2255 | btrfs_release_path(root, path); | ||
2256 | return ret; | ||
2257 | } | ||
2258 | ret = btrfs_previous_item(root, path, inode->i_ino, key_type); | ||
2259 | |||
2260 | /* if ret == 0 there are items for this type, | ||
2261 | * create a range to tell us the last key of this type. | ||
2262 | * otherwise, there are no items in this directory after | ||
2263 | * *min_offset, and we create a range to indicate that. | ||
2264 | */ | ||
2265 | if (ret == 0) { | ||
2266 | struct btrfs_key tmp; | ||
2267 | btrfs_item_key_to_cpu(path->nodes[0], &tmp, | ||
2268 | path->slots[0]); | ||
2269 | if (key_type == tmp.type) | ||
2270 | first_offset = max(min_offset, tmp.offset) + 1; | ||
2271 | } | ||
2272 | goto done; | ||
2273 | } | ||
2274 | |||
2275 | /* go backward to find any previous key */ | ||
2276 | ret = btrfs_previous_item(root, path, inode->i_ino, key_type); | ||
2277 | if (ret == 0) { | ||
2278 | struct btrfs_key tmp; | ||
2279 | btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); | ||
2280 | if (key_type == tmp.type) { | ||
2281 | first_offset = tmp.offset; | ||
2282 | ret = overwrite_item(trans, log, dst_path, | ||
2283 | path->nodes[0], path->slots[0], | ||
2284 | &tmp); | ||
2285 | } | ||
2286 | } | ||
2287 | btrfs_release_path(root, path); | ||
2288 | |||
2289 | /* find the first key from this transaction again */ | ||
2290 | ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); | ||
2291 | if (ret != 0) { | ||
2292 | WARN_ON(1); | ||
2293 | goto done; | ||
2294 | } | ||
2295 | |||
2296 | /* | ||
2297 | * we have a block from this transaction, log every item in it | ||
2298 | * from our directory | ||
2299 | */ | ||
2300 | while (1) { | ||
2301 | struct btrfs_key tmp; | ||
2302 | src = path->nodes[0]; | ||
2303 | nritems = btrfs_header_nritems(src); | ||
2304 | for (i = path->slots[0]; i < nritems; i++) { | ||
2305 | btrfs_item_key_to_cpu(src, &min_key, i); | ||
2306 | |||
2307 | if (min_key.objectid != inode->i_ino || | ||
2308 | min_key.type != key_type) | ||
2309 | goto done; | ||
2310 | ret = overwrite_item(trans, log, dst_path, src, i, | ||
2311 | &min_key); | ||
2312 | BUG_ON(ret); | ||
2313 | } | ||
2314 | path->slots[0] = nritems; | ||
2315 | |||
2316 | /* | ||
2317 | * look ahead to the next item and see if it is also | ||
2318 | * from this directory and from this transaction | ||
2319 | */ | ||
2320 | ret = btrfs_next_leaf(root, path); | ||
2321 | if (ret == 1) { | ||
2322 | last_offset = (u64)-1; | ||
2323 | goto done; | ||
2324 | } | ||
2325 | btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); | ||
2326 | if (tmp.objectid != inode->i_ino || tmp.type != key_type) { | ||
2327 | last_offset = (u64)-1; | ||
2328 | goto done; | ||
2329 | } | ||
2330 | if (btrfs_header_generation(path->nodes[0]) != trans->transid) { | ||
2331 | ret = overwrite_item(trans, log, dst_path, | ||
2332 | path->nodes[0], path->slots[0], | ||
2333 | &tmp); | ||
2334 | |||
2335 | BUG_ON(ret); | ||
2336 | last_offset = tmp.offset; | ||
2337 | goto done; | ||
2338 | } | ||
2339 | } | ||
2340 | done: | ||
2341 | *last_offset_ret = last_offset; | ||
2342 | btrfs_release_path(root, path); | ||
2343 | btrfs_release_path(log, dst_path); | ||
2344 | |||
2345 | /* insert the log range keys to indicate where the log is valid */ | ||
2346 | ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, | ||
2347 | first_offset, last_offset); | ||
2348 | BUG_ON(ret); | ||
2349 | return 0; | ||
2350 | } | ||
2351 | |||
2352 | /* | ||
2353 | * logging directories is very similar to logging inodes. We find all the items | ||
2354 | * from the current transaction and write them to the log. | ||
2355 | * | ||
2356 | * The recovery code scans the directory in the subvolume, and if it finds a | ||
2357 | * key in the range logged that is not present in the log tree, then it means | ||
2358 | * that dir entry was unlinked during the transaction. | ||
2359 | * | ||
2360 | * In order for that scan to work, we must include one key smaller than | ||
2361 | * the smallest logged by this transaction and one key larger than the largest | ||
2362 | * key logged by this transaction. | ||
2363 | */ | ||
2364 | static noinline int log_directory_changes(struct btrfs_trans_handle *trans, | ||
2365 | struct btrfs_root *root, struct inode *inode, | ||
2366 | struct btrfs_path *path, | ||
2367 | struct btrfs_path *dst_path) | ||
2368 | { | ||
2369 | u64 min_key; | ||
2370 | u64 max_key; | ||
2371 | int ret; | ||
2372 | int key_type = BTRFS_DIR_ITEM_KEY; | ||
2373 | |||
2374 | again: | ||
2375 | min_key = 0; | ||
2376 | max_key = 0; | ||
2377 | while (1) { | ||
2378 | ret = log_dir_items(trans, root, inode, path, | ||
2379 | dst_path, key_type, min_key, | ||
2380 | &max_key); | ||
2381 | BUG_ON(ret); | ||
2382 | if (max_key == (u64)-1) | ||
2383 | break; | ||
2384 | min_key = max_key + 1; | ||
2385 | } | ||
2386 | |||
2387 | if (key_type == BTRFS_DIR_ITEM_KEY) { | ||
2388 | key_type = BTRFS_DIR_INDEX_KEY; | ||
2389 | goto again; | ||
2390 | } | ||
2391 | return 0; | ||
2392 | } | ||
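
/*
 * Illustrative sketch (not kernel code): the loop above sweeps the
 * whole u64 offset space in chunks, restarting one past the previous
 * chunk's end and stopping once a chunk reports (u64)-1.  The control
 * shape in isolation, with a hypothetical per-chunk worker:
 */
#include <stdint.h>

/* processes keys >= min and stores the last offset covered in *max;
 * UINT64_MAX in *max means the whole remaining space was covered */
typedef int (*chunk_fn)(void *ctx, uint64_t min, uint64_t *max);

static int sweep_key_space(chunk_fn fn, void *ctx)
{
	uint64_t min = 0;
	uint64_t max = 0;

	while (1) {
		int ret = fn(ctx, min, &max);

		if (ret)
			return ret;
		if (max == UINT64_MAX)
			return 0;	/* reached the end of the space */
		min = max + 1;		/* resume just past this chunk */
	}
}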
2393 | |||
2394 | /* | ||
2395 | * a helper function to drop items from the log before we relog an | ||
2396 | * inode. max_key_type indicates the highest item type to remove. | ||
2397 | * This cannot be run for file data extents because it does not | ||
2398 | * free the extents they point to. | ||
2399 | */ | ||
2400 | static int drop_objectid_items(struct btrfs_trans_handle *trans, | ||
2401 | struct btrfs_root *log, | ||
2402 | struct btrfs_path *path, | ||
2403 | u64 objectid, int max_key_type) | ||
2404 | { | ||
2405 | int ret; | ||
2406 | struct btrfs_key key; | ||
2407 | struct btrfs_key found_key; | ||
2408 | |||
2409 | key.objectid = objectid; | ||
2410 | key.type = max_key_type; | ||
2411 | key.offset = (u64)-1; | ||
2412 | |||
2413 | while (1) { | ||
2414 | ret = btrfs_search_slot(trans, log, &key, path, -1, 1); | ||
2415 | |||
2416 | if (ret != 1) | ||
2417 | break; | ||
2418 | |||
2419 | if (path->slots[0] == 0) | ||
2420 | break; | ||
2421 | |||
2422 | path->slots[0]--; | ||
2423 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
2424 | path->slots[0]); | ||
2425 | |||
2426 | if (found_key.objectid != objectid) | ||
2427 | break; | ||
2428 | |||
2429 | ret = btrfs_del_item(trans, log, path); | ||
2430 | BUG_ON(ret); | ||
2431 | btrfs_release_path(log, path); | ||
2432 | } | ||
2433 | btrfs_release_path(log, path); | ||
2434 | return 0; | ||
2435 | } | ||
2436 | |||
2437 | static noinline int copy_items(struct btrfs_trans_handle *trans, | ||
2438 | struct btrfs_root *log, | ||
2439 | struct btrfs_path *dst_path, | ||
2440 | struct extent_buffer *src, | ||
2441 | int start_slot, int nr, int inode_only) | ||
2442 | { | ||
2443 | unsigned long src_offset; | ||
2444 | unsigned long dst_offset; | ||
2445 | struct btrfs_file_extent_item *extent; | ||
2446 | struct btrfs_inode_item *inode_item; | ||
2447 | int ret; | ||
2448 | struct btrfs_key *ins_keys; | ||
2449 | u32 *ins_sizes; | ||
2450 | char *ins_data; | ||
2451 | int i; | ||
2452 | struct list_head ordered_sums; | ||
2453 | |||
2454 | INIT_LIST_HEAD(&ordered_sums); | ||
2455 | |||
2456 | ins_data = kmalloc(nr * sizeof(struct btrfs_key) + | ||
2457 | nr * sizeof(u32), GFP_NOFS); | ||
2458 | ins_sizes = (u32 *)ins_data; | ||
2459 | ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); | ||
2460 | |||
2461 | for (i = 0; i < nr; i++) { | ||
2462 | ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); | ||
2463 | btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); | ||
2464 | } | ||
2465 | ret = btrfs_insert_empty_items(trans, log, dst_path, | ||
2466 | ins_keys, ins_sizes, nr); | ||
2467 | BUG_ON(ret); | ||
2468 | |||
2469 | for (i = 0; i < nr; i++) { | ||
2470 | dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], | ||
2471 | dst_path->slots[0]); | ||
2472 | |||
2473 | src_offset = btrfs_item_ptr_offset(src, start_slot + i); | ||
2474 | |||
2475 | copy_extent_buffer(dst_path->nodes[0], src, dst_offset, | ||
2476 | src_offset, ins_sizes[i]); | ||
2477 | |||
2478 | if (inode_only == LOG_INODE_EXISTS && | ||
2479 | ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { | ||
2480 | inode_item = btrfs_item_ptr(dst_path->nodes[0], | ||
2481 | dst_path->slots[0], | ||
2482 | struct btrfs_inode_item); | ||
2483 | btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); | ||
2484 | |||
2485 | /* set the generation to zero so the recovery code | ||
2486 | * can tell the difference between a logging | ||
2487 | * just to say 'this inode exists' and a logging | ||
2488 | * to say 'update this inode with these values' | ||
2489 | */ | ||
2490 | btrfs_set_inode_generation(dst_path->nodes[0], | ||
2491 | inode_item, 0); | ||
2492 | } | ||
2493 | /* take a reference on file data extents so that truncates | ||
2494 | * or deletes of this inode don't have to relog the inode | ||
2495 | * again | ||
2496 | */ | ||
2497 | if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { | ||
2498 | int found_type; | ||
2499 | extent = btrfs_item_ptr(src, start_slot + i, | ||
2500 | struct btrfs_file_extent_item); | ||
2501 | |||
2502 | found_type = btrfs_file_extent_type(src, extent); | ||
2503 | if (found_type == BTRFS_FILE_EXTENT_REG || | ||
2504 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | ||
2505 | u64 ds = btrfs_file_extent_disk_bytenr(src, | ||
2506 | extent); | ||
2507 | u64 dl = btrfs_file_extent_disk_num_bytes(src, | ||
2508 | extent); | ||
2509 | u64 cs = btrfs_file_extent_offset(src, extent); | ||
2510 | u64 cl = btrfs_file_extent_num_bytes(src, | ||
2511 | extent); | ||
2512 | if (btrfs_file_extent_compression(src, | ||
2513 | extent)) { | ||
2514 | cs = 0; | ||
2515 | cl = dl; | ||
2516 | } | ||
2517 | /* ds == 0 is a hole */ | ||
2518 | if (ds != 0) { | ||
2519 | ret = btrfs_inc_extent_ref(trans, log, | ||
2520 | ds, dl, | ||
2521 | dst_path->nodes[0]->start, | ||
2522 | BTRFS_TREE_LOG_OBJECTID, | ||
2523 | trans->transid, | ||
2524 | ins_keys[i].objectid); | ||
2525 | BUG_ON(ret); | ||
2526 | ret = btrfs_lookup_csums_range( | ||
2527 | log->fs_info->csum_root, | ||
2528 | ds + cs, ds + cs + cl - 1, | ||
2529 | &ordered_sums); | ||
2530 | BUG_ON(ret); | ||
2531 | } | ||
2532 | } | ||
2533 | } | ||
2534 | dst_path->slots[0]++; | ||
2535 | } | ||
2536 | |||
2537 | btrfs_mark_buffer_dirty(dst_path->nodes[0]); | ||
2538 | btrfs_release_path(log, dst_path); | ||
2539 | kfree(ins_data); | ||
2540 | |||
2541 | /* | ||
2542 | * we have to do this after the loop above to avoid modifying the | ||
2543 | * log tree while we still hold a path into it. | ||
2544 | */ | ||
2545 | while (!list_empty(&ordered_sums)) { | ||
2546 | struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, | ||
2547 | struct btrfs_ordered_sum, | ||
2548 | list); | ||
2549 | ret = btrfs_csum_file_blocks(trans, log, sums); | ||
2550 | BUG_ON(ret); | ||
2551 | list_del(&sums->list); | ||
2552 | kfree(sums); | ||
2553 | } | ||
2554 | return 0; | ||
2555 | } | ||
2556 | |||
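copy_items() packs two arrays, nr u32 item sizes followed by nr keys, into a single allocation so one kmalloc()/kfree() pair covers both. The layout trick in isolation, with plain malloc() standing in for kmalloc() (nr chosen even so the keys land 8-byte aligned):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

    int main(void)
    {
        int nr = 4, i;
        uint32_t *ins_sizes;
        struct key *ins_keys;

        /* one buffer: nr sizes first, the nr keys right behind them */
        char *ins_data = malloc(nr * sizeof(struct key) +
                                nr * sizeof(uint32_t));
        if (!ins_data)
            return 1;
        ins_sizes = (uint32_t *)ins_data;
        ins_keys = (struct key *)(ins_data + nr * sizeof(uint32_t));

        for (i = 0; i < nr; i++) {
            ins_sizes[i] = 16 * (i + 1);   /* pretend item sizes */
            ins_keys[i].objectid = 100 + i;
            ins_keys[i].type = 108;        /* EXTENT_DATA, say */
            ins_keys[i].offset = 0;
        }
        for (i = 0; i < nr; i++)
            printf("item %llu: %u bytes\n",
                   (unsigned long long)ins_keys[i].objectid, ins_sizes[i]);

        free(ins_data);    /* one free releases both arrays */
        return 0;
    }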
2557 | /* log a single inode in the tree log. | ||
2558 | * At least one parent directory for this inode must exist in the tree | ||
2559 | * or be logged already. | ||
2560 | * | ||
2561 | * Any items from this inode changed by the current transaction are copied | ||
2562 | * to the log tree. An extra reference is taken on any extents in this | ||
2563 | * file, allowing us to avoid a whole pile of corner cases around logging | ||
2564 | * blocks that have been removed from the tree. | ||
2565 | * | ||
2566 | * See LOG_INODE_ALL and related defines for a description of what inode_only | ||
2567 | * does. | ||
2568 | * | ||
2569 | * This handles both files and directories. | ||
2570 | */ | ||
2571 | static int __btrfs_log_inode(struct btrfs_trans_handle *trans, | ||
2572 | struct btrfs_root *root, struct inode *inode, | ||
2573 | int inode_only) | ||
2574 | { | ||
2575 | struct btrfs_path *path; | ||
2576 | struct btrfs_path *dst_path; | ||
2577 | struct btrfs_key min_key; | ||
2578 | struct btrfs_key max_key; | ||
2579 | struct btrfs_root *log; | ||
2580 | struct extent_buffer *src = NULL; | ||
2581 | u32 size; | ||
2582 | int ret; | ||
2583 | int nritems; | ||
2584 | int ins_start_slot = 0; | ||
2585 | int ins_nr; | ||
2586 | |||
2587 | log = root->log_root; | ||
2588 | |||
2589 | path = btrfs_alloc_path(); | ||
2590 | dst_path = btrfs_alloc_path(); | ||
2591 | |||
2592 | min_key.objectid = inode->i_ino; | ||
2593 | min_key.type = BTRFS_INODE_ITEM_KEY; | ||
2594 | min_key.offset = 0; | ||
2595 | |||
2596 | max_key.objectid = inode->i_ino; | ||
2597 | if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) | ||
2598 | max_key.type = BTRFS_XATTR_ITEM_KEY; | ||
2599 | else | ||
2600 | max_key.type = (u8)-1; | ||
2601 | max_key.offset = (u64)-1; | ||
2602 | |||
2603 | /* | ||
2604 | * if this inode has already been logged and we're in inode_only | ||
2605 | * mode, we don't want to delete the things that have already | ||
2606 | * been written to the log. | ||
2607 | * | ||
2608 | * But, if the inode has been through an inode_only log, | ||
2609 | * the logged_trans field is not set. This allows us to catch | ||
2610 | * any new names for this inode in the backrefs by logging it | ||
2611 | * again | ||
2612 | */ | ||
2613 | if (inode_only == LOG_INODE_EXISTS && | ||
2614 | BTRFS_I(inode)->logged_trans == trans->transid) { | ||
2615 | btrfs_free_path(path); | ||
2616 | btrfs_free_path(dst_path); | ||
2617 | goto out; | ||
2618 | } | ||
2619 | mutex_lock(&BTRFS_I(inode)->log_mutex); | ||
2620 | |||
2621 | /* | ||
2622 | * a brute-force approach to making sure we get the most up-to-date | ||
2623 | * copies of everything. | ||
2624 | */ | ||
2625 | if (S_ISDIR(inode->i_mode)) { | ||
2626 | int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; | ||
2627 | |||
2628 | if (inode_only == LOG_INODE_EXISTS) | ||
2629 | max_key_type = BTRFS_XATTR_ITEM_KEY; | ||
2630 | ret = drop_objectid_items(trans, log, path, | ||
2631 | inode->i_ino, max_key_type); | ||
2632 | } else { | ||
2633 | ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); | ||
2634 | } | ||
2635 | BUG_ON(ret); | ||
2636 | path->keep_locks = 1; | ||
2637 | |||
2638 | while (1) { | ||
2639 | ins_nr = 0; | ||
2640 | ret = btrfs_search_forward(root, &min_key, &max_key, | ||
2641 | path, 0, trans->transid); | ||
2642 | if (ret != 0) | ||
2643 | break; | ||
2644 | again: | ||
2645 | /* note, ins_nr might be > 0 here, cleanup outside the loop */ | ||
2646 | if (min_key.objectid != inode->i_ino) | ||
2647 | break; | ||
2648 | if (min_key.type > max_key.type) | ||
2649 | break; | ||
2650 | |||
2651 | src = path->nodes[0]; | ||
2652 | size = btrfs_item_size_nr(src, path->slots[0]); | ||
2653 | if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { | ||
2654 | ins_nr++; | ||
2655 | goto next_slot; | ||
2656 | } else if (!ins_nr) { | ||
2657 | ins_start_slot = path->slots[0]; | ||
2658 | ins_nr = 1; | ||
2659 | goto next_slot; | ||
2660 | } | ||
2661 | |||
2662 | ret = copy_items(trans, log, dst_path, src, ins_start_slot, | ||
2663 | ins_nr, inode_only); | ||
2664 | BUG_ON(ret); | ||
2665 | ins_nr = 1; | ||
2666 | ins_start_slot = path->slots[0]; | ||
2667 | next_slot: | ||
2668 | |||
2669 | nritems = btrfs_header_nritems(path->nodes[0]); | ||
2670 | path->slots[0]++; | ||
2671 | if (path->slots[0] < nritems) { | ||
2672 | btrfs_item_key_to_cpu(path->nodes[0], &min_key, | ||
2673 | path->slots[0]); | ||
2674 | goto again; | ||
2675 | } | ||
2676 | if (ins_nr) { | ||
2677 | ret = copy_items(trans, log, dst_path, src, | ||
2678 | ins_start_slot, | ||
2679 | ins_nr, inode_only); | ||
2680 | BUG_ON(ret); | ||
2681 | ins_nr = 0; | ||
2682 | } | ||
2683 | btrfs_release_path(root, path); | ||
2684 | |||
2685 | if (min_key.offset < (u64)-1) | ||
2686 | min_key.offset++; | ||
2687 | else if (min_key.type < (u8)-1) | ||
2688 | min_key.type++; | ||
2689 | else if (min_key.objectid < (u64)-1) | ||
2690 | min_key.objectid++; | ||
2691 | else | ||
2692 | break; | ||
2693 | } | ||
2694 | if (ins_nr) { | ||
2695 | ret = copy_items(trans, log, dst_path, src, | ||
2696 | ins_start_slot, | ||
2697 | ins_nr, inode_only); | ||
2698 | BUG_ON(ret); | ||
2699 | ins_nr = 0; | ||
2700 | } | ||
2701 | WARN_ON(ins_nr); | ||
2702 | if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { | ||
2703 | btrfs_release_path(root, path); | ||
2704 | btrfs_release_path(log, dst_path); | ||
2705 | BTRFS_I(inode)->log_dirty_trans = 0; | ||
2706 | ret = log_directory_changes(trans, root, inode, path, dst_path); | ||
2707 | BUG_ON(ret); | ||
2708 | } | ||
2709 | BTRFS_I(inode)->logged_trans = trans->transid; | ||
2710 | mutex_unlock(&BTRFS_I(inode)->log_mutex); | ||
2711 | |||
2712 | btrfs_free_path(path); | ||
2713 | btrfs_free_path(dst_path); | ||
2714 | |||
2715 | mutex_lock(&root->fs_info->tree_log_mutex); | ||
2716 | ret = update_log_root(trans, log); | ||
2717 | BUG_ON(ret); | ||
2718 | mutex_unlock(&root->fs_info->tree_log_mutex); | ||
2719 | out: | ||
2720 | return 0; | ||
2721 | } | ||
2722 | |||
2723 | int btrfs_log_inode(struct btrfs_trans_handle *trans, | ||
2724 | struct btrfs_root *root, struct inode *inode, | ||
2725 | int inode_only) | ||
2726 | { | ||
2727 | int ret; | ||
2728 | |||
2729 | start_log_trans(trans, root); | ||
2730 | ret = __btrfs_log_inode(trans, root, inode, inode_only); | ||
2731 | end_log_trans(root); | ||
2732 | return ret; | ||
2733 | } | ||
2734 | |||
2735 | /* | ||
2736 | * helper function around btrfs_log_inode to make sure newly created | ||
2737 | * parent directories also end up in the log. A minimal inode and backref | ||
2738 | * only logging is done of any parent directories that are older than | ||
2739 | * the last committed transaction | ||
2740 | */ | ||
2741 | int btrfs_log_dentry(struct btrfs_trans_handle *trans, | ||
2742 | struct btrfs_root *root, struct dentry *dentry) | ||
2743 | { | ||
2744 | int inode_only = LOG_INODE_ALL; | ||
2745 | struct super_block *sb; | ||
2746 | int ret; | ||
2747 | |||
2748 | start_log_trans(trans, root); | ||
2749 | sb = dentry->d_inode->i_sb; | ||
2750 | while (1) { | ||
2751 | ret = __btrfs_log_inode(trans, root, dentry->d_inode, | ||
2752 | inode_only); | ||
2753 | BUG_ON(ret); | ||
2754 | inode_only = LOG_INODE_EXISTS; | ||
2755 | |||
2756 | dentry = dentry->d_parent; | ||
2757 | if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) | ||
2758 | break; | ||
2759 | |||
2760 | if (BTRFS_I(dentry->d_inode)->generation <= | ||
2761 | root->fs_info->last_trans_committed) | ||
2762 | break; | ||
2763 | } | ||
2764 | end_log_trans(root); | ||
2765 | return 0; | ||
2766 | } | ||
2767 | |||
2768 | /* | ||
2769 | * it is not safe to log a dentry if the chunk root has added new | ||
2770 | * chunks. This returns 0 if the dentry was logged, and 1 otherwise. | ||
2771 | * If this returns 1, you must commit the transaction to safely get your | ||
2772 | * data on disk. | ||
2773 | */ | ||
2774 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, | ||
2775 | struct btrfs_root *root, struct dentry *dentry) | ||
2776 | { | ||
2777 | u64 gen; | ||
2778 | gen = root->fs_info->last_trans_new_blockgroup; | ||
2779 | if (gen > root->fs_info->last_trans_committed) | ||
2780 | return 1; | ||
2781 | else | ||
2782 | return btrfs_log_dentry(trans, root, dentry); | ||
2783 | } | ||
2784 | |||
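Callers are expected to treat a return of 1 from btrfs_log_dentry_safe() as "fall back to a full transaction commit". A hedged user-space sketch of that calling convention (every name here is a stand-in; only the 0/1 convention comes from the code below):

    #include <stdio.h>

    /* stand-ins for the real helpers; the return convention mirrors
     * btrfs_log_dentry_safe(): 0 = logged, 1 = must commit */
    static int log_dentry_safe(int new_chunks_since_commit)
    {
        return new_chunks_since_commit ? 1 : 0;
    }

    static void commit_transaction(void) { puts("full commit"); }
    static void sync_log(void)           { puts("log-only sync"); }

    int main(void)
    {
        int ret = log_dentry_safe(/* new_chunks_since_commit */ 1);

        if (ret > 0)
            commit_transaction();   /* only safe way to persist */
        else
            sync_log();             /* cheap path: flush just the log */
        return 0;
    }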
2785 | /* | ||
2786 | * should be called during mount to recover and replay any log trees | ||
2787 | * from the FS | ||
2788 | */ | ||
2789 | int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) | ||
2790 | { | ||
2791 | int ret; | ||
2792 | struct btrfs_path *path; | ||
2793 | struct btrfs_trans_handle *trans; | ||
2794 | struct btrfs_key key; | ||
2795 | struct btrfs_key found_key; | ||
2796 | struct btrfs_key tmp_key; | ||
2797 | struct btrfs_root *log; | ||
2798 | struct btrfs_fs_info *fs_info = log_root_tree->fs_info; | ||
2799 | u64 highest_inode; | ||
2800 | struct walk_control wc = { | ||
2801 | .process_func = process_one_buffer, | ||
2802 | .stage = 0, | ||
2803 | }; | ||
2804 | |||
2805 | fs_info->log_root_recovering = 1; | ||
2806 | path = btrfs_alloc_path(); | ||
2807 | BUG_ON(!path); | ||
2808 | |||
2809 | trans = btrfs_start_transaction(fs_info->tree_root, 1); | ||
2810 | |||
2811 | wc.trans = trans; | ||
2812 | wc.pin = 1; | ||
2813 | |||
2814 | walk_log_tree(trans, log_root_tree, &wc); | ||
2815 | |||
2816 | again: | ||
2817 | key.objectid = BTRFS_TREE_LOG_OBJECTID; | ||
2818 | key.offset = (u64)-1; | ||
2819 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | ||
2820 | |||
2821 | while (1) { | ||
2822 | ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); | ||
2823 | if (ret < 0) | ||
2824 | break; | ||
2825 | if (ret > 0) { | ||
2826 | if (path->slots[0] == 0) | ||
2827 | break; | ||
2828 | path->slots[0]--; | ||
2829 | } | ||
2830 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
2831 | path->slots[0]); | ||
2832 | btrfs_release_path(log_root_tree, path); | ||
2833 | if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) | ||
2834 | break; | ||
2835 | |||
2836 | log = btrfs_read_fs_root_no_radix(log_root_tree, | ||
2837 | &found_key); | ||
2838 | BUG_ON(!log); | ||
2839 | |||
2840 | |||
2841 | tmp_key.objectid = found_key.offset; | ||
2842 | tmp_key.type = BTRFS_ROOT_ITEM_KEY; | ||
2843 | tmp_key.offset = (u64)-1; | ||
2844 | |||
2845 | wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); | ||
2846 | BUG_ON(!wc.replay_dest); | ||
2847 | |||
2848 | wc.replay_dest->log_root = log; | ||
2849 | btrfs_record_root_in_trans(wc.replay_dest); | ||
2850 | ret = walk_log_tree(trans, log, &wc); | ||
2851 | BUG_ON(ret); | ||
2852 | |||
2853 | if (wc.stage == LOG_WALK_REPLAY_ALL) { | ||
2854 | ret = fixup_inode_link_counts(trans, wc.replay_dest, | ||
2855 | path); | ||
2856 | BUG_ON(ret); | ||
2857 | } | ||
2858 | ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); | ||
2859 | if (ret == 0) { | ||
2860 | wc.replay_dest->highest_inode = highest_inode; | ||
2861 | wc.replay_dest->last_inode_alloc = highest_inode; | ||
2862 | } | ||
2863 | |||
2864 | key.offset = found_key.offset - 1; | ||
2865 | wc.replay_dest->log_root = NULL; | ||
2866 | free_extent_buffer(log->node); | ||
2867 | kfree(log); | ||
2868 | |||
2869 | if (found_key.offset == 0) | ||
2870 | break; | ||
2871 | } | ||
2872 | btrfs_release_path(log_root_tree, path); | ||
2873 | |||
2874 | /* step one is to pin it all, step two is to replay just inodes */ | ||
2875 | if (wc.pin) { | ||
2876 | wc.pin = 0; | ||
2877 | wc.process_func = replay_one_buffer; | ||
2878 | wc.stage = LOG_WALK_REPLAY_INODES; | ||
2879 | goto again; | ||
2880 | } | ||
2881 | /* step three is to replay everything */ | ||
2882 | if (wc.stage < LOG_WALK_REPLAY_ALL) { | ||
2883 | wc.stage++; | ||
2884 | goto again; | ||
2885 | } | ||
2886 | |||
2887 | btrfs_free_path(path); | ||
2888 | |||
2889 | free_extent_buffer(log_root_tree->node); | ||
2890 | log_root_tree->log_root = NULL; | ||
2891 | fs_info->log_root_recovering = 0; | ||
2892 | |||
2893 | /* step 4: commit the transaction, which also unpins the blocks */ | ||
2894 | btrfs_commit_transaction(trans, fs_info->tree_root); | ||
2895 | |||
2896 | kfree(log_root_tree); | ||
2897 | return 0; | ||
2898 | } | ||
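Recovery walks every log tree once per stage: the first stage pins all referenced blocks, the next replays inodes, the last replays everything else, and only then is the transaction committed, which also unpins the blocks. The control flow of btrfs_recover_log_trees(), reduced to a user-space skeleton (stage names are illustrative):

    #include <stdio.h>

    enum { WALK_PIN, WALK_REPLAY_INODES, WALK_REPLAY_ALL };

    static void walk_all_log_trees(int stage)
    {
        printf("walking every log tree, stage %d\n", stage);
    }

    int main(void)
    {
        int stage = WALK_PIN;

        /* same goto-again shape as btrfs_recover_log_trees() */
        while (stage <= WALK_REPLAY_ALL) {
            walk_all_log_trees(stage);
            stage++;
        }
        puts("commit transaction (this also unpins the blocks)");
        return 0;
    }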
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 000000000000..b9409b32ed02 --- /dev/null +++ b/fs/btrfs/tree-log.h | |||
@@ -0,0 +1,41 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __TREE_LOG_ | ||
20 | #define __TREE_LOG_ | ||
21 | |||
22 | int btrfs_sync_log(struct btrfs_trans_handle *trans, | ||
23 | struct btrfs_root *root); | ||
24 | int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); | ||
25 | int btrfs_log_dentry(struct btrfs_trans_handle *trans, | ||
26 | struct btrfs_root *root, struct dentry *dentry); | ||
27 | int btrfs_recover_log_trees(struct btrfs_root *tree_root); | ||
28 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, | ||
29 | struct btrfs_root *root, struct dentry *dentry); | ||
30 | int btrfs_log_inode(struct btrfs_trans_handle *trans, | ||
31 | struct btrfs_root *root, struct inode *inode, | ||
32 | int inode_only); | ||
33 | int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, | ||
34 | struct btrfs_root *root, | ||
35 | const char *name, int name_len, | ||
36 | struct inode *dir, u64 index); | ||
37 | int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, | ||
38 | struct btrfs_root *root, | ||
39 | const char *name, int name_len, | ||
40 | struct inode *inode, u64 dirid); | ||
41 | #endif | ||
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 000000000000..9bf3946d5ef2 --- /dev/null +++ b/fs/btrfs/version.h | |||
@@ -0,0 +1,4 @@ | |||
1 | #ifndef __BTRFS_VERSION_H | ||
2 | #define __BTRFS_VERSION_H | ||
3 | #define BTRFS_BUILD_VERSION "Btrfs" | ||
4 | #endif | ||
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh new file mode 100644 index 000000000000..1ca1952fd917 --- /dev/null +++ b/fs/btrfs/version.sh | |||
@@ -0,0 +1,43 @@ | |||
1 | #!/bin/bash | ||
2 | # | ||
3 | # determine-version -- report a useful version for releases | ||
4 | # | ||
5 | # Copyright 2008, Aron Griffis <agriffis@n01se.net> | ||
6 | # Copyright 2008, Oracle | ||
7 | # Released under the GNU GPLv2 | ||
8 | |||
9 | v="v0.16" | ||
10 | |||
11 | which git &> /dev/null | ||
12 | if [ $? == 0 ]; then | ||
13 | git branch >& /dev/null | ||
14 | if [ $? == 0 ]; then | ||
15 | if head=`git rev-parse --verify HEAD 2>/dev/null`; then | ||
16 | if tag=`git describe --tags 2>/dev/null`; then | ||
17 | v="$tag" | ||
18 | fi | ||
19 | |||
20 | # Are there uncommitted changes? | ||
21 | git update-index --refresh --unmerged > /dev/null | ||
22 | if git diff-index --name-only HEAD | \ | ||
23 | grep -v "^scripts/package" \ | ||
24 | | read dummy; then | ||
25 | v="$v"-dirty | ||
26 | fi | ||
27 | fi | ||
28 | fi | ||
29 | fi | ||
30 | |||
31 | echo "#ifndef __BUILD_VERSION" > .build-version.h | ||
32 | echo "#define __BUILD_VERSION" >> .build-version.h | ||
33 | echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h | ||
34 | echo "#endif" >> .build-version.h | ||
35 | |||
36 | diff -q version.h .build-version.h >& /dev/null | ||
37 | |||
38 | if [ $? == 0 ]; then | ||
39 | rm .build-version.h | ||
40 | exit 0 | ||
41 | fi | ||
42 | |||
43 | mv .build-version.h version.h | ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 000000000000..b187b537888e --- /dev/null +++ b/fs/btrfs/volumes.c | |||
@@ -0,0 +1,3218 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/bio.h> | ||
20 | #include <linux/buffer_head.h> | ||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/random.h> | ||
23 | #include <linux/version.h> | ||
24 | #include <asm/div64.h> | ||
25 | #include "compat.h" | ||
26 | #include "ctree.h" | ||
27 | #include "extent_map.h" | ||
28 | #include "disk-io.h" | ||
29 | #include "transaction.h" | ||
30 | #include "print-tree.h" | ||
31 | #include "volumes.h" | ||
32 | #include "async-thread.h" | ||
33 | |||
34 | struct map_lookup { | ||
35 | u64 type; | ||
36 | int io_align; | ||
37 | int io_width; | ||
38 | int stripe_len; | ||
39 | int sector_size; | ||
40 | int num_stripes; | ||
41 | int sub_stripes; | ||
42 | struct btrfs_bio_stripe stripes[]; | ||
43 | }; | ||
44 | |||
45 | static int init_first_rw_device(struct btrfs_trans_handle *trans, | ||
46 | struct btrfs_root *root, | ||
47 | struct btrfs_device *device); | ||
48 | static int btrfs_relocate_sys_chunks(struct btrfs_root *root); | ||
49 | |||
50 | #define map_lookup_size(n) (sizeof(struct map_lookup) + \ | ||
51 | (sizeof(struct btrfs_bio_stripe) * (n))) | ||
52 | |||
53 | static DEFINE_MUTEX(uuid_mutex); | ||
54 | static LIST_HEAD(fs_uuids); | ||
55 | |||
56 | void btrfs_lock_volumes(void) | ||
57 | { | ||
58 | mutex_lock(&uuid_mutex); | ||
59 | } | ||
60 | |||
61 | void btrfs_unlock_volumes(void) | ||
62 | { | ||
63 | mutex_unlock(&uuid_mutex); | ||
64 | } | ||
65 | |||
66 | static void lock_chunks(struct btrfs_root *root) | ||
67 | { | ||
68 | mutex_lock(&root->fs_info->chunk_mutex); | ||
69 | } | ||
70 | |||
71 | static void unlock_chunks(struct btrfs_root *root) | ||
72 | { | ||
73 | mutex_unlock(&root->fs_info->chunk_mutex); | ||
74 | } | ||
75 | |||
76 | static void free_fs_devices(struct btrfs_fs_devices *fs_devices) | ||
77 | { | ||
78 | struct btrfs_device *device; | ||
79 | WARN_ON(fs_devices->opened); | ||
80 | while (!list_empty(&fs_devices->devices)) { | ||
81 | device = list_entry(fs_devices->devices.next, | ||
82 | struct btrfs_device, dev_list); | ||
83 | list_del(&device->dev_list); | ||
84 | kfree(device->name); | ||
85 | kfree(device); | ||
86 | } | ||
87 | kfree(fs_devices); | ||
88 | } | ||
89 | |||
90 | int btrfs_cleanup_fs_uuids(void) | ||
91 | { | ||
92 | struct btrfs_fs_devices *fs_devices; | ||
93 | |||
94 | while (!list_empty(&fs_uuids)) { | ||
95 | fs_devices = list_entry(fs_uuids.next, | ||
96 | struct btrfs_fs_devices, list); | ||
97 | list_del(&fs_devices->list); | ||
98 | free_fs_devices(fs_devices); | ||
99 | } | ||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | static noinline struct btrfs_device *__find_device(struct list_head *head, | ||
104 | u64 devid, u8 *uuid) | ||
105 | { | ||
106 | struct btrfs_device *dev; | ||
107 | struct list_head *cur; | ||
108 | |||
109 | list_for_each(cur, head) { | ||
110 | dev = list_entry(cur, struct btrfs_device, dev_list); | ||
111 | if (dev->devid == devid && | ||
112 | (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { | ||
113 | return dev; | ||
114 | } | ||
115 | } | ||
116 | return NULL; | ||
117 | } | ||
118 | |||
119 | static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) | ||
120 | { | ||
121 | struct list_head *cur; | ||
122 | struct btrfs_fs_devices *fs_devices; | ||
123 | |||
124 | list_for_each(cur, &fs_uuids) { | ||
125 | fs_devices = list_entry(cur, struct btrfs_fs_devices, list); | ||
126 | if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) | ||
127 | return fs_devices; | ||
128 | } | ||
129 | return NULL; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * we try to collect pending bios for a device so we don't get a large | ||
134 | * number of procs sending bios down to the same device. This greatly | ||
135 | * improves the scheduler's ability to collect and merge the bios. | ||
136 | * | ||
137 | * But, it also turns into a long list of bios to process and that is sure | ||
138 | * to eventually make the worker thread block. The solution here is to | ||
139 | * make some progress and then put this work struct back at the end of | ||
140 | * the list if the block device is congested. This way, multiple devices | ||
141 | * can make progress from a single worker thread. | ||
142 | */ | ||
143 | static noinline int run_scheduled_bios(struct btrfs_device *device) | ||
144 | { | ||
145 | struct bio *pending; | ||
146 | struct backing_dev_info *bdi; | ||
147 | struct btrfs_fs_info *fs_info; | ||
148 | struct bio *tail; | ||
149 | struct bio *cur; | ||
150 | int again = 0; | ||
151 | unsigned long num_run = 0; | ||
152 | unsigned long limit; | ||
153 | |||
154 | bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; | ||
155 | fs_info = device->dev_root->fs_info; | ||
156 | limit = btrfs_async_submit_limit(fs_info); | ||
157 | limit = limit * 2 / 3; | ||
158 | |||
159 | loop: | ||
160 | spin_lock(&device->io_lock); | ||
161 | |||
162 | /* take all the bios off the list at once and process them | ||
163 | * later on (without the lock held). But, remember the | ||
164 | * tail and other pointers so the bios can be properly reinserted | ||
165 | * into the list if we hit congestion | ||
166 | */ | ||
167 | pending = device->pending_bios; | ||
168 | tail = device->pending_bio_tail; | ||
169 | WARN_ON(pending && !tail); | ||
170 | device->pending_bios = NULL; | ||
171 | device->pending_bio_tail = NULL; | ||
172 | |||
173 | /* | ||
174 | * if pending was null this time around, no bios need processing | ||
175 | * at all and we can stop. Otherwise it'll loop back up again | ||
176 | * and do an additional check so no bios are missed. | ||
177 | * | ||
178 | * device->running_pending is used to synchronize with the | ||
179 | * schedule_bio code. | ||
180 | */ | ||
181 | if (pending) { | ||
182 | again = 1; | ||
183 | device->running_pending = 1; | ||
184 | } else { | ||
185 | again = 0; | ||
186 | device->running_pending = 0; | ||
187 | } | ||
188 | spin_unlock(&device->io_lock); | ||
189 | |||
190 | while (pending) { | ||
191 | cur = pending; | ||
192 | pending = pending->bi_next; | ||
193 | cur->bi_next = NULL; | ||
194 | atomic_dec(&fs_info->nr_async_bios); | ||
195 | |||
196 | if (atomic_read(&fs_info->nr_async_bios) < limit && | ||
197 | waitqueue_active(&fs_info->async_submit_wait)) | ||
198 | wake_up(&fs_info->async_submit_wait); | ||
199 | |||
200 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); | ||
201 | bio_get(cur); | ||
202 | submit_bio(cur->bi_rw, cur); | ||
203 | bio_put(cur); | ||
204 | num_run++; | ||
205 | |||
206 | /* | ||
207 | * we made progress, there is more work to do and the bdi | ||
208 | * is now congested. Back off and let other work structs | ||
209 | * run instead | ||
210 | */ | ||
211 | if (pending && bdi_write_congested(bdi) && | ||
212 | fs_info->fs_devices->open_devices > 1) { | ||
213 | struct bio *old_head; | ||
214 | |||
215 | spin_lock(&device->io_lock); | ||
216 | |||
217 | old_head = device->pending_bios; | ||
218 | device->pending_bios = pending; | ||
219 | if (device->pending_bio_tail) | ||
220 | tail->bi_next = old_head; | ||
221 | else | ||
222 | device->pending_bio_tail = tail; | ||
223 | |||
224 | spin_unlock(&device->io_lock); | ||
225 | btrfs_requeue_work(&device->work); | ||
226 | goto done; | ||
227 | } | ||
228 | } | ||
229 | if (again) | ||
230 | goto loop; | ||
231 | done: | ||
232 | return 0; | ||
233 | } | ||
234 | |||
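On congestion the unprocessed remainder is spliced back in front of any bios queued while the worker ran, so per-device ordering is preserved. The splice itself, on a plain singly linked list standing in for the bio chain (a sketch, not the kernel bio API):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int id; struct node *next; };

    static struct node *mk(int id, struct node *next)
    {
        struct node *n = malloc(sizeof(*n));
        n->id = id;
        n->next = next;
        return n;
    }

    int main(void)
    {
        /* remainder we did not get to: 3 -> 4, tail is 4 */
        struct node *pending = mk(3, mk(4, NULL));
        struct node *tail = pending->next;

        /* bios that arrived while the worker ran: 5 -> 6 */
        struct node *list_head = mk(5, mk(6, NULL));
        struct node *list_tail = list_head->next;

        /* splice: remainder goes back on the front; newcomers keep
         * their place behind it (tail->next = old head) */
        struct node *old_head = list_head;
        list_head = pending;
        if (list_tail)
            tail->next = old_head;
        else
            list_tail = tail;

        for (struct node *n = list_head; n; n = n->next)
            printf("%d ", n->id);   /* prints: 3 4 5 6 */
        putchar('\n');
        return 0;
    }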
235 | static void pending_bios_fn(struct btrfs_work *work) | ||
236 | { | ||
237 | struct btrfs_device *device; | ||
238 | |||
239 | device = container_of(work, struct btrfs_device, work); | ||
240 | run_scheduled_bios(device); | ||
241 | } | ||
242 | |||
243 | static noinline int device_list_add(const char *path, | ||
244 | struct btrfs_super_block *disk_super, | ||
245 | u64 devid, struct btrfs_fs_devices **fs_devices_ret) | ||
246 | { | ||
247 | struct btrfs_device *device; | ||
248 | struct btrfs_fs_devices *fs_devices; | ||
249 | u64 found_transid = btrfs_super_generation(disk_super); | ||
250 | |||
251 | fs_devices = find_fsid(disk_super->fsid); | ||
252 | if (!fs_devices) { | ||
253 | fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); | ||
254 | if (!fs_devices) | ||
255 | return -ENOMEM; | ||
256 | INIT_LIST_HEAD(&fs_devices->devices); | ||
257 | INIT_LIST_HEAD(&fs_devices->alloc_list); | ||
258 | list_add(&fs_devices->list, &fs_uuids); | ||
259 | memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); | ||
260 | fs_devices->latest_devid = devid; | ||
261 | fs_devices->latest_trans = found_transid; | ||
262 | device = NULL; | ||
263 | } else { | ||
264 | device = __find_device(&fs_devices->devices, devid, | ||
265 | disk_super->dev_item.uuid); | ||
266 | } | ||
267 | if (!device) { | ||
268 | if (fs_devices->opened) | ||
269 | return -EBUSY; | ||
270 | |||
271 | device = kzalloc(sizeof(*device), GFP_NOFS); | ||
272 | if (!device) { | ||
273 | /* we can safely leave the fs_devices entry around */ | ||
274 | return -ENOMEM; | ||
275 | } | ||
276 | device->devid = devid; | ||
277 | device->work.func = pending_bios_fn; | ||
278 | memcpy(device->uuid, disk_super->dev_item.uuid, | ||
279 | BTRFS_UUID_SIZE); | ||
280 | device->barriers = 1; | ||
281 | spin_lock_init(&device->io_lock); | ||
282 | device->name = kstrdup(path, GFP_NOFS); | ||
283 | if (!device->name) { | ||
284 | kfree(device); | ||
285 | return -ENOMEM; | ||
286 | } | ||
287 | INIT_LIST_HEAD(&device->dev_alloc_list); | ||
288 | list_add(&device->dev_list, &fs_devices->devices); | ||
289 | device->fs_devices = fs_devices; | ||
290 | fs_devices->num_devices++; | ||
291 | } | ||
292 | |||
293 | if (found_transid > fs_devices->latest_trans) { | ||
294 | fs_devices->latest_devid = devid; | ||
295 | fs_devices->latest_trans = found_transid; | ||
296 | } | ||
297 | *fs_devices_ret = fs_devices; | ||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | ||
302 | { | ||
303 | struct btrfs_fs_devices *fs_devices; | ||
304 | struct btrfs_device *device; | ||
305 | struct btrfs_device *orig_dev; | ||
306 | |||
307 | fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); | ||
308 | if (!fs_devices) | ||
309 | return ERR_PTR(-ENOMEM); | ||
310 | |||
311 | INIT_LIST_HEAD(&fs_devices->devices); | ||
312 | INIT_LIST_HEAD(&fs_devices->alloc_list); | ||
313 | INIT_LIST_HEAD(&fs_devices->list); | ||
314 | fs_devices->latest_devid = orig->latest_devid; | ||
315 | fs_devices->latest_trans = orig->latest_trans; | ||
316 | memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); | ||
317 | |||
318 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { | ||
319 | device = kzalloc(sizeof(*device), GFP_NOFS); | ||
320 | if (!device) | ||
321 | goto error; | ||
322 | |||
323 | device->name = kstrdup(orig_dev->name, GFP_NOFS); | ||
324 | if (!device->name) | ||
325 | goto error; | ||
326 | |||
327 | device->devid = orig_dev->devid; | ||
328 | device->work.func = pending_bios_fn; | ||
329 | memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); | ||
330 | device->barriers = 1; | ||
331 | spin_lock_init(&device->io_lock); | ||
332 | INIT_LIST_HEAD(&device->dev_list); | ||
333 | INIT_LIST_HEAD(&device->dev_alloc_list); | ||
334 | |||
335 | list_add(&device->dev_list, &fs_devices->devices); | ||
336 | device->fs_devices = fs_devices; | ||
337 | fs_devices->num_devices++; | ||
338 | } | ||
339 | return fs_devices; | ||
340 | error: | ||
341 | free_fs_devices(fs_devices); | ||
342 | return ERR_PTR(-ENOMEM); | ||
343 | } | ||
344 | |||
345 | int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) | ||
346 | { | ||
347 | struct list_head *tmp; | ||
348 | struct list_head *cur; | ||
349 | struct btrfs_device *device; | ||
350 | |||
351 | mutex_lock(&uuid_mutex); | ||
352 | again: | ||
353 | list_for_each_safe(cur, tmp, &fs_devices->devices) { | ||
354 | device = list_entry(cur, struct btrfs_device, dev_list); | ||
355 | if (device->in_fs_metadata) | ||
356 | continue; | ||
357 | |||
358 | if (device->bdev) { | ||
359 | close_bdev_exclusive(device->bdev, device->mode); | ||
360 | device->bdev = NULL; | ||
361 | fs_devices->open_devices--; | ||
362 | } | ||
363 | if (device->writeable) { | ||
364 | list_del_init(&device->dev_alloc_list); | ||
365 | device->writeable = 0; | ||
366 | fs_devices->rw_devices--; | ||
367 | } | ||
368 | list_del_init(&device->dev_list); | ||
369 | fs_devices->num_devices--; | ||
370 | kfree(device->name); | ||
371 | kfree(device); | ||
372 | } | ||
373 | |||
374 | if (fs_devices->seed) { | ||
375 | fs_devices = fs_devices->seed; | ||
376 | goto again; | ||
377 | } | ||
378 | |||
379 | mutex_unlock(&uuid_mutex); | ||
380 | return 0; | ||
381 | } | ||
382 | |||
383 | static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | ||
384 | { | ||
385 | struct list_head *cur; | ||
386 | struct btrfs_device *device; | ||
387 | |||
388 | if (--fs_devices->opened > 0) | ||
389 | return 0; | ||
390 | |||
391 | list_for_each(cur, &fs_devices->devices) { | ||
392 | device = list_entry(cur, struct btrfs_device, dev_list); | ||
393 | if (device->bdev) { | ||
394 | close_bdev_exclusive(device->bdev, device->mode); | ||
395 | fs_devices->open_devices--; | ||
396 | } | ||
397 | if (device->writeable) { | ||
398 | list_del_init(&device->dev_alloc_list); | ||
399 | fs_devices->rw_devices--; | ||
400 | } | ||
401 | |||
402 | device->bdev = NULL; | ||
403 | device->writeable = 0; | ||
404 | device->in_fs_metadata = 0; | ||
405 | } | ||
406 | WARN_ON(fs_devices->open_devices); | ||
407 | WARN_ON(fs_devices->rw_devices); | ||
408 | fs_devices->opened = 0; | ||
409 | fs_devices->seeding = 0; | ||
410 | |||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | ||
415 | { | ||
416 | struct btrfs_fs_devices *seed_devices = NULL; | ||
417 | int ret; | ||
418 | |||
419 | mutex_lock(&uuid_mutex); | ||
420 | ret = __btrfs_close_devices(fs_devices); | ||
421 | if (!fs_devices->opened) { | ||
422 | seed_devices = fs_devices->seed; | ||
423 | fs_devices->seed = NULL; | ||
424 | } | ||
425 | mutex_unlock(&uuid_mutex); | ||
426 | |||
427 | while (seed_devices) { | ||
428 | fs_devices = seed_devices; | ||
429 | seed_devices = fs_devices->seed; | ||
430 | __btrfs_close_devices(fs_devices); | ||
431 | free_fs_devices(fs_devices); | ||
432 | } | ||
433 | return ret; | ||
434 | } | ||
435 | |||
436 | static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | ||
437 | fmode_t flags, void *holder) | ||
438 | { | ||
439 | struct block_device *bdev; | ||
440 | struct list_head *head = &fs_devices->devices; | ||
441 | struct list_head *cur; | ||
442 | struct btrfs_device *device; | ||
443 | struct block_device *latest_bdev = NULL; | ||
444 | struct buffer_head *bh; | ||
445 | struct btrfs_super_block *disk_super; | ||
446 | u64 latest_devid = 0; | ||
447 | u64 latest_transid = 0; | ||
448 | u64 devid; | ||
449 | int seeding = 1; | ||
450 | int ret = 0; | ||
451 | |||
452 | list_for_each(cur, head) { | ||
453 | device = list_entry(cur, struct btrfs_device, dev_list); | ||
454 | if (device->bdev) | ||
455 | continue; | ||
456 | if (!device->name) | ||
457 | continue; | ||
458 | |||
459 | bdev = open_bdev_exclusive(device->name, flags, holder); | ||
460 | if (IS_ERR(bdev)) { | ||
461 | printk(KERN_INFO "open %s failed\n", device->name); | ||
462 | goto error; | ||
463 | } | ||
464 | set_blocksize(bdev, 4096); | ||
465 | |||
466 | bh = btrfs_read_dev_super(bdev); | ||
467 | if (!bh) | ||
468 | goto error_close; | ||
469 | |||
470 | disk_super = (struct btrfs_super_block *)bh->b_data; | ||
471 | devid = le64_to_cpu(disk_super->dev_item.devid); | ||
472 | if (devid != device->devid) | ||
473 | goto error_brelse; | ||
474 | |||
475 | if (memcmp(device->uuid, disk_super->dev_item.uuid, | ||
476 | BTRFS_UUID_SIZE)) | ||
477 | goto error_brelse; | ||
478 | |||
479 | device->generation = btrfs_super_generation(disk_super); | ||
480 | if (!latest_transid || device->generation > latest_transid) { | ||
481 | latest_devid = devid; | ||
482 | latest_transid = device->generation; | ||
483 | latest_bdev = bdev; | ||
484 | } | ||
485 | |||
486 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { | ||
487 | device->writeable = 0; | ||
488 | } else { | ||
489 | device->writeable = !bdev_read_only(bdev); | ||
490 | seeding = 0; | ||
491 | } | ||
492 | |||
493 | device->bdev = bdev; | ||
494 | device->in_fs_metadata = 0; | ||
495 | device->mode = flags; | ||
496 | |||
497 | fs_devices->open_devices++; | ||
498 | if (device->writeable) { | ||
499 | fs_devices->rw_devices++; | ||
500 | list_add(&device->dev_alloc_list, | ||
501 | &fs_devices->alloc_list); | ||
502 | } | ||
503 | continue; | ||
504 | |||
505 | error_brelse: | ||
506 | brelse(bh); | ||
507 | error_close: | ||
508 | close_bdev_exclusive(bdev, FMODE_READ); | ||
509 | error: | ||
510 | continue; | ||
511 | } | ||
512 | if (fs_devices->open_devices == 0) { | ||
513 | ret = -EIO; | ||
514 | goto out; | ||
515 | } | ||
516 | fs_devices->seeding = seeding; | ||
517 | fs_devices->opened = 1; | ||
518 | fs_devices->latest_bdev = latest_bdev; | ||
519 | fs_devices->latest_devid = latest_devid; | ||
520 | fs_devices->latest_trans = latest_transid; | ||
521 | fs_devices->total_rw_bytes = 0; | ||
522 | out: | ||
523 | return ret; | ||
524 | } | ||
525 | |||
526 | int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | ||
527 | fmode_t flags, void *holder) | ||
528 | { | ||
529 | int ret; | ||
530 | |||
531 | mutex_lock(&uuid_mutex); | ||
532 | if (fs_devices->opened) { | ||
533 | fs_devices->opened++; | ||
534 | ret = 0; | ||
535 | } else { | ||
536 | ret = __btrfs_open_devices(fs_devices, flags, holder); | ||
537 | } | ||
538 | mutex_unlock(&uuid_mutex); | ||
539 | return ret; | ||
540 | } | ||
541 | |||
542 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | ||
543 | struct btrfs_fs_devices **fs_devices_ret) | ||
544 | { | ||
545 | struct btrfs_super_block *disk_super; | ||
546 | struct block_device *bdev; | ||
547 | struct buffer_head *bh; | ||
548 | int ret; | ||
549 | u64 devid; | ||
550 | u64 transid; | ||
551 | |||
552 | mutex_lock(&uuid_mutex); | ||
553 | |||
554 | bdev = open_bdev_exclusive(path, flags, holder); | ||
555 | |||
556 | if (IS_ERR(bdev)) { | ||
557 | ret = PTR_ERR(bdev); | ||
558 | goto error; | ||
559 | } | ||
560 | |||
561 | ret = set_blocksize(bdev, 4096); | ||
562 | if (ret) | ||
563 | goto error_close; | ||
564 | bh = btrfs_read_dev_super(bdev); | ||
565 | if (!bh) { | ||
566 | ret = -EIO; | ||
567 | goto error_close; | ||
568 | } | ||
569 | disk_super = (struct btrfs_super_block *)bh->b_data; | ||
570 | devid = le64_to_cpu(disk_super->dev_item.devid); | ||
571 | transid = btrfs_super_generation(disk_super); | ||
572 | if (disk_super->label[0]) { | ||
573 | printk(KERN_INFO "device label %s ", disk_super->label); | ||
574 | } else { | ||
575 | /* FIXME, make a real uuid parser */ | ||
576 | printk(KERN_INFO "device fsid %llx-%llx ", | ||
577 | *(unsigned long long *)disk_super->fsid, | ||
578 | *(unsigned long long *)(disk_super->fsid + 8)); | ||
579 | } | ||
580 | printk(KERN_INFO "devid %llu transid %llu %s\n", | ||
581 | (unsigned long long)devid, (unsigned long long)transid, path); | ||
582 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); | ||
583 | |||
584 | brelse(bh); | ||
585 | error_close: | ||
586 | close_bdev_exclusive(bdev, flags); | ||
587 | error: | ||
588 | mutex_unlock(&uuid_mutex); | ||
589 | return ret; | ||
590 | } | ||
591 | |||
592 | /* | ||
593 | * this uses a pretty simple search; the expectation is that it is | ||
594 | * called very infrequently and that a given device has a small number | ||
595 | * of extents | ||
596 | */ | ||
597 | static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, | ||
598 | struct btrfs_device *device, | ||
599 | u64 num_bytes, u64 *start) | ||
600 | { | ||
601 | struct btrfs_key key; | ||
602 | struct btrfs_root *root = device->dev_root; | ||
603 | struct btrfs_dev_extent *dev_extent = NULL; | ||
604 | struct btrfs_path *path; | ||
605 | u64 hole_size = 0; | ||
606 | u64 last_byte = 0; | ||
607 | u64 search_start = 0; | ||
608 | u64 search_end = device->total_bytes; | ||
609 | int ret; | ||
610 | int slot = 0; | ||
611 | int start_found; | ||
612 | struct extent_buffer *l; | ||
613 | |||
614 | path = btrfs_alloc_path(); | ||
615 | if (!path) | ||
616 | return -ENOMEM; | ||
617 | path->reada = 2; | ||
618 | start_found = 0; | ||
619 | |||
620 | /* FIXME use last free of some kind */ | ||
621 | |||
622 | /* we don't want to overwrite the superblock on the drive, | ||
623 | * so we make sure to start at an offset of at least 1MB | ||
624 | */ | ||
625 | search_start = max((u64)1024 * 1024, search_start); | ||
626 | |||
627 | if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) | ||
628 | search_start = max(root->fs_info->alloc_start, search_start); | ||
629 | |||
630 | key.objectid = device->devid; | ||
631 | key.offset = search_start; | ||
632 | key.type = BTRFS_DEV_EXTENT_KEY; | ||
633 | ret = btrfs_search_slot(trans, root, &key, path, 0, 0); | ||
634 | if (ret < 0) | ||
635 | goto error; | ||
636 | ret = btrfs_previous_item(root, path, 0, key.type); | ||
637 | if (ret < 0) | ||
638 | goto error; | ||
639 | l = path->nodes[0]; | ||
640 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | ||
641 | while (1) { | ||
642 | l = path->nodes[0]; | ||
643 | slot = path->slots[0]; | ||
644 | if (slot >= btrfs_header_nritems(l)) { | ||
645 | ret = btrfs_next_leaf(root, path); | ||
646 | if (ret == 0) | ||
647 | continue; | ||
648 | if (ret < 0) | ||
649 | goto error; | ||
650 | no_more_items: | ||
651 | if (!start_found) { | ||
652 | if (search_start >= search_end) { | ||
653 | ret = -ENOSPC; | ||
654 | goto error; | ||
655 | } | ||
656 | *start = search_start; | ||
657 | start_found = 1; | ||
658 | goto check_pending; | ||
659 | } | ||
660 | *start = last_byte > search_start ? | ||
661 | last_byte : search_start; | ||
662 | if (search_end <= *start) { | ||
663 | ret = -ENOSPC; | ||
664 | goto error; | ||
665 | } | ||
666 | goto check_pending; | ||
667 | } | ||
668 | btrfs_item_key_to_cpu(l, &key, slot); | ||
669 | |||
670 | if (key.objectid < device->devid) | ||
671 | goto next; | ||
672 | |||
673 | if (key.objectid > device->devid) | ||
674 | goto no_more_items; | ||
675 | |||
676 | if (key.offset >= search_start && key.offset > last_byte && | ||
677 | start_found) { | ||
678 | if (last_byte < search_start) | ||
679 | last_byte = search_start; | ||
680 | hole_size = key.offset - last_byte; | ||
681 | if (key.offset > last_byte && | ||
682 | hole_size >= num_bytes) { | ||
683 | *start = last_byte; | ||
684 | goto check_pending; | ||
685 | } | ||
686 | } | ||
687 | if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) | ||
688 | goto next; | ||
689 | |||
690 | start_found = 1; | ||
691 | dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); | ||
692 | last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); | ||
693 | next: | ||
694 | path->slots[0]++; | ||
695 | cond_resched(); | ||
696 | } | ||
697 | check_pending: | ||
698 | /* we have to make sure we didn't find an extent that has already | ||
699 | * been allocated by the map tree or the original allocation | ||
700 | */ | ||
701 | BUG_ON(*start < search_start); | ||
702 | |||
703 | if (*start + num_bytes > search_end) { | ||
704 | ret = -ENOSPC; | ||
705 | goto error; | ||
706 | } | ||
707 | /* check for pending inserts here */ | ||
708 | ret = 0; | ||
709 | |||
710 | error: | ||
711 | btrfs_free_path(path); | ||
712 | return ret; | ||
713 | } | ||
714 | |||
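find_free_dev_extent() above is a gap scan over the device's sorted extent records: track the end of the previous extent (last_byte) and take the first hole of at least num_bytes. The same scan over a sorted (start, length) array, as a self-contained sketch (the data is hypothetical, not the dev tree):

    #include <stdint.h>
    #include <stdio.h>

    struct ext { uint64_t start, len; };

    int main(void)
    {
        /* existing allocations on the device, sorted by start */
        struct ext exts[] = {
            { 1 << 20, 4 << 20 },   /* [1MB, 5MB)   */
            { 6 << 20, 2 << 20 },   /* [6MB, 8MB)   */
            { 16 << 20, 1 << 20 },  /* [16MB, 17MB) */
        };
        uint64_t search_start = 1 << 20;  /* never touch the first 1MB */
        uint64_t num_bytes = 3 << 20;
        uint64_t last_byte = search_start;
        uint64_t found = (uint64_t)-1;
        int i;

        for (i = 0; i < 3; i++) {
            if (exts[i].start > last_byte &&
                exts[i].start - last_byte >= num_bytes) {
                found = last_byte;  /* hole before this extent fits */
                break;
            }
            if (exts[i].start + exts[i].len > last_byte)
                last_byte = exts[i].start + exts[i].len;
        }
        if (found == (uint64_t)-1)
            found = last_byte;      /* fall past the last extent */
        printf("free extent at %llu MB\n",
               (unsigned long long)(found >> 20));  /* prints: 8 MB */
        return 0;
    }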
715 | static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, | ||
716 | struct btrfs_device *device, | ||
717 | u64 start) | ||
718 | { | ||
719 | int ret; | ||
720 | struct btrfs_path *path; | ||
721 | struct btrfs_root *root = device->dev_root; | ||
722 | struct btrfs_key key; | ||
723 | struct btrfs_key found_key; | ||
724 | struct extent_buffer *leaf = NULL; | ||
725 | struct btrfs_dev_extent *extent = NULL; | ||
726 | |||
727 | path = btrfs_alloc_path(); | ||
728 | if (!path) | ||
729 | return -ENOMEM; | ||
730 | |||
731 | key.objectid = device->devid; | ||
732 | key.offset = start; | ||
733 | key.type = BTRFS_DEV_EXTENT_KEY; | ||
734 | |||
735 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
736 | if (ret > 0) { | ||
737 | ret = btrfs_previous_item(root, path, key.objectid, | ||
738 | BTRFS_DEV_EXTENT_KEY); | ||
739 | BUG_ON(ret); | ||
740 | leaf = path->nodes[0]; | ||
741 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
742 | extent = btrfs_item_ptr(leaf, path->slots[0], | ||
743 | struct btrfs_dev_extent); | ||
744 | BUG_ON(found_key.offset > start || found_key.offset + | ||
745 | btrfs_dev_extent_length(leaf, extent) < start); | ||
746 | ret = 0; | ||
747 | } else if (ret == 0) { | ||
748 | leaf = path->nodes[0]; | ||
749 | extent = btrfs_item_ptr(leaf, path->slots[0], | ||
750 | struct btrfs_dev_extent); | ||
751 | } | ||
752 | BUG_ON(ret); | ||
753 | |||
754 | if (device->bytes_used > 0) | ||
755 | device->bytes_used -= btrfs_dev_extent_length(leaf, extent); | ||
756 | ret = btrfs_del_item(trans, root, path); | ||
757 | BUG_ON(ret); | ||
758 | |||
759 | btrfs_free_path(path); | ||
760 | return ret; | ||
761 | } | ||
762 | |||
763 | int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, | ||
764 | struct btrfs_device *device, | ||
765 | u64 chunk_tree, u64 chunk_objectid, | ||
766 | u64 chunk_offset, u64 start, u64 num_bytes) | ||
767 | { | ||
768 | int ret; | ||
769 | struct btrfs_path *path; | ||
770 | struct btrfs_root *root = device->dev_root; | ||
771 | struct btrfs_dev_extent *extent; | ||
772 | struct extent_buffer *leaf; | ||
773 | struct btrfs_key key; | ||
774 | |||
775 | WARN_ON(!device->in_fs_metadata); | ||
776 | path = btrfs_alloc_path(); | ||
777 | if (!path) | ||
778 | return -ENOMEM; | ||
779 | |||
780 | key.objectid = device->devid; | ||
781 | key.offset = start; | ||
782 | key.type = BTRFS_DEV_EXTENT_KEY; | ||
783 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
784 | sizeof(*extent)); | ||
785 | BUG_ON(ret); | ||
786 | |||
787 | leaf = path->nodes[0]; | ||
788 | extent = btrfs_item_ptr(leaf, path->slots[0], | ||
789 | struct btrfs_dev_extent); | ||
790 | btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); | ||
791 | btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); | ||
792 | btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); | ||
793 | |||
794 | write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, | ||
795 | (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), | ||
796 | BTRFS_UUID_SIZE); | ||
797 | |||
798 | btrfs_set_dev_extent_length(leaf, extent, num_bytes); | ||
799 | btrfs_mark_buffer_dirty(leaf); | ||
800 | btrfs_free_path(path); | ||
801 | return ret; | ||
802 | } | ||
803 | |||
804 | static noinline int find_next_chunk(struct btrfs_root *root, | ||
805 | u64 objectid, u64 *offset) | ||
806 | { | ||
807 | struct btrfs_path *path; | ||
808 | int ret; | ||
809 | struct btrfs_key key; | ||
810 | struct btrfs_chunk *chunk; | ||
811 | struct btrfs_key found_key; | ||
812 | |||
813 | path = btrfs_alloc_path(); | ||
814 | BUG_ON(!path); | ||
815 | |||
816 | key.objectid = objectid; | ||
817 | key.offset = (u64)-1; | ||
818 | key.type = BTRFS_CHUNK_ITEM_KEY; | ||
819 | |||
820 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
821 | if (ret < 0) | ||
822 | goto error; | ||
823 | |||
824 | BUG_ON(ret == 0); | ||
825 | |||
826 | ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); | ||
827 | if (ret) { | ||
828 | *offset = 0; | ||
829 | } else { | ||
830 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
831 | path->slots[0]); | ||
832 | if (found_key.objectid != objectid) | ||
833 | *offset = 0; | ||
834 | else { | ||
835 | chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
836 | struct btrfs_chunk); | ||
837 | *offset = found_key.offset + | ||
838 | btrfs_chunk_length(path->nodes[0], chunk); | ||
839 | } | ||
840 | } | ||
841 | ret = 0; | ||
842 | error: | ||
843 | btrfs_free_path(path); | ||
844 | return ret; | ||
845 | } | ||
846 | |||
847 | static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) | ||
848 | { | ||
849 | int ret; | ||
850 | struct btrfs_key key; | ||
851 | struct btrfs_key found_key; | ||
852 | struct btrfs_path *path; | ||
853 | |||
854 | root = root->fs_info->chunk_root; | ||
855 | |||
856 | path = btrfs_alloc_path(); | ||
857 | if (!path) | ||
858 | return -ENOMEM; | ||
859 | |||
860 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; | ||
861 | key.type = BTRFS_DEV_ITEM_KEY; | ||
862 | key.offset = (u64)-1; | ||
863 | |||
864 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
865 | if (ret < 0) | ||
866 | goto error; | ||
867 | |||
868 | BUG_ON(ret == 0); | ||
869 | |||
870 | ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, | ||
871 | BTRFS_DEV_ITEM_KEY); | ||
872 | if (ret) { | ||
873 | *objectid = 1; | ||
874 | } else { | ||
875 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
876 | path->slots[0]); | ||
877 | *objectid = found_key.offset + 1; | ||
878 | } | ||
879 | ret = 0; | ||
880 | error: | ||
881 | btrfs_free_path(path); | ||
882 | return ret; | ||
883 | } | ||
884 | |||
885 | /* | ||
886 | * the device information is stored in the chunk root | ||
887 | * the btrfs_device struct should be fully filled in | ||
888 | */ | ||
889 | int btrfs_add_device(struct btrfs_trans_handle *trans, | ||
890 | struct btrfs_root *root, | ||
891 | struct btrfs_device *device) | ||
892 | { | ||
893 | int ret; | ||
894 | struct btrfs_path *path; | ||
895 | struct btrfs_dev_item *dev_item; | ||
896 | struct extent_buffer *leaf; | ||
897 | struct btrfs_key key; | ||
898 | unsigned long ptr; | ||
899 | |||
900 | root = root->fs_info->chunk_root; | ||
901 | |||
902 | path = btrfs_alloc_path(); | ||
903 | if (!path) | ||
904 | return -ENOMEM; | ||
905 | |||
906 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; | ||
907 | key.type = BTRFS_DEV_ITEM_KEY; | ||
908 | key.offset = device->devid; | ||
909 | |||
910 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
911 | sizeof(*dev_item)); | ||
912 | if (ret) | ||
913 | goto out; | ||
914 | |||
915 | leaf = path->nodes[0]; | ||
916 | dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); | ||
917 | |||
918 | btrfs_set_device_id(leaf, dev_item, device->devid); | ||
919 | btrfs_set_device_generation(leaf, dev_item, 0); | ||
920 | btrfs_set_device_type(leaf, dev_item, device->type); | ||
921 | btrfs_set_device_io_align(leaf, dev_item, device->io_align); | ||
922 | btrfs_set_device_io_width(leaf, dev_item, device->io_width); | ||
923 | btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); | ||
924 | btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); | ||
925 | btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); | ||
926 | btrfs_set_device_group(leaf, dev_item, 0); | ||
927 | btrfs_set_device_seek_speed(leaf, dev_item, 0); | ||
928 | btrfs_set_device_bandwidth(leaf, dev_item, 0); | ||
929 | btrfs_set_device_start_offset(leaf, dev_item, 0); | ||
930 | |||
931 | ptr = (unsigned long)btrfs_device_uuid(dev_item); | ||
932 | write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); | ||
933 | ptr = (unsigned long)btrfs_device_fsid(dev_item); | ||
934 | write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); | ||
935 | btrfs_mark_buffer_dirty(leaf); | ||
936 | |||
937 | ret = 0; | ||
938 | out: | ||
939 | btrfs_free_path(path); | ||
940 | return ret; | ||
941 | } | ||
942 | |||
943 | static int btrfs_rm_dev_item(struct btrfs_root *root, | ||
944 | struct btrfs_device *device) | ||
945 | { | ||
946 | int ret; | ||
947 | struct btrfs_path *path; | ||
948 | struct btrfs_key key; | ||
949 | struct btrfs_trans_handle *trans; | ||
950 | |||
951 | root = root->fs_info->chunk_root; | ||
952 | |||
953 | path = btrfs_alloc_path(); | ||
954 | if (!path) | ||
955 | return -ENOMEM; | ||
956 | |||
957 | trans = btrfs_start_transaction(root, 1); | ||
958 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; | ||
959 | key.type = BTRFS_DEV_ITEM_KEY; | ||
960 | key.offset = device->devid; | ||
961 | lock_chunks(root); | ||
962 | |||
963 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
964 | if (ret < 0) | ||
965 | goto out; | ||
966 | |||
967 | if (ret > 0) { | ||
968 | ret = -ENOENT; | ||
969 | goto out; | ||
970 | } | ||
971 | |||
972 | ret = btrfs_del_item(trans, root, path); | ||
973 | if (ret) | ||
974 | goto out; | ||
975 | out: | ||
976 | btrfs_free_path(path); | ||
977 | unlock_chunks(root); | ||
978 | btrfs_commit_transaction(trans, root); | ||
979 | return ret; | ||
980 | } | ||
981 | |||
982 | int btrfs_rm_device(struct btrfs_root *root, char *device_path) | ||
983 | { | ||
984 | struct btrfs_device *device; | ||
985 | struct btrfs_device *next_device; | ||
986 | struct block_device *bdev; | ||
987 | struct buffer_head *bh = NULL; | ||
988 | struct btrfs_super_block *disk_super; | ||
989 | u64 all_avail; | ||
990 | u64 devid; | ||
991 | u64 num_devices; | ||
992 | u8 *dev_uuid; | ||
993 | int ret = 0; | ||
994 | |||
995 | mutex_lock(&uuid_mutex); | ||
996 | mutex_lock(&root->fs_info->volume_mutex); | ||
997 | |||
998 | all_avail = root->fs_info->avail_data_alloc_bits | | ||
999 | root->fs_info->avail_system_alloc_bits | | ||
1000 | root->fs_info->avail_metadata_alloc_bits; | ||
1001 | |||
1002 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && | ||
1003 | root->fs_info->fs_devices->rw_devices <= 4) { | ||
1004 | printk(KERN_ERR "btrfs: unable to go below four devices " | ||
1005 | "on raid10\n"); | ||
1006 | ret = -EINVAL; | ||
1007 | goto out; | ||
1008 | } | ||
1009 | |||
1010 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && | ||
1011 | root->fs_info->fs_devices->rw_devices <= 2) { | ||
1012 | printk(KERN_ERR "btrfs: unable to go below two " | ||
1013 | "devices on raid1\n"); | ||
1014 | ret = -EINVAL; | ||
1015 | goto out; | ||
1016 | } | ||
1017 | |||
1018 | if (strcmp(device_path, "missing") == 0) { | ||
1019 | struct list_head *cur; | ||
1020 | struct list_head *devices; | ||
1021 | struct btrfs_device *tmp; | ||
1022 | |||
1023 | device = NULL; | ||
1024 | devices = &root->fs_info->fs_devices->devices; | ||
1025 | list_for_each(cur, devices) { | ||
1026 | tmp = list_entry(cur, struct btrfs_device, dev_list); | ||
1027 | if (tmp->in_fs_metadata && !tmp->bdev) { | ||
1028 | device = tmp; | ||
1029 | break; | ||
1030 | } | ||
1031 | } | ||
1032 | bdev = NULL; | ||
1033 | bh = NULL; | ||
1034 | disk_super = NULL; | ||
1035 | if (!device) { | ||
1036 | printk(KERN_ERR "btrfs: no missing devices found to " | ||
1037 | "remove\n"); | ||
1038 | goto out; | ||
1039 | } | ||
1040 | } else { | ||
1041 | bdev = open_bdev_exclusive(device_path, FMODE_READ, | ||
1042 | root->fs_info->bdev_holder); | ||
1043 | if (IS_ERR(bdev)) { | ||
1044 | ret = PTR_ERR(bdev); | ||
1045 | goto out; | ||
1046 | } | ||
1047 | |||
1048 | set_blocksize(bdev, 4096); | ||
1049 | bh = btrfs_read_dev_super(bdev); | ||
1050 | if (!bh) { | ||
1051 | ret = -EIO; | ||
1052 | goto error_close; | ||
1053 | } | ||
1054 | disk_super = (struct btrfs_super_block *)bh->b_data; | ||
1055 | devid = le64_to_cpu(disk_super->dev_item.devid); | ||
1056 | dev_uuid = disk_super->dev_item.uuid; | ||
1057 | device = btrfs_find_device(root, devid, dev_uuid, | ||
1058 | disk_super->fsid); | ||
1059 | if (!device) { | ||
1060 | ret = -ENOENT; | ||
1061 | goto error_brelse; | ||
1062 | } | ||
1063 | } | ||
1064 | |||
1065 | if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { | ||
1066 | printk(KERN_ERR "btrfs: unable to remove the only writeable " | ||
1067 | "device\n"); | ||
1068 | ret = -EINVAL; | ||
1069 | goto error_brelse; | ||
1070 | } | ||
1071 | |||
1072 | if (device->writeable) { | ||
1073 | list_del_init(&device->dev_alloc_list); | ||
1074 | root->fs_info->fs_devices->rw_devices--; | ||
1075 | } | ||
1076 | |||
1077 | ret = btrfs_shrink_device(device, 0); | ||
1078 | if (ret) | ||
1079 | goto error_brelse; | ||
1080 | |||
1081 | ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); | ||
1082 | if (ret) | ||
1083 | goto error_brelse; | ||
1084 | |||
1085 | device->in_fs_metadata = 0; | ||
1086 | list_del_init(&device->dev_list); | ||
1087 | device->fs_devices->num_devices--; | ||
1088 | |||
1089 | next_device = list_entry(root->fs_info->fs_devices->devices.next, | ||
1090 | struct btrfs_device, dev_list); | ||
1091 | if (device->bdev == root->fs_info->sb->s_bdev) | ||
1092 | root->fs_info->sb->s_bdev = next_device->bdev; | ||
1093 | if (device->bdev == root->fs_info->fs_devices->latest_bdev) | ||
1094 | root->fs_info->fs_devices->latest_bdev = next_device->bdev; | ||
1095 | |||
1096 | if (device->bdev) { | ||
1097 | close_bdev_exclusive(device->bdev, device->mode); | ||
1098 | device->bdev = NULL; | ||
1099 | device->fs_devices->open_devices--; | ||
1100 | } | ||
1101 | |||
1102 | num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; | ||
1103 | btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); | ||
1104 | |||
1105 | if (device->fs_devices->open_devices == 0) { | ||
1106 | struct btrfs_fs_devices *fs_devices; | ||
1107 | fs_devices = root->fs_info->fs_devices; | ||
1108 | while (fs_devices) { | ||
1109 | if (fs_devices->seed == device->fs_devices) | ||
1110 | break; | ||
1111 | fs_devices = fs_devices->seed; | ||
1112 | } | ||
1113 | fs_devices->seed = device->fs_devices->seed; | ||
1114 | device->fs_devices->seed = NULL; | ||
1115 | __btrfs_close_devices(device->fs_devices); | ||
1116 | free_fs_devices(device->fs_devices); | ||
1117 | } | ||
1118 | |||
1119 | /* | ||
1120 | * at this point, the device is zero sized. We want to | ||
1121 | * remove it from the devices list and zero out the old super | ||
1122 | */ | ||
1123 | if (device->writeable && disk_super) { | ||
1124 | /* make sure this device isn't detected as part of | ||
1125 | * the FS anymore | ||
1126 | */ | ||
1127 | memset(&disk_super->magic, 0, sizeof(disk_super->magic)); | ||
1128 | set_buffer_dirty(bh); | ||
1129 | sync_dirty_buffer(bh); | ||
1130 | } | ||
1131 | |||
1132 | kfree(device->name); | ||
1133 | kfree(device); | ||
1134 | ret = 0; | ||
1135 | |||
1136 | error_brelse: | ||
1137 | brelse(bh); | ||
1138 | error_close: | ||
1139 | if (bdev) | ||
1140 | close_bdev_exclusive(bdev, FMODE_READ); | ||
1141 | out: | ||
1142 | mutex_unlock(&root->fs_info->volume_mutex); | ||
1143 | mutex_unlock(&uuid_mutex); | ||
1144 | return ret; | ||
1145 | } | ||
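
Before tearing anything down, the function above enforces the minimum device counts for the active allocation profiles: RAID10 refuses to go below four writeable devices, RAID1 below two, and the last writeable device can never be removed. A minimal userspace sketch of just that policy check, with illustrative flag values rather than the on-disk profile bits:

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative profile bits, not the on-disk BTRFS_BLOCK_GROUP_* values */
    #define SK_RAID1   (1ULL << 0)
    #define SK_RAID10  (1ULL << 1)

    /*
     * 0 if one writeable device may be removed, -1 otherwise; mirrors the
     * -EINVAL paths in the function above (the last check only applies
     * when the device being removed is itself writeable).
     */
    static int may_remove_device(uint64_t avail_profiles, int rw_devices)
    {
            if ((avail_profiles & SK_RAID10) && rw_devices <= 4)
                    return -1;
            if ((avail_profiles & SK_RAID1) && rw_devices <= 2)
                    return -1;
            if (rw_devices == 1)
                    return -1;
            return 0;
    }

    int main(void)
    {
            printf("%d\n", may_remove_device(SK_RAID1, 2));  /* -1: at the floor */
            printf("%d\n", may_remove_device(SK_RAID1, 3));  /* 0: one to spare */
            return 0;
    }
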
1146 | |||
1147 | /* | ||
1148 | * does all the dirty work required for changing the file system's UUID. | ||
1149 | */ | ||
1150 | static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, | ||
1151 | struct btrfs_root *root) | ||
1152 | { | ||
1153 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; | ||
1154 | struct btrfs_fs_devices *old_devices; | ||
1155 | struct btrfs_fs_devices *seed_devices; | ||
1156 | struct btrfs_super_block *disk_super = &root->fs_info->super_copy; | ||
1157 | struct btrfs_device *device; | ||
1158 | u64 super_flags; | ||
1159 | |||
1160 | BUG_ON(!mutex_is_locked(&uuid_mutex)); | ||
1161 | if (!fs_devices->seeding) | ||
1162 | return -EINVAL; | ||
1163 | |||
1164 | seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); | ||
1165 | if (!seed_devices) | ||
1166 | return -ENOMEM; | ||
1167 | |||
1168 | old_devices = clone_fs_devices(fs_devices); | ||
1169 | if (IS_ERR(old_devices)) { | ||
1170 | kfree(seed_devices); | ||
1171 | return PTR_ERR(old_devices); | ||
1172 | } | ||
1173 | |||
1174 | list_add(&old_devices->list, &fs_uuids); | ||
1175 | |||
1176 | memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); | ||
1177 | seed_devices->opened = 1; | ||
1178 | INIT_LIST_HEAD(&seed_devices->devices); | ||
1179 | INIT_LIST_HEAD(&seed_devices->alloc_list); | ||
1180 | list_splice_init(&fs_devices->devices, &seed_devices->devices); | ||
1181 | list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); | ||
1182 | list_for_each_entry(device, &seed_devices->devices, dev_list) { | ||
1183 | device->fs_devices = seed_devices; | ||
1184 | } | ||
1185 | |||
1186 | fs_devices->seeding = 0; | ||
1187 | fs_devices->num_devices = 0; | ||
1188 | fs_devices->open_devices = 0; | ||
1189 | fs_devices->seed = seed_devices; | ||
1190 | |||
1191 | generate_random_uuid(fs_devices->fsid); | ||
1192 | memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); | ||
1193 | memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); | ||
1194 | super_flags = btrfs_super_flags(disk_super) & | ||
1195 | ~BTRFS_SUPER_FLAG_SEEDING; | ||
1196 | btrfs_set_super_flags(disk_super, super_flags); | ||
1197 | |||
1198 | return 0; | ||
1199 | } | ||
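
btrfs_prepare_sprout() hands every existing device over to the new seed fs_devices in O(1) via list_splice_init(), leaving the live list empty before it generates the fresh fsid. A self-contained sketch of that splice primitive on a minimal circular doubly linked list (helper names are hypothetical, not the kernel's <linux/list.h> API):

    #include <stdio.h>

    struct list_head {
            struct list_head *next, *prev;
    };

    static void init_list(struct list_head *h)
    {
            h->next = h->prev = h;
    }

    static void add_node(struct list_head *n, struct list_head *h)
    {
            n->next = h->next;
            n->prev = h;
            h->next->prev = n;
            h->next = n;
    }

    /* move all entries from 'src' onto 'dst', leaving 'src' empty */
    static void splice_init(struct list_head *src, struct list_head *dst)
    {
            struct list_head *first = src->next, *last = src->prev;

            if (first == src)       /* nothing to move */
                    return;
            first->prev = dst;
            last->next = dst->next;
            dst->next->prev = last;
            dst->next = first;
            init_list(src);
    }

    int main(void)
    {
            struct list_head devs, seed, a, b;
            int n = 0;

            init_list(&devs);
            init_list(&seed);
            add_node(&a, &devs);
            add_node(&b, &devs);
            splice_init(&devs, &seed);      /* both nodes now hang off 'seed' */
            for (struct list_head *p = seed.next; p != &seed; p = p->next)
                    n++;
            printf("seed holds %d nodes, devs empty: %d\n", n, devs.next == &devs);
            return 0;
    }
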
1200 | |||
1201 | /* | ||
1202 | * store the expected generation for seed devices in device items. | ||
1203 | */ | ||
1204 | static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, | ||
1205 | struct btrfs_root *root) | ||
1206 | { | ||
1207 | struct btrfs_path *path; | ||
1208 | struct extent_buffer *leaf; | ||
1209 | struct btrfs_dev_item *dev_item; | ||
1210 | struct btrfs_device *device; | ||
1211 | struct btrfs_key key; | ||
1212 | u8 fs_uuid[BTRFS_UUID_SIZE]; | ||
1213 | u8 dev_uuid[BTRFS_UUID_SIZE]; | ||
1214 | u64 devid; | ||
1215 | int ret; | ||
1216 | |||
1217 | path = btrfs_alloc_path(); | ||
1218 | if (!path) | ||
1219 | return -ENOMEM; | ||
1220 | |||
1221 | root = root->fs_info->chunk_root; | ||
1222 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; | ||
1223 | key.offset = 0; | ||
1224 | key.type = BTRFS_DEV_ITEM_KEY; | ||
1225 | |||
1226 | while (1) { | ||
1227 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | ||
1228 | if (ret < 0) | ||
1229 | goto error; | ||
1230 | |||
1231 | leaf = path->nodes[0]; | ||
1232 | next_slot: | ||
1233 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
1234 | ret = btrfs_next_leaf(root, path); | ||
1235 | if (ret > 0) | ||
1236 | break; | ||
1237 | if (ret < 0) | ||
1238 | goto error; | ||
1239 | leaf = path->nodes[0]; | ||
1240 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
1241 | btrfs_release_path(root, path); | ||
1242 | continue; | ||
1243 | } | ||
1244 | |||
1245 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
1246 | if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || | ||
1247 | key.type != BTRFS_DEV_ITEM_KEY) | ||
1248 | break; | ||
1249 | |||
1250 | dev_item = btrfs_item_ptr(leaf, path->slots[0], | ||
1251 | struct btrfs_dev_item); | ||
1252 | devid = btrfs_device_id(leaf, dev_item); | ||
1253 | read_extent_buffer(leaf, dev_uuid, | ||
1254 | (unsigned long)btrfs_device_uuid(dev_item), | ||
1255 | BTRFS_UUID_SIZE); | ||
1256 | read_extent_buffer(leaf, fs_uuid, | ||
1257 | (unsigned long)btrfs_device_fsid(dev_item), | ||
1258 | BTRFS_UUID_SIZE); | ||
1259 | device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); | ||
1260 | BUG_ON(!device); | ||
1261 | |||
1262 | if (device->fs_devices->seeding) { | ||
1263 | btrfs_set_device_generation(leaf, dev_item, | ||
1264 | device->generation); | ||
1265 | btrfs_mark_buffer_dirty(leaf); | ||
1266 | } | ||
1267 | |||
1268 | path->slots[0]++; | ||
1269 | goto next_slot; | ||
1270 | } | ||
1271 | ret = 0; | ||
1272 | error: | ||
1273 | btrfs_free_path(path); | ||
1274 | return ret; | ||
1275 | } | ||
1276 | |||
1277 | int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | ||
1278 | { | ||
1279 | struct btrfs_trans_handle *trans; | ||
1280 | struct btrfs_device *device; | ||
1281 | struct block_device *bdev; | ||
1282 | struct list_head *cur; | ||
1283 | struct list_head *devices; | ||
1284 | struct super_block *sb = root->fs_info->sb; | ||
1285 | u64 total_bytes; | ||
1286 | int seeding_dev = 0; | ||
1287 | int ret = 0; | ||
1288 | |||
1289 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) | ||
1290 | return -EINVAL; | ||
1291 | |||
1292 | bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); | ||
1293 | if (IS_ERR(bdev)) | ||
1294 | return PTR_ERR(bdev); | ||
1295 | |||
1296 | if (root->fs_info->fs_devices->seeding) { | ||
1297 | seeding_dev = 1; | ||
1298 | down_write(&sb->s_umount); | ||
1299 | mutex_lock(&uuid_mutex); | ||
1300 | } | ||
1301 | |||
1302 | filemap_write_and_wait(bdev->bd_inode->i_mapping); | ||
1303 | mutex_lock(&root->fs_info->volume_mutex); | ||
1304 | |||
1305 | devices = &root->fs_info->fs_devices->devices; | ||
1306 | list_for_each(cur, devices) { | ||
1307 | device = list_entry(cur, struct btrfs_device, dev_list); | ||
1308 | if (device->bdev == bdev) { | ||
1309 | ret = -EEXIST; | ||
1310 | goto error; | ||
1311 | } | ||
1312 | } | ||
1313 | |||
1314 | device = kzalloc(sizeof(*device), GFP_NOFS); | ||
1315 | if (!device) { | ||
1316 | /* we can safely leave the fs_devices entry around */ | ||
1317 | ret = -ENOMEM; | ||
1318 | goto error; | ||
1319 | } | ||
1320 | |||
1321 | device->name = kstrdup(device_path, GFP_NOFS); | ||
1322 | if (!device->name) { | ||
1323 | kfree(device); | ||
1324 | ret = -ENOMEM; | ||
1325 | goto error; | ||
1326 | } | ||
1327 | |||
1328 | ret = find_next_devid(root, &device->devid); | ||
1329 | if (ret) { | ||
1330 | kfree(device->name); | ||
1331 | kfree(device); | ||
1332 | goto error; | ||
1333 | } | ||
1333 | |||
1334 | trans = btrfs_start_transaction(root, 1); | ||
1335 | lock_chunks(root); | ||
1336 | |||
1337 | device->barriers = 1; | ||
1338 | device->writeable = 1; | ||
1339 | device->work.func = pending_bios_fn; | ||
1340 | generate_random_uuid(device->uuid); | ||
1341 | spin_lock_init(&device->io_lock); | ||
1342 | device->generation = trans->transid; | ||
1343 | device->io_width = root->sectorsize; | ||
1344 | device->io_align = root->sectorsize; | ||
1345 | device->sector_size = root->sectorsize; | ||
1346 | device->total_bytes = i_size_read(bdev->bd_inode); | ||
1347 | device->dev_root = root->fs_info->dev_root; | ||
1348 | device->bdev = bdev; | ||
1349 | device->in_fs_metadata = 1; | ||
1350 | device->mode = 0; | ||
1351 | set_blocksize(device->bdev, 4096); | ||
1352 | |||
1353 | if (seeding_dev) { | ||
1354 | sb->s_flags &= ~MS_RDONLY; | ||
1355 | ret = btrfs_prepare_sprout(trans, root); | ||
1356 | BUG_ON(ret); | ||
1357 | } | ||
1358 | |||
1359 | device->fs_devices = root->fs_info->fs_devices; | ||
1360 | list_add(&device->dev_list, &root->fs_info->fs_devices->devices); | ||
1361 | list_add(&device->dev_alloc_list, | ||
1362 | &root->fs_info->fs_devices->alloc_list); | ||
1363 | root->fs_info->fs_devices->num_devices++; | ||
1364 | root->fs_info->fs_devices->open_devices++; | ||
1365 | root->fs_info->fs_devices->rw_devices++; | ||
1366 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; | ||
1367 | |||
1368 | total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); | ||
1369 | btrfs_set_super_total_bytes(&root->fs_info->super_copy, | ||
1370 | total_bytes + device->total_bytes); | ||
1371 | |||
1372 | total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); | ||
1373 | btrfs_set_super_num_devices(&root->fs_info->super_copy, | ||
1374 | total_bytes + 1); | ||
1375 | |||
1376 | if (seeding_dev) { | ||
1377 | ret = init_first_rw_device(trans, root, device); | ||
1378 | BUG_ON(ret); | ||
1379 | ret = btrfs_finish_sprout(trans, root); | ||
1380 | BUG_ON(ret); | ||
1381 | } else { | ||
1382 | ret = btrfs_add_device(trans, root, device); | ||
1383 | } | ||
1384 | |||
1385 | unlock_chunks(root); | ||
1386 | btrfs_commit_transaction(trans, root); | ||
1387 | |||
1388 | if (seeding_dev) { | ||
1389 | mutex_unlock(&uuid_mutex); | ||
1390 | up_write(&sb->s_umount); | ||
1391 | |||
1392 | ret = btrfs_relocate_sys_chunks(root); | ||
1393 | BUG_ON(ret); | ||
1394 | } | ||
1395 | out: | ||
1396 | mutex_unlock(&root->fs_info->volume_mutex); | ||
1397 | return ret; | ||
1398 | error: | ||
1399 | close_bdev_exclusive(bdev, 0); | ||
1400 | if (seeding_dev) { | ||
1401 | mutex_unlock(&uuid_mutex); | ||
1402 | up_write(&sb->s_umount); | ||
1403 | } | ||
1404 | goto out; | ||
1405 | } | ||
1406 | |||
1407 | static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, | ||
1408 | struct btrfs_device *device) | ||
1409 | { | ||
1410 | int ret; | ||
1411 | struct btrfs_path *path; | ||
1412 | struct btrfs_root *root; | ||
1413 | struct btrfs_dev_item *dev_item; | ||
1414 | struct extent_buffer *leaf; | ||
1415 | struct btrfs_key key; | ||
1416 | |||
1417 | root = device->dev_root->fs_info->chunk_root; | ||
1418 | |||
1419 | path = btrfs_alloc_path(); | ||
1420 | if (!path) | ||
1421 | return -ENOMEM; | ||
1422 | |||
1423 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; | ||
1424 | key.type = BTRFS_DEV_ITEM_KEY; | ||
1425 | key.offset = device->devid; | ||
1426 | |||
1427 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | ||
1428 | if (ret < 0) | ||
1429 | goto out; | ||
1430 | |||
1431 | if (ret > 0) { | ||
1432 | ret = -ENOENT; | ||
1433 | goto out; | ||
1434 | } | ||
1435 | |||
1436 | leaf = path->nodes[0]; | ||
1437 | dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); | ||
1438 | |||
1439 | btrfs_set_device_id(leaf, dev_item, device->devid); | ||
1440 | btrfs_set_device_type(leaf, dev_item, device->type); | ||
1441 | btrfs_set_device_io_align(leaf, dev_item, device->io_align); | ||
1442 | btrfs_set_device_io_width(leaf, dev_item, device->io_width); | ||
1443 | btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); | ||
1444 | btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); | ||
1445 | btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); | ||
1446 | btrfs_mark_buffer_dirty(leaf); | ||
1447 | |||
1448 | out: | ||
1449 | btrfs_free_path(path); | ||
1450 | return ret; | ||
1451 | } | ||
1452 | |||
1453 | static int __btrfs_grow_device(struct btrfs_trans_handle *trans, | ||
1454 | struct btrfs_device *device, u64 new_size) | ||
1455 | { | ||
1456 | struct btrfs_super_block *super_copy = | ||
1457 | &device->dev_root->fs_info->super_copy; | ||
1458 | u64 old_total = btrfs_super_total_bytes(super_copy); | ||
1459 | u64 diff = new_size - device->total_bytes; | ||
1460 | |||
1461 | if (!device->writeable) | ||
1462 | return -EACCES; | ||
1463 | if (new_size <= device->total_bytes) | ||
1464 | return -EINVAL; | ||
1465 | |||
1466 | btrfs_set_super_total_bytes(super_copy, old_total + diff); | ||
1467 | device->fs_devices->total_rw_bytes += diff; | ||
1468 | |||
1469 | device->total_bytes = new_size; | ||
1470 | return btrfs_update_device(trans, device); | ||
1471 | } | ||
1472 | |||
1473 | int btrfs_grow_device(struct btrfs_trans_handle *trans, | ||
1474 | struct btrfs_device *device, u64 new_size) | ||
1475 | { | ||
1476 | int ret; | ||
1477 | lock_chunks(device->dev_root); | ||
1478 | ret = __btrfs_grow_device(trans, device, new_size); | ||
1479 | unlock_chunks(device->dev_root); | ||
1480 | return ret; | ||
1481 | } | ||
1482 | |||
1483 | static int btrfs_free_chunk(struct btrfs_trans_handle *trans, | ||
1484 | struct btrfs_root *root, | ||
1485 | u64 chunk_tree, u64 chunk_objectid, | ||
1486 | u64 chunk_offset) | ||
1487 | { | ||
1488 | int ret; | ||
1489 | struct btrfs_path *path; | ||
1490 | struct btrfs_key key; | ||
1491 | |||
1492 | root = root->fs_info->chunk_root; | ||
1493 | path = btrfs_alloc_path(); | ||
1494 | if (!path) | ||
1495 | return -ENOMEM; | ||
1496 | |||
1497 | key.objectid = chunk_objectid; | ||
1498 | key.offset = chunk_offset; | ||
1499 | key.type = BTRFS_CHUNK_ITEM_KEY; | ||
1500 | |||
1501 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
1502 | BUG_ON(ret); | ||
1503 | |||
1504 | ret = btrfs_del_item(trans, root, path); | ||
1505 | BUG_ON(ret); | ||
1506 | |||
1507 | btrfs_free_path(path); | ||
1508 | return 0; | ||
1509 | } | ||
1510 | |||
1511 | static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 | ||
1512 | chunk_offset) | ||
1513 | { | ||
1514 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | ||
1515 | struct btrfs_disk_key *disk_key; | ||
1516 | struct btrfs_chunk *chunk; | ||
1517 | u8 *ptr; | ||
1518 | int ret = 0; | ||
1519 | u32 num_stripes; | ||
1520 | u32 array_size; | ||
1521 | u32 len = 0; | ||
1522 | u32 cur; | ||
1523 | struct btrfs_key key; | ||
1524 | |||
1525 | array_size = btrfs_super_sys_array_size(super_copy); | ||
1526 | |||
1527 | ptr = super_copy->sys_chunk_array; | ||
1528 | cur = 0; | ||
1529 | |||
1530 | while (cur < array_size) { | ||
1531 | disk_key = (struct btrfs_disk_key *)ptr; | ||
1532 | btrfs_disk_key_to_cpu(&key, disk_key); | ||
1533 | |||
1534 | len = sizeof(*disk_key); | ||
1535 | |||
1536 | if (key.type == BTRFS_CHUNK_ITEM_KEY) { | ||
1537 | chunk = (struct btrfs_chunk *)(ptr + len); | ||
1538 | num_stripes = btrfs_stack_chunk_num_stripes(chunk); | ||
1539 | len += btrfs_chunk_item_size(num_stripes); | ||
1540 | } else { | ||
1541 | ret = -EIO; | ||
1542 | break; | ||
1543 | } | ||
1544 | if (key.objectid == chunk_objectid && | ||
1545 | key.offset == chunk_offset) { | ||
1546 | memmove(ptr, ptr + len, array_size - (cur + len)); | ||
1547 | array_size -= len; | ||
1548 | btrfs_set_super_sys_array_size(super_copy, array_size); | ||
1549 | } else { | ||
1550 | ptr += len; | ||
1551 | cur += len; | ||
1552 | } | ||
1553 | } | ||
1554 | return ret; | ||
1555 | } | ||
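
btrfs_del_sys_chunk() treats sys_chunk_array as a packed byte array of (disk key, chunk item) records whose length varies with the stripe count: deletion is a memmove of the tail over the hole, after which the cursor must not advance because the next record has slid into place. A userspace sketch of the same pattern over an illustrative record layout:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* illustrative fixed header + variable payload, standing in for the
     * (btrfs_disk_key, btrfs_chunk) pairs in the superblock array */
    struct rec_hdr {
            uint32_t id;
            uint32_t payload_len;
    };

    static uint32_t del_record(uint8_t *buf, uint32_t size, uint32_t id)
    {
            uint32_t cur = 0;

            while (cur < size) {
                    struct rec_hdr hdr;
                    uint32_t len;

                    memcpy(&hdr, buf + cur, sizeof(hdr));
                    len = sizeof(hdr) + hdr.payload_len;
                    if (hdr.id == id) {
                            memmove(buf + cur, buf + cur + len,
                                    size - (cur + len));
                            size -= len;  /* stay put: next record slid in */
                    } else {
                            cur += len;
                    }
            }
            return size;
    }

    int main(void)
    {
            uint8_t buf[64] = { 0 };
            struct rec_hdr h1 = { 1, 4 }, h2 = { 2, 8 };
            uint32_t size = 0;

            memcpy(buf + size, &h1, sizeof(h1));
            size += sizeof(h1) + h1.payload_len;
            memcpy(buf + size, &h2, sizeof(h2));
            size += sizeof(h2) + h2.payload_len;
            printf("before: %u, after: %u\n", size, del_record(buf, size, 1));
            return 0;
    }
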
1556 | |||
1557 | static int btrfs_relocate_chunk(struct btrfs_root *root, | ||
1558 | u64 chunk_tree, u64 chunk_objectid, | ||
1559 | u64 chunk_offset) | ||
1560 | { | ||
1561 | struct extent_map_tree *em_tree; | ||
1562 | struct btrfs_root *extent_root; | ||
1563 | struct btrfs_trans_handle *trans; | ||
1564 | struct extent_map *em; | ||
1565 | struct map_lookup *map; | ||
1566 | int ret; | ||
1567 | int i; | ||
1568 | |||
1569 | printk(KERN_INFO "btrfs relocating chunk %llu\n", | ||
1570 | (unsigned long long)chunk_offset); | ||
1571 | root = root->fs_info->chunk_root; | ||
1572 | extent_root = root->fs_info->extent_root; | ||
1573 | em_tree = &root->fs_info->mapping_tree.map_tree; | ||
1574 | |||
1575 | /* step one, relocate all the extents inside this chunk */ | ||
1576 | ret = btrfs_relocate_block_group(extent_root, chunk_offset); | ||
1577 | BUG_ON(ret); | ||
1578 | |||
1579 | trans = btrfs_start_transaction(root, 1); | ||
1580 | BUG_ON(!trans); | ||
1581 | |||
1582 | lock_chunks(root); | ||
1583 | |||
1584 | /* | ||
1585 | * step two, delete the device extents and the | ||
1586 | * chunk tree entries | ||
1587 | */ | ||
1588 | spin_lock(&em_tree->lock); | ||
1589 | em = lookup_extent_mapping(em_tree, chunk_offset, 1); | ||
1590 | spin_unlock(&em_tree->lock); | ||
1591 | |||
1592 | BUG_ON(em->start > chunk_offset || | ||
1593 | em->start + em->len < chunk_offset); | ||
1594 | map = (struct map_lookup *)em->bdev; | ||
1595 | |||
1596 | for (i = 0; i < map->num_stripes; i++) { | ||
1597 | ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, | ||
1598 | map->stripes[i].physical); | ||
1599 | BUG_ON(ret); | ||
1600 | |||
1601 | if (map->stripes[i].dev) { | ||
1602 | ret = btrfs_update_device(trans, map->stripes[i].dev); | ||
1603 | BUG_ON(ret); | ||
1604 | } | ||
1605 | } | ||
1606 | ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, | ||
1607 | chunk_offset); | ||
1608 | |||
1609 | BUG_ON(ret); | ||
1610 | |||
1611 | if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { | ||
1612 | ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); | ||
1613 | BUG_ON(ret); | ||
1614 | } | ||
1615 | |||
1616 | ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); | ||
1617 | BUG_ON(ret); | ||
1618 | |||
1619 | spin_lock(&em_tree->lock); | ||
1620 | remove_extent_mapping(em_tree, em); | ||
1621 | spin_unlock(&em_tree->lock); | ||
1622 | |||
1623 | kfree(map); | ||
1624 | em->bdev = NULL; | ||
1625 | |||
1626 | /* once for the tree */ | ||
1627 | free_extent_map(em); | ||
1628 | /* once for us */ | ||
1629 | free_extent_map(em); | ||
1630 | |||
1631 | unlock_chunks(root); | ||
1632 | btrfs_end_transaction(trans, root); | ||
1633 | return 0; | ||
1634 | } | ||
1635 | |||
1636 | static int btrfs_relocate_sys_chunks(struct btrfs_root *root) | ||
1637 | { | ||
1638 | struct btrfs_root *chunk_root = root->fs_info->chunk_root; | ||
1639 | struct btrfs_path *path; | ||
1640 | struct extent_buffer *leaf; | ||
1641 | struct btrfs_chunk *chunk; | ||
1642 | struct btrfs_key key; | ||
1643 | struct btrfs_key found_key; | ||
1644 | u64 chunk_tree = chunk_root->root_key.objectid; | ||
1645 | u64 chunk_type; | ||
1646 | int ret; | ||
1647 | |||
1648 | path = btrfs_alloc_path(); | ||
1649 | if (!path) | ||
1650 | return -ENOMEM; | ||
1651 | |||
1652 | key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; | ||
1653 | key.offset = (u64)-1; | ||
1654 | key.type = BTRFS_CHUNK_ITEM_KEY; | ||
1655 | |||
1656 | while (1) { | ||
1657 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); | ||
1658 | if (ret < 0) | ||
1659 | goto error; | ||
1660 | BUG_ON(ret == 0); | ||
1661 | |||
1662 | ret = btrfs_previous_item(chunk_root, path, key.objectid, | ||
1663 | key.type); | ||
1664 | if (ret < 0) | ||
1665 | goto error; | ||
1666 | if (ret > 0) | ||
1667 | break; | ||
1668 | |||
1669 | leaf = path->nodes[0]; | ||
1670 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
1671 | |||
1672 | chunk = btrfs_item_ptr(leaf, path->slots[0], | ||
1673 | struct btrfs_chunk); | ||
1674 | chunk_type = btrfs_chunk_type(leaf, chunk); | ||
1675 | btrfs_release_path(chunk_root, path); | ||
1676 | |||
1677 | if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { | ||
1678 | ret = btrfs_relocate_chunk(chunk_root, chunk_tree, | ||
1679 | found_key.objectid, | ||
1680 | found_key.offset); | ||
1681 | BUG_ON(ret); | ||
1682 | } | ||
1683 | |||
1684 | if (found_key.offset == 0) | ||
1685 | break; | ||
1686 | key.offset = found_key.offset - 1; | ||
1687 | } | ||
1688 | ret = 0; | ||
1689 | error: | ||
1690 | btrfs_free_path(path); | ||
1691 | return ret; | ||
1692 | } | ||
1693 | |||
1694 | static u64 div_factor(u64 num, int factor) | ||
1695 | { | ||
1696 | if (factor == 10) | ||
1697 | return num; | ||
1698 | num *= factor; | ||
1699 | do_div(num, 10); | ||
1700 | return num; | ||
1701 | } | ||
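
div_factor() expresses its factor in tenths, so div_factor(num, 1) is 10% of num; btrfs_balance() below uses it to decide how much to shave off each device, and the chunk allocator uses it for its 10%-of-writeable-space cap. A trivial standalone equivalent, with plain 64-bit division standing in for the kernel's do_div():

    #include <stdint.h>
    #include <stdio.h>

    /* same arithmetic as div_factor() above; factor is in tenths */
    static uint64_t div_factor_sketch(uint64_t num, int factor)
    {
            if (factor == 10)
                    return num;
            return num * factor / 10;
    }

    int main(void)
    {
            /* 10% of a 100GiB device */
            printf("%llu\n", (unsigned long long)
                   div_factor_sketch(100ULL << 30, 1));  /* 10737418240 */
            return 0;
    }
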
1702 | |||
1703 | int btrfs_balance(struct btrfs_root *dev_root) | ||
1704 | { | ||
1705 | int ret; | ||
1706 | struct list_head *cur; | ||
1707 | struct list_head *devices = &dev_root->fs_info->fs_devices->devices; | ||
1708 | struct btrfs_device *device; | ||
1709 | u64 old_size; | ||
1710 | u64 size_to_free; | ||
1711 | struct btrfs_path *path; | ||
1712 | struct btrfs_key key; | ||
1713 | struct btrfs_chunk *chunk; | ||
1714 | struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; | ||
1715 | struct btrfs_trans_handle *trans; | ||
1716 | struct btrfs_key found_key; | ||
1717 | |||
1718 | if (dev_root->fs_info->sb->s_flags & MS_RDONLY) | ||
1719 | return -EROFS; | ||
1720 | |||
1721 | mutex_lock(&dev_root->fs_info->volume_mutex); | ||
1722 | dev_root = dev_root->fs_info->dev_root; | ||
1723 | |||
1724 | /* step one, make some room on all the devices */ | ||
1725 | list_for_each(cur, devices) { | ||
1726 | device = list_entry(cur, struct btrfs_device, dev_list); | ||
1727 | old_size = device->total_bytes; | ||
1728 | size_to_free = div_factor(old_size, 1); | ||
1729 | size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); | ||
1730 | if (!device->writeable || | ||
1731 | device->total_bytes - device->bytes_used > size_to_free) | ||
1732 | continue; | ||
1733 | |||
1734 | ret = btrfs_shrink_device(device, old_size - size_to_free); | ||
1735 | BUG_ON(ret); | ||
1736 | |||
1737 | trans = btrfs_start_transaction(dev_root, 1); | ||
1738 | BUG_ON(!trans); | ||
1739 | |||
1740 | ret = btrfs_grow_device(trans, device, old_size); | ||
1741 | BUG_ON(ret); | ||
1742 | |||
1743 | btrfs_end_transaction(trans, dev_root); | ||
1744 | } | ||
1745 | |||
1746 | /* step two, relocate all the chunks */ | ||
1747 | path = btrfs_alloc_path(); | ||
1748 | BUG_ON(!path); | ||
1749 | |||
1750 | key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; | ||
1751 | key.offset = (u64)-1; | ||
1752 | key.type = BTRFS_CHUNK_ITEM_KEY; | ||
1753 | |||
1754 | while (1) { | ||
1755 | ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); | ||
1756 | if (ret < 0) | ||
1757 | goto error; | ||
1758 | |||
1759 | /* | ||
1760 | * this shouldn't happen, it means the last relocate | ||
1761 | * failed | ||
1762 | */ | ||
1763 | if (ret == 0) | ||
1764 | break; | ||
1765 | |||
1766 | ret = btrfs_previous_item(chunk_root, path, 0, | ||
1767 | BTRFS_CHUNK_ITEM_KEY); | ||
1768 | if (ret) | ||
1769 | break; | ||
1770 | |||
1771 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
1772 | path->slots[0]); | ||
1773 | if (found_key.objectid != key.objectid) | ||
1774 | break; | ||
1775 | |||
1776 | chunk = btrfs_item_ptr(path->nodes[0], | ||
1777 | path->slots[0], | ||
1778 | struct btrfs_chunk); | ||
1779 | key.offset = found_key.offset; | ||
1780 | /* chunk zero is special */ | ||
1781 | if (key.offset == 0) | ||
1782 | break; | ||
1783 | |||
1784 | btrfs_release_path(chunk_root, path); | ||
1785 | ret = btrfs_relocate_chunk(chunk_root, | ||
1786 | chunk_root->root_key.objectid, | ||
1787 | found_key.objectid, | ||
1788 | found_key.offset); | ||
1789 | BUG_ON(ret); | ||
1790 | } | ||
1791 | ret = 0; | ||
1792 | error: | ||
1793 | btrfs_free_path(path); | ||
1794 | mutex_unlock(&dev_root->fs_info->volume_mutex); | ||
1795 | return ret; | ||
1796 | } | ||
1797 | |||
1798 | /* | ||
1799 | * shrinking a device means finding all of the device extents past | ||
1800 | * the new size, and then following the back refs to the chunks. | ||
1801 | * The chunk relocation code actually frees the device extent | ||
1802 | */ | ||
1803 | int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) | ||
1804 | { | ||
1805 | struct btrfs_trans_handle *trans; | ||
1806 | struct btrfs_root *root = device->dev_root; | ||
1807 | struct btrfs_dev_extent *dev_extent = NULL; | ||
1808 | struct btrfs_path *path; | ||
1809 | u64 length; | ||
1810 | u64 chunk_tree; | ||
1811 | u64 chunk_objectid; | ||
1812 | u64 chunk_offset; | ||
1813 | int ret; | ||
1814 | int slot; | ||
1815 | struct extent_buffer *l; | ||
1816 | struct btrfs_key key; | ||
1817 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | ||
1818 | u64 old_total = btrfs_super_total_bytes(super_copy); | ||
1819 | u64 diff = device->total_bytes - new_size; | ||
1820 | |||
1821 | if (new_size >= device->total_bytes) | ||
1822 | return -EINVAL; | ||
1823 | |||
1824 | path = btrfs_alloc_path(); | ||
1825 | if (!path) | ||
1826 | return -ENOMEM; | ||
1827 | |||
1828 | trans = btrfs_start_transaction(root, 1); | ||
1829 | if (!trans) { | ||
1830 | ret = -ENOMEM; | ||
1831 | goto done; | ||
1832 | } | ||
1833 | |||
1834 | path->reada = 2; | ||
1835 | |||
1836 | lock_chunks(root); | ||
1837 | |||
1838 | device->total_bytes = new_size; | ||
1839 | if (device->writeable) | ||
1840 | device->fs_devices->total_rw_bytes -= diff; | ||
1841 | ret = btrfs_update_device(trans, device); | ||
1842 | if (ret) { | ||
1843 | unlock_chunks(root); | ||
1844 | btrfs_end_transaction(trans, root); | ||
1845 | goto done; | ||
1846 | } | ||
1847 | WARN_ON(diff > old_total); | ||
1848 | btrfs_set_super_total_bytes(super_copy, old_total - diff); | ||
1849 | unlock_chunks(root); | ||
1850 | btrfs_end_transaction(trans, root); | ||
1851 | |||
1852 | key.objectid = device->devid; | ||
1853 | key.offset = (u64)-1; | ||
1854 | key.type = BTRFS_DEV_EXTENT_KEY; | ||
1855 | |||
1856 | while (1) { | ||
1857 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
1858 | if (ret < 0) | ||
1859 | goto done; | ||
1860 | |||
1861 | ret = btrfs_previous_item(root, path, 0, key.type); | ||
1862 | if (ret < 0) | ||
1863 | goto done; | ||
1864 | if (ret) { | ||
1865 | ret = 0; | ||
1866 | goto done; | ||
1867 | } | ||
1868 | |||
1869 | l = path->nodes[0]; | ||
1870 | slot = path->slots[0]; | ||
1871 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | ||
1872 | |||
1873 | if (key.objectid != device->devid) | ||
1874 | goto done; | ||
1875 | |||
1876 | dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); | ||
1877 | length = btrfs_dev_extent_length(l, dev_extent); | ||
1878 | |||
1879 | if (key.offset + length <= new_size) | ||
1880 | goto done; | ||
1881 | |||
1882 | chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); | ||
1883 | chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); | ||
1884 | chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); | ||
1885 | btrfs_release_path(root, path); | ||
1886 | |||
1887 | ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, | ||
1888 | chunk_offset); | ||
1889 | if (ret) | ||
1890 | goto done; | ||
1891 | } | ||
1892 | |||
1893 | done: | ||
1894 | btrfs_free_path(path); | ||
1895 | return ret; | ||
1896 | } | ||
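
The loop above walks device extents from the high end (key.offset starts at (u64)-1 and btrfs_previous_item() steps backwards), relocating every chunk whose extent crosses the new size; the first extent that fits entirely below the cut line ends the walk. The test itself is one comparison, sketched here with illustrative numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* a device extent at 'start' for 'length' bytes must be relocated
     * iff it pokes past the shrunken device size (the inverse of the
     * "key.offset + length <= new_size" early exit above) */
    static int extent_needs_relocation(uint64_t start, uint64_t length,
                                       uint64_t new_size)
    {
            return start + length > new_size;
    }

    int main(void)
    {
            uint64_t new_size = 1ULL << 30;         /* shrink to 1GiB */

            /* 256MiB extent at 900MiB crosses the 1GiB line */
            printf("%d\n", extent_needs_relocation(900ULL << 20,
                                                   256ULL << 20, new_size));
            /* 256MiB extent at 512MiB sits entirely below it */
            printf("%d\n", extent_needs_relocation(512ULL << 20,
                                                   256ULL << 20, new_size));
            return 0;
    }
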
1897 | |||
1898 | static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, | ||
1899 | struct btrfs_root *root, | ||
1900 | struct btrfs_key *key, | ||
1901 | struct btrfs_chunk *chunk, int item_size) | ||
1902 | { | ||
1903 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | ||
1904 | struct btrfs_disk_key disk_key; | ||
1905 | u32 array_size; | ||
1906 | u8 *ptr; | ||
1907 | |||
1908 | array_size = btrfs_super_sys_array_size(super_copy); | ||
1909 | if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) | ||
1910 | return -EFBIG; | ||
1911 | |||
1912 | ptr = super_copy->sys_chunk_array + array_size; | ||
1913 | btrfs_cpu_key_to_disk(&disk_key, key); | ||
1914 | memcpy(ptr, &disk_key, sizeof(disk_key)); | ||
1915 | ptr += sizeof(disk_key); | ||
1916 | memcpy(ptr, chunk, item_size); | ||
1917 | item_size += sizeof(disk_key); | ||
1918 | btrfs_set_super_sys_array_size(super_copy, array_size + item_size); | ||
1919 | return 0; | ||
1920 | } | ||
1921 | |||
1922 | static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, | ||
1923 | int num_stripes, int sub_stripes) | ||
1924 | { | ||
1925 | if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) | ||
1926 | return calc_size; | ||
1927 | else if (type & BTRFS_BLOCK_GROUP_RAID10) | ||
1928 | return calc_size * (num_stripes / sub_stripes); | ||
1929 | else | ||
1930 | return calc_size * num_stripes; | ||
1931 | } | ||
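
chunk_bytes_by_type() converts the per-device stripe size into the chunk's logical size: mirrored profiles expose one copy's worth, RAID10 one copy per sub-stripe group, and everything else the full stripe count. A standalone illustration (the profile bits are placeholders, not the on-disk values):

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative profile bits */
    #define P_RAID1  (1ULL << 0)
    #define P_DUP    (1ULL << 1)
    #define P_RAID0  (1ULL << 2)
    #define P_RAID10 (1ULL << 3)

    /* same mapping as chunk_bytes_by_type() above */
    static uint64_t chunk_bytes(uint64_t type, uint64_t calc_size,
                                int num_stripes, int sub_stripes)
    {
            if (type & (P_RAID1 | P_DUP))
                    return calc_size;               /* mirrored: one copy usable */
            if (type & P_RAID10)
                    return calc_size * (num_stripes / sub_stripes);
            return calc_size * num_stripes;         /* raid0 / single */
    }

    int main(void)
    {
            uint64_t s = 1ULL << 30;        /* 1GiB per-device stripe */

            printf("raid1:  %llu\n",
                   (unsigned long long)chunk_bytes(P_RAID1, s, 2, 0));
            printf("raid10: %llu\n",
                   (unsigned long long)chunk_bytes(P_RAID10, s, 4, 2));
            printf("raid0:  %llu\n",
                   (unsigned long long)chunk_bytes(P_RAID0, s, 4, 0));
            return 0;
    }
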
1932 | |||
1933 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | ||
1934 | struct btrfs_root *extent_root, | ||
1935 | struct map_lookup **map_ret, | ||
1936 | u64 *num_bytes, u64 *stripe_size, | ||
1937 | u64 start, u64 type) | ||
1938 | { | ||
1939 | struct btrfs_fs_info *info = extent_root->fs_info; | ||
1940 | struct btrfs_device *device = NULL; | ||
1941 | struct btrfs_fs_devices *fs_devices = info->fs_devices; | ||
1942 | struct list_head *cur; | ||
1943 | struct map_lookup *map = NULL; | ||
1944 | struct extent_map_tree *em_tree; | ||
1945 | struct extent_map *em; | ||
1946 | struct list_head private_devs; | ||
1947 | int min_stripe_size = 1 * 1024 * 1024; | ||
1948 | u64 calc_size = 1024 * 1024 * 1024; | ||
1949 | u64 max_chunk_size = calc_size; | ||
1950 | u64 min_free; | ||
1951 | u64 avail; | ||
1952 | u64 max_avail = 0; | ||
1953 | u64 dev_offset; | ||
1954 | int num_stripes = 1; | ||
1955 | int min_stripes = 1; | ||
1956 | int sub_stripes = 0; | ||
1957 | int looped = 0; | ||
1958 | int ret; | ||
1959 | int index; | ||
1960 | int stripe_len = 64 * 1024; | ||
1961 | |||
1962 | if ((type & BTRFS_BLOCK_GROUP_RAID1) && | ||
1963 | (type & BTRFS_BLOCK_GROUP_DUP)) { | ||
1964 | WARN_ON(1); | ||
1965 | type &= ~BTRFS_BLOCK_GROUP_DUP; | ||
1966 | } | ||
1967 | if (list_empty(&fs_devices->alloc_list)) | ||
1968 | return -ENOSPC; | ||
1969 | |||
1970 | if (type & (BTRFS_BLOCK_GROUP_RAID0)) { | ||
1971 | num_stripes = fs_devices->rw_devices; | ||
1972 | min_stripes = 2; | ||
1973 | } | ||
1974 | if (type & (BTRFS_BLOCK_GROUP_DUP)) { | ||
1975 | num_stripes = 2; | ||
1976 | min_stripes = 2; | ||
1977 | } | ||
1978 | if (type & (BTRFS_BLOCK_GROUP_RAID1)) { | ||
1979 | num_stripes = min_t(u64, 2, fs_devices->rw_devices); | ||
1980 | if (num_stripes < 2) | ||
1981 | return -ENOSPC; | ||
1982 | min_stripes = 2; | ||
1983 | } | ||
1984 | if (type & (BTRFS_BLOCK_GROUP_RAID10)) { | ||
1985 | num_stripes = fs_devices->rw_devices; | ||
1986 | if (num_stripes < 4) | ||
1987 | return -ENOSPC; | ||
1988 | num_stripes &= ~(u32)1; | ||
1989 | sub_stripes = 2; | ||
1990 | min_stripes = 4; | ||
1991 | } | ||
1992 | |||
1993 | if (type & BTRFS_BLOCK_GROUP_DATA) { | ||
1994 | max_chunk_size = 10 * calc_size; | ||
1995 | min_stripe_size = 64 * 1024 * 1024; | ||
1996 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { | ||
1997 | max_chunk_size = 4 * calc_size; | ||
1998 | min_stripe_size = 32 * 1024 * 1024; | ||
1999 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { | ||
2000 | calc_size = 8 * 1024 * 1024; | ||
2001 | max_chunk_size = calc_size * 2; | ||
2002 | min_stripe_size = 1 * 1024 * 1024; | ||
2003 | } | ||
2004 | |||
2005 | /* we don't want a chunk larger than 10% of writeable space */ | ||
2006 | max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), | ||
2007 | max_chunk_size); | ||
2008 | |||
2009 | again: | ||
2010 | if (!map || map->num_stripes != num_stripes) { | ||
2011 | kfree(map); | ||
2012 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | ||
2013 | if (!map) | ||
2014 | return -ENOMEM; | ||
2015 | map->num_stripes = num_stripes; | ||
2016 | } | ||
2017 | |||
2018 | if (calc_size * num_stripes > max_chunk_size) { | ||
2019 | calc_size = max_chunk_size; | ||
2020 | do_div(calc_size, num_stripes); | ||
2021 | do_div(calc_size, stripe_len); | ||
2022 | calc_size *= stripe_len; | ||
2023 | } | ||
2024 | /* we don't want tiny stripes */ | ||
2025 | calc_size = max_t(u64, min_stripe_size, calc_size); | ||
2026 | |||
2027 | do_div(calc_size, stripe_len); | ||
2028 | calc_size *= stripe_len; | ||
2029 | |||
2030 | cur = fs_devices->alloc_list.next; | ||
2031 | index = 0; | ||
2032 | |||
2033 | if (type & BTRFS_BLOCK_GROUP_DUP) | ||
2034 | min_free = calc_size * 2; | ||
2035 | else | ||
2036 | min_free = calc_size; | ||
2037 | |||
2038 | /* | ||
2039 | * we add 1MB because we never use the first 1MB of the device. Once | ||
2040 | * we've looped, we are likely allocating the maximum amount of | ||
2041 | * space left already, so skip the extra padding | ||
2042 | */ | ||
2043 | if (!looped) | ||
2044 | min_free += 1024 * 1024; | ||
2045 | |||
2046 | INIT_LIST_HEAD(&private_devs); | ||
2047 | while (index < num_stripes) { | ||
2048 | device = list_entry(cur, struct btrfs_device, dev_alloc_list); | ||
2049 | BUG_ON(!device->writeable); | ||
2050 | if (device->total_bytes > device->bytes_used) | ||
2051 | avail = device->total_bytes - device->bytes_used; | ||
2052 | else | ||
2053 | avail = 0; | ||
2054 | cur = cur->next; | ||
2055 | |||
2056 | if (device->in_fs_metadata && avail >= min_free) { | ||
2057 | ret = find_free_dev_extent(trans, device, | ||
2058 | min_free, &dev_offset); | ||
2059 | if (ret == 0) { | ||
2060 | list_move_tail(&device->dev_alloc_list, | ||
2061 | &private_devs); | ||
2062 | map->stripes[index].dev = device; | ||
2063 | map->stripes[index].physical = dev_offset; | ||
2064 | index++; | ||
2065 | if (type & BTRFS_BLOCK_GROUP_DUP) { | ||
2066 | map->stripes[index].dev = device; | ||
2067 | map->stripes[index].physical = | ||
2068 | dev_offset + calc_size; | ||
2069 | index++; | ||
2070 | } | ||
2071 | } | ||
2072 | } else if (device->in_fs_metadata && avail > max_avail) | ||
2073 | max_avail = avail; | ||
2074 | if (cur == &fs_devices->alloc_list) | ||
2075 | break; | ||
2076 | } | ||
2077 | list_splice(&private_devs, &fs_devices->alloc_list); | ||
2078 | if (index < num_stripes) { | ||
2079 | if (index >= min_stripes) { | ||
2080 | num_stripes = index; | ||
2081 | if (type & (BTRFS_BLOCK_GROUP_RAID10)) { | ||
2082 | num_stripes /= sub_stripes; | ||
2083 | num_stripes *= sub_stripes; | ||
2084 | } | ||
2085 | looped = 1; | ||
2086 | goto again; | ||
2087 | } | ||
2088 | if (!looped && max_avail > 0) { | ||
2089 | looped = 1; | ||
2090 | calc_size = max_avail; | ||
2091 | goto again; | ||
2092 | } | ||
2093 | kfree(map); | ||
2094 | return -ENOSPC; | ||
2095 | } | ||
2096 | map->sector_size = extent_root->sectorsize; | ||
2097 | map->stripe_len = stripe_len; | ||
2098 | map->io_align = stripe_len; | ||
2099 | map->io_width = stripe_len; | ||
2100 | map->type = type; | ||
2101 | map->num_stripes = num_stripes; | ||
2102 | map->sub_stripes = sub_stripes; | ||
2103 | |||
2104 | *map_ret = map; | ||
2105 | *stripe_size = calc_size; | ||
2106 | *num_bytes = chunk_bytes_by_type(type, calc_size, | ||
2107 | num_stripes, sub_stripes); | ||
2108 | |||
2109 | em = alloc_extent_map(GFP_NOFS); | ||
2110 | if (!em) { | ||
2111 | kfree(map); | ||
2112 | return -ENOMEM; | ||
2113 | } | ||
2114 | em->bdev = (struct block_device *)map; | ||
2115 | em->start = start; | ||
2116 | em->len = *num_bytes; | ||
2117 | em->block_start = 0; | ||
2118 | em->block_len = em->len; | ||
2119 | |||
2120 | em_tree = &extent_root->fs_info->mapping_tree.map_tree; | ||
2121 | spin_lock(&em_tree->lock); | ||
2122 | ret = add_extent_mapping(em_tree, em); | ||
2123 | spin_unlock(&em_tree->lock); | ||
2124 | BUG_ON(ret); | ||
2125 | free_extent_map(em); | ||
2126 | |||
2127 | ret = btrfs_make_block_group(trans, extent_root, 0, type, | ||
2128 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | ||
2129 | start, *num_bytes); | ||
2130 | BUG_ON(ret); | ||
2131 | |||
2132 | index = 0; | ||
2133 | while (index < map->num_stripes) { | ||
2134 | device = map->stripes[index].dev; | ||
2135 | dev_offset = map->stripes[index].physical; | ||
2136 | |||
2137 | ret = btrfs_alloc_dev_extent(trans, device, | ||
2138 | info->chunk_root->root_key.objectid, | ||
2139 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | ||
2140 | start, dev_offset, calc_size); | ||
2141 | BUG_ON(ret); | ||
2142 | index++; | ||
2143 | } | ||
2144 | |||
2145 | return 0; | ||
2146 | } | ||
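
__btrfs_alloc_chunk() sizes the per-device stripe in three steps: cap the whole chunk at max_chunk_size, floor the result at min_stripe_size, and round down to a stripe_len multiple. A sketch of that clamping with a worked data-chunk example (the helper name is hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t clamp_calc_size(uint64_t calc_size, uint64_t max_chunk_size,
                                    uint64_t min_stripe_size, uint64_t stripe_len,
                                    int num_stripes)
    {
            /* cap the whole chunk, then round the stripe to stripe_len */
            if (calc_size * num_stripes > max_chunk_size) {
                    calc_size = max_chunk_size / num_stripes;
                    calc_size = calc_size / stripe_len * stripe_len;
            }
            /* we don't want tiny stripes */
            if (calc_size < min_stripe_size)
                    calc_size = min_stripe_size;
            return calc_size / stripe_len * stripe_len;
    }

    int main(void)
    {
            /* data chunk over 4 devices: 1GiB stripes capped by a 2GiB max */
            printf("%llu\n", (unsigned long long)
                   clamp_calc_size(1ULL << 30, 2ULL << 30, 64ULL << 20,
                                   64 * 1024, 4));      /* 536870912 (512MiB) */
            return 0;
    }
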
2147 | |||
2148 | static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, | ||
2149 | struct btrfs_root *extent_root, | ||
2150 | struct map_lookup *map, u64 chunk_offset, | ||
2151 | u64 chunk_size, u64 stripe_size) | ||
2152 | { | ||
2153 | u64 dev_offset; | ||
2154 | struct btrfs_key key; | ||
2155 | struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; | ||
2156 | struct btrfs_device *device; | ||
2157 | struct btrfs_chunk *chunk; | ||
2158 | struct btrfs_stripe *stripe; | ||
2159 | size_t item_size = btrfs_chunk_item_size(map->num_stripes); | ||
2160 | int index = 0; | ||
2161 | int ret; | ||
2162 | |||
2163 | chunk = kzalloc(item_size, GFP_NOFS); | ||
2164 | if (!chunk) | ||
2165 | return -ENOMEM; | ||
2166 | |||
2167 | index = 0; | ||
2168 | while (index < map->num_stripes) { | ||
2169 | device = map->stripes[index].dev; | ||
2170 | device->bytes_used += stripe_size; | ||
2171 | ret = btrfs_update_device(trans, device); | ||
2172 | BUG_ON(ret); | ||
2173 | index++; | ||
2174 | } | ||
2175 | |||
2176 | index = 0; | ||
2177 | stripe = &chunk->stripe; | ||
2178 | while (index < map->num_stripes) { | ||
2179 | device = map->stripes[index].dev; | ||
2180 | dev_offset = map->stripes[index].physical; | ||
2181 | |||
2182 | btrfs_set_stack_stripe_devid(stripe, device->devid); | ||
2183 | btrfs_set_stack_stripe_offset(stripe, dev_offset); | ||
2184 | memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); | ||
2185 | stripe++; | ||
2186 | index++; | ||
2187 | } | ||
2188 | |||
2189 | btrfs_set_stack_chunk_length(chunk, chunk_size); | ||
2190 | btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); | ||
2191 | btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); | ||
2192 | btrfs_set_stack_chunk_type(chunk, map->type); | ||
2193 | btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); | ||
2194 | btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); | ||
2195 | btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); | ||
2196 | btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); | ||
2197 | btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); | ||
2198 | |||
2199 | key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; | ||
2200 | key.type = BTRFS_CHUNK_ITEM_KEY; | ||
2201 | key.offset = chunk_offset; | ||
2202 | |||
2203 | ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); | ||
2204 | BUG_ON(ret); | ||
2205 | |||
2206 | if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { | ||
2207 | ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, | ||
2208 | item_size); | ||
2209 | BUG_ON(ret); | ||
2210 | } | ||
2211 | kfree(chunk); | ||
2212 | return 0; | ||
2213 | } | ||
2214 | |||
2215 | /* | ||
2216 | * Chunk allocation falls into two parts. The first part does the work | ||
2217 | * that makes the newly allocated chunk usable, but does not modify | ||
2218 | * the chunk tree. The second part does the work that requires | ||
2219 | * modifying the chunk tree. This division is important for the | ||
2220 | * bootstrap process of adding storage to a seed btrfs. | ||
2221 | */ | ||
2222 | int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | ||
2223 | struct btrfs_root *extent_root, u64 type) | ||
2224 | { | ||
2225 | u64 chunk_offset; | ||
2226 | u64 chunk_size; | ||
2227 | u64 stripe_size; | ||
2228 | struct map_lookup *map; | ||
2229 | struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; | ||
2230 | int ret; | ||
2231 | |||
2232 | ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, | ||
2233 | &chunk_offset); | ||
2234 | if (ret) | ||
2235 | return ret; | ||
2236 | |||
2237 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, | ||
2238 | &stripe_size, chunk_offset, type); | ||
2239 | if (ret) | ||
2240 | return ret; | ||
2241 | |||
2242 | ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, | ||
2243 | chunk_size, stripe_size); | ||
2244 | BUG_ON(ret); | ||
2245 | return 0; | ||
2246 | } | ||
2247 | |||
2248 | static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | ||
2249 | struct btrfs_root *root, | ||
2250 | struct btrfs_device *device) | ||
2251 | { | ||
2252 | u64 chunk_offset; | ||
2253 | u64 sys_chunk_offset; | ||
2254 | u64 chunk_size; | ||
2255 | u64 sys_chunk_size; | ||
2256 | u64 stripe_size; | ||
2257 | u64 sys_stripe_size; | ||
2258 | u64 alloc_profile; | ||
2259 | struct map_lookup *map; | ||
2260 | struct map_lookup *sys_map; | ||
2261 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
2262 | struct btrfs_root *extent_root = fs_info->extent_root; | ||
2263 | int ret; | ||
2264 | |||
2265 | ret = find_next_chunk(fs_info->chunk_root, | ||
2266 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); | ||
2267 | BUG_ON(ret); | ||
2268 | |||
2269 | alloc_profile = BTRFS_BLOCK_GROUP_METADATA | | ||
2270 | (fs_info->metadata_alloc_profile & | ||
2271 | fs_info->avail_metadata_alloc_bits); | ||
2272 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | ||
2273 | |||
2274 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, | ||
2275 | &stripe_size, chunk_offset, alloc_profile); | ||
2276 | BUG_ON(ret); | ||
2277 | |||
2278 | sys_chunk_offset = chunk_offset + chunk_size; | ||
2279 | |||
2280 | alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | | ||
2281 | (fs_info->system_alloc_profile & | ||
2282 | fs_info->avail_system_alloc_bits); | ||
2283 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | ||
2284 | |||
2285 | ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, | ||
2286 | &sys_chunk_size, &sys_stripe_size, | ||
2287 | sys_chunk_offset, alloc_profile); | ||
2288 | BUG_ON(ret); | ||
2289 | |||
2290 | ret = btrfs_add_device(trans, fs_info->chunk_root, device); | ||
2291 | BUG_ON(ret); | ||
2292 | |||
2293 | /* | ||
2294 | * Modifying the chunk tree requires allocating new blocks from both | ||
2295 | * the system block group and the metadata block group. So we can | ||
2296 | * only do operations that modify the chunk tree after both block | ||
2297 | * groups have been created. | ||
2298 | */ | ||
2299 | ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, | ||
2300 | chunk_size, stripe_size); | ||
2301 | BUG_ON(ret); | ||
2302 | |||
2303 | ret = __finish_chunk_alloc(trans, extent_root, sys_map, | ||
2304 | sys_chunk_offset, sys_chunk_size, | ||
2305 | sys_stripe_size); | ||
2306 | BUG_ON(ret); | ||
2307 | return 0; | ||
2308 | } | ||
2309 | |||
2310 | int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) | ||
2311 | { | ||
2312 | struct extent_map *em; | ||
2313 | struct map_lookup *map; | ||
2314 | struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; | ||
2315 | int readonly = 0; | ||
2316 | int i; | ||
2317 | |||
2318 | spin_lock(&map_tree->map_tree.lock); | ||
2319 | em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); | ||
2320 | spin_unlock(&map_tree->map_tree.lock); | ||
2321 | if (!em) | ||
2322 | return 1; | ||
2323 | |||
2324 | map = (struct map_lookup *)em->bdev; | ||
2325 | for (i = 0; i < map->num_stripes; i++) { | ||
2326 | if (!map->stripes[i].dev->writeable) { | ||
2327 | readonly = 1; | ||
2328 | break; | ||
2329 | } | ||
2330 | } | ||
2331 | free_extent_map(em); | ||
2332 | return readonly; | ||
2333 | } | ||
2334 | |||
2335 | void btrfs_mapping_init(struct btrfs_mapping_tree *tree) | ||
2336 | { | ||
2337 | extent_map_tree_init(&tree->map_tree, GFP_NOFS); | ||
2338 | } | ||
2339 | |||
2340 | void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) | ||
2341 | { | ||
2342 | struct extent_map *em; | ||
2343 | |||
2344 | while (1) { | ||
2345 | spin_lock(&tree->map_tree.lock); | ||
2346 | em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); | ||
2347 | if (em) | ||
2348 | remove_extent_mapping(&tree->map_tree, em); | ||
2349 | spin_unlock(&tree->map_tree.lock); | ||
2350 | if (!em) | ||
2351 | break; | ||
2352 | kfree(em->bdev); | ||
2353 | /* once for us */ | ||
2354 | free_extent_map(em); | ||
2355 | /* once for the tree */ | ||
2356 | free_extent_map(em); | ||
2357 | } | ||
2358 | } | ||
2359 | |||
2360 | int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) | ||
2361 | { | ||
2362 | struct extent_map *em; | ||
2363 | struct map_lookup *map; | ||
2364 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
2365 | int ret; | ||
2366 | |||
2367 | spin_lock(&em_tree->lock); | ||
2368 | em = lookup_extent_mapping(em_tree, logical, len); | ||
2369 | spin_unlock(&em_tree->lock); | ||
2370 | BUG_ON(!em); | ||
2371 | |||
2372 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
2373 | map = (struct map_lookup *)em->bdev; | ||
2374 | if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) | ||
2375 | ret = map->num_stripes; | ||
2376 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) | ||
2377 | ret = map->sub_stripes; | ||
2378 | else | ||
2379 | ret = 1; | ||
2380 | free_extent_map(em); | ||
2381 | return ret; | ||
2382 | } | ||
2383 | |||
2384 | static int find_live_mirror(struct map_lookup *map, int first, int num, | ||
2385 | int optimal) | ||
2386 | { | ||
2387 | int i; | ||
2388 | if (map->stripes[optimal].dev->bdev) | ||
2389 | return optimal; | ||
2390 | for (i = first; i < first + num; i++) { | ||
2391 | if (map->stripes[i].dev->bdev) | ||
2392 | return i; | ||
2393 | } | ||
2394 | /* we couldn't find one that doesn't fail. Just return something | ||
2395 | * and the io error handling code will clean up eventually | ||
2396 | */ | ||
2397 | return optimal; | ||
2398 | } | ||
2399 | |||
2400 | static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | ||
2401 | u64 logical, u64 *length, | ||
2402 | struct btrfs_multi_bio **multi_ret, | ||
2403 | int mirror_num, struct page *unplug_page) | ||
2404 | { | ||
2405 | struct extent_map *em; | ||
2406 | struct map_lookup *map; | ||
2407 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
2408 | u64 offset; | ||
2409 | u64 stripe_offset; | ||
2410 | u64 stripe_nr; | ||
2411 | int stripes_allocated = 8; | ||
2412 | int stripes_required = 1; | ||
2413 | int stripe_index; | ||
2414 | int i; | ||
2415 | int num_stripes; | ||
2416 | int max_errors = 0; | ||
2417 | struct btrfs_multi_bio *multi = NULL; | ||
2418 | |||
2419 | if (multi_ret && !(rw & (1 << BIO_RW))) | ||
2420 | stripes_allocated = 1; | ||
2421 | again: | ||
2422 | if (multi_ret) { | ||
2423 | multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), | ||
2424 | GFP_NOFS); | ||
2425 | if (!multi) | ||
2426 | return -ENOMEM; | ||
2427 | |||
2428 | atomic_set(&multi->error, 0); | ||
2429 | } | ||
2430 | |||
2431 | spin_lock(&em_tree->lock); | ||
2432 | em = lookup_extent_mapping(em_tree, logical, *length); | ||
2433 | spin_unlock(&em_tree->lock); | ||
2434 | |||
2435 | if (!em && unplug_page) | ||
2436 | return 0; | ||
2437 | |||
2438 | if (!em) { | ||
2439 | printk(KERN_CRIT "unable to find logical %llu len %llu\n", | ||
2440 | (unsigned long long)logical, | ||
2441 | (unsigned long long)*length); | ||
2442 | BUG(); | ||
2443 | } | ||
2444 | |||
2445 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
2446 | map = (struct map_lookup *)em->bdev; | ||
2447 | offset = logical - em->start; | ||
2448 | |||
2449 | if (mirror_num > map->num_stripes) | ||
2450 | mirror_num = 0; | ||
2451 | |||
2452 | /* if our multi bio struct is too small, back off and try again */ | ||
2453 | if (rw & (1 << BIO_RW)) { | ||
2454 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | ||
2455 | BTRFS_BLOCK_GROUP_DUP)) { | ||
2456 | stripes_required = map->num_stripes; | ||
2457 | max_errors = 1; | ||
2458 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | ||
2459 | stripes_required = map->sub_stripes; | ||
2460 | max_errors = 1; | ||
2461 | } | ||
2462 | } | ||
2463 | if (multi_ret && rw == WRITE && | ||
2464 | stripes_allocated < stripes_required) { | ||
2465 | stripes_allocated = map->num_stripes; | ||
2466 | free_extent_map(em); | ||
2467 | kfree(multi); | ||
2468 | goto again; | ||
2469 | } | ||
2470 | stripe_nr = offset; | ||
2471 | /* | ||
2472 | * stripe_nr counts the total number of stripes we have to stride | ||
2473 | * to get to this block | ||
2474 | */ | ||
2475 | do_div(stripe_nr, map->stripe_len); | ||
2476 | |||
2477 | stripe_offset = stripe_nr * map->stripe_len; | ||
2478 | BUG_ON(offset < stripe_offset); | ||
2479 | |||
2480 | /* stripe_offset is the offset of this block in its stripe */ | ||
2481 | stripe_offset = offset - stripe_offset; | ||
2482 | |||
2483 | if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | ||
2484 | BTRFS_BLOCK_GROUP_RAID10 | | ||
2485 | BTRFS_BLOCK_GROUP_DUP)) { | ||
2486 | /* we limit the length of each bio to what fits in a stripe */ | ||
2487 | *length = min_t(u64, em->len - offset, | ||
2488 | map->stripe_len - stripe_offset); | ||
2489 | } else { | ||
2490 | *length = em->len - offset; | ||
2491 | } | ||
2492 | |||
2493 | if (!multi_ret && !unplug_page) | ||
2494 | goto out; | ||
2495 | |||
2496 | num_stripes = 1; | ||
2497 | stripe_index = 0; | ||
2498 | if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | ||
2499 | if (unplug_page || (rw & (1 << BIO_RW))) | ||
2500 | num_stripes = map->num_stripes; | ||
2501 | else if (mirror_num) | ||
2502 | stripe_index = mirror_num - 1; | ||
2503 | else { | ||
2504 | stripe_index = find_live_mirror(map, 0, | ||
2505 | map->num_stripes, | ||
2506 | current->pid % map->num_stripes); | ||
2507 | } | ||
2508 | |||
2509 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { | ||
2510 | if (rw & (1 << BIO_RW)) | ||
2511 | num_stripes = map->num_stripes; | ||
2512 | else if (mirror_num) | ||
2513 | stripe_index = mirror_num - 1; | ||
2514 | |||
2515 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | ||
2516 | int factor = map->num_stripes / map->sub_stripes; | ||
2517 | |||
2518 | stripe_index = do_div(stripe_nr, factor); | ||
2519 | stripe_index *= map->sub_stripes; | ||
2520 | |||
2521 | if (unplug_page || (rw & (1 << BIO_RW))) | ||
2522 | num_stripes = map->sub_stripes; | ||
2523 | else if (mirror_num) | ||
2524 | stripe_index += mirror_num - 1; | ||
2525 | else { | ||
2526 | stripe_index = find_live_mirror(map, stripe_index, | ||
2527 | map->sub_stripes, stripe_index + | ||
2528 | current->pid % map->sub_stripes); | ||
2529 | } | ||
2530 | } else { | ||
2531 | /* | ||
2532 | * after this do_div call, stripe_nr is the number of stripes | ||
2533 | * on this device we have to walk to find the data, and | ||
2534 | * stripe_index is the number of our device in the stripe array | ||
2535 | */ | ||
2536 | stripe_index = do_div(stripe_nr, map->num_stripes); | ||
2537 | } | ||
2538 | BUG_ON(stripe_index >= map->num_stripes); | ||
2539 | |||
2540 | for (i = 0; i < num_stripes; i++) { | ||
2541 | if (unplug_page) { | ||
2542 | struct btrfs_device *device; | ||
2543 | struct backing_dev_info *bdi; | ||
2544 | |||
2545 | device = map->stripes[stripe_index].dev; | ||
2546 | if (device->bdev) { | ||
2547 | bdi = blk_get_backing_dev_info(device->bdev); | ||
2548 | if (bdi->unplug_io_fn) | ||
2549 | bdi->unplug_io_fn(bdi, unplug_page); | ||
2550 | } | ||
2551 | } else { | ||
2552 | multi->stripes[i].physical = | ||
2553 | map->stripes[stripe_index].physical + | ||
2554 | stripe_offset + stripe_nr * map->stripe_len; | ||
2555 | multi->stripes[i].dev = map->stripes[stripe_index].dev; | ||
2556 | } | ||
2557 | stripe_index++; | ||
2558 | } | ||
2559 | if (multi_ret) { | ||
2560 | *multi_ret = multi; | ||
2561 | multi->num_stripes = num_stripes; | ||
2562 | multi->max_errors = max_errors; | ||
2563 | } | ||
2564 | out: | ||
2565 | free_extent_map(em); | ||
2566 | return 0; | ||
2567 | } | ||
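
The striping arithmetic above reduces to: divide the chunk-relative offset by stripe_len to get a global stripe number, take the remainder against num_stripes to pick the device, and keep the quotient as the stripe number on that device (the kernel's do_div() returns the remainder and leaves the quotient in place). A worked RAID0 example:

    #include <stdint.h>
    #include <stdio.h>

    /* the core RAID0 case of __btrfs_map_block(): which device a
     * chunk-relative offset lands on, and where inside that stripe */
    static void map_raid0(uint64_t offset, uint64_t stripe_len,
                          int num_stripes, uint64_t *dev_stripe_nr,
                          int *stripe_index, uint64_t *stripe_offset)
    {
            uint64_t stripe_nr = offset / stripe_len;  /* global stripe number */

            *stripe_offset = offset - stripe_nr * stripe_len;
            *stripe_index = (int)(stripe_nr % num_stripes);  /* which device */
            *dev_stripe_nr = stripe_nr / num_stripes;  /* stripe # on that dev */
    }

    int main(void)
    {
            uint64_t nr, off;
            int idx;

            /* 64K stripes over 3 devices: offset 200K = stripe 3, +8K */
            map_raid0(200 * 1024, 64 * 1024, 3, &nr, &idx, &off);
            printf("dev %d, stripe %llu, offset %llu\n", idx,
                   (unsigned long long)nr, (unsigned long long)off);
            /* -> dev 0, stripe 1, offset 8192 */
            return 0;
    }
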
2568 | |||
2569 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | ||
2570 | u64 logical, u64 *length, | ||
2571 | struct btrfs_multi_bio **multi_ret, int mirror_num) | ||
2572 | { | ||
2573 | return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, | ||
2574 | mirror_num, NULL); | ||
2575 | } | ||
2576 | |||
2577 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | ||
2578 | u64 chunk_start, u64 physical, u64 devid, | ||
2579 | u64 **logical, int *naddrs, int *stripe_len) | ||
2580 | { | ||
2581 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
2582 | struct extent_map *em; | ||
2583 | struct map_lookup *map; | ||
2584 | u64 *buf; | ||
2585 | u64 bytenr; | ||
2586 | u64 length; | ||
2587 | u64 stripe_nr; | ||
2588 | int i, j, nr = 0; | ||
2589 | |||
2590 | spin_lock(&em_tree->lock); | ||
2591 | em = lookup_extent_mapping(em_tree, chunk_start, 1); | ||
2592 | spin_unlock(&em_tree->lock); | ||
2593 | |||
2594 | BUG_ON(!em || em->start != chunk_start); | ||
2595 | map = (struct map_lookup *)em->bdev; | ||
2596 | |||
2597 | length = em->len; | ||
2598 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) | ||
2599 | do_div(length, map->num_stripes / map->sub_stripes); | ||
2600 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) | ||
2601 | do_div(length, map->num_stripes); | ||
2602 | |||
2603 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); | ||
2604 | BUG_ON(!buf); | ||
2605 | |||
2606 | for (i = 0; i < map->num_stripes; i++) { | ||
2607 | if (devid && map->stripes[i].dev->devid != devid) | ||
2608 | continue; | ||
2609 | if (map->stripes[i].physical > physical || | ||
2610 | map->stripes[i].physical + length <= physical) | ||
2611 | continue; | ||
2612 | |||
2613 | stripe_nr = physical - map->stripes[i].physical; | ||
2614 | do_div(stripe_nr, map->stripe_len); | ||
2615 | |||
2616 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | ||
2617 | stripe_nr = stripe_nr * map->num_stripes + i; | ||
2618 | do_div(stripe_nr, map->sub_stripes); | ||
2619 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | ||
2620 | stripe_nr = stripe_nr * map->num_stripes + i; | ||
2621 | } | ||
2622 | bytenr = chunk_start + stripe_nr * map->stripe_len; | ||
2623 | WARN_ON(nr >= map->num_stripes); | ||
2624 | for (j = 0; j < nr; j++) { | ||
2625 | if (buf[j] == bytenr) | ||
2626 | break; | ||
2627 | } | ||
2628 | if (j == nr) { | ||
2629 | WARN_ON(nr >= map->num_stripes); | ||
2630 | buf[nr++] = bytenr; | ||
2631 | } | ||
2632 | } | ||
2633 | |||
2634 | for (i = 0; i < nr; i++) { | ||
2635 | struct btrfs_multi_bio *multi; | ||
2636 | struct btrfs_bio_stripe *stripe; | ||
2637 | int ret; | ||
2638 | |||
2639 | length = 1; | ||
2640 | ret = btrfs_map_block(map_tree, WRITE, buf[i], | ||
2641 | &length, &multi, 0); | ||
2642 | BUG_ON(ret); | ||
2643 | |||
2644 | stripe = multi->stripes; | ||
2645 | for (j = 0; j < multi->num_stripes; j++, stripe++) { | ||
2646 | if (physical >= stripe->physical && | ||
2647 | physical < stripe->physical + length) | ||
2648 | break; | ||
2649 | } | ||
2650 | BUG_ON(j >= multi->num_stripes); | ||
2651 | kfree(multi); | ||
2652 | } | ||
2653 | |||
2654 | *logical = buf; | ||
2655 | *naddrs = nr; | ||
2656 | *stripe_len = map->stripe_len; | ||
2657 | |||
2658 | free_extent_map(em); | ||
2659 | return 0; | ||
2660 | } | ||
2661 | |||
2662 | int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, | ||
2663 | u64 logical, struct page *page) | ||
2664 | { | ||
2665 | u64 length = PAGE_CACHE_SIZE; | ||
2666 | return __btrfs_map_block(map_tree, READ, logical, &length, | ||
2667 | NULL, 0, page); | ||
2668 | } | ||
2669 | |||
2670 | static void end_bio_multi_stripe(struct bio *bio, int err) | ||
2671 | { | ||
2672 | struct btrfs_multi_bio *multi = bio->bi_private; | ||
2673 | int is_orig_bio = 0; | ||
2674 | |||
2675 | if (err) | ||
2676 | atomic_inc(&multi->error); | ||
2677 | |||
2678 | if (bio == multi->orig_bio) | ||
2679 | is_orig_bio = 1; | ||
2680 | |||
2681 | if (atomic_dec_and_test(&multi->stripes_pending)) { | ||
2682 | if (!is_orig_bio) { | ||
2683 | bio_put(bio); | ||
2684 | bio = multi->orig_bio; | ||
2685 | } | ||
2686 | bio->bi_private = multi->private; | ||
2687 | bio->bi_end_io = multi->end_io; | ||
2688 | /* only send an error to the higher layers if it is | ||
2689 | * beyond the tolerance of the multi-bio | ||
2690 | */ | ||
2691 | if (atomic_read(&multi->error) > multi->max_errors) { | ||
2692 | err = -EIO; | ||
2693 | } else if (err) { | ||
2694 | /* | ||
2695 | * this bio is actually up to date, we didn't | ||
2696 | * go over the max number of errors | ||
2697 | */ | ||
2698 | set_bit(BIO_UPTODATE, &bio->bi_flags); | ||
2699 | err = 0; | ||
2700 | } | ||
2701 | kfree(multi); | ||
2702 | |||
2703 | bio_endio(bio, err); | ||
2704 | } else if (!is_orig_bio) { | ||
2705 | bio_put(bio); | ||
2706 | } | ||
2707 | } | ||
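Worked example of the tolerance check above (illustrative numbers, assuming the RAID1 write case where max_errors is set to 1 by the mapping code): a write fans out to two mirrors, so stripes_pending starts at 2. If exactly one mirror fails, error reaches 1, which is not beyond max_errors, so the original bio completes with BIO_UPTODATE set and err cleared back to 0; only a second failure surfaces -EIO to the higher layers.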
2708 | |||
2709 | struct async_sched { | ||
2710 | struct bio *bio; | ||
2711 | int rw; | ||
2712 | struct btrfs_fs_info *info; | ||
2713 | struct btrfs_work work; | ||
2714 | }; | ||
2715 | |||
2716 | /* | ||
2717 | * see run_scheduled_bios for a description of why bios are collected for | ||
2718 | * async submit. | ||
2719 | * | ||
2720 | * This will add one bio to the pending list for a device and make sure | ||
2721 | * the work struct is scheduled. | ||
2722 | */ | ||
2723 | static noinline int schedule_bio(struct btrfs_root *root, | ||
2724 | struct btrfs_device *device, | ||
2725 | int rw, struct bio *bio) | ||
2726 | { | ||
2727 | int should_queue = 1; | ||
2728 | |||
2729 | /* don't bother with additional async steps for reads, right now */ | ||
2730 | if (!(rw & (1 << BIO_RW))) { | ||
2731 | bio_get(bio); | ||
2732 | submit_bio(rw, bio); | ||
2733 | bio_put(bio); | ||
2734 | return 0; | ||
2735 | } | ||
2736 | |||
2737 | /* | ||
2738 | * nr_async_bios allows us to reliably return congestion to the | ||
2739 | * higher layers. Otherwise, the async bio makes it appear we have | ||
2740 | * made progress against dirty pages when we've really just put it | ||
2741 | * on a queue for later | ||
2742 | */ | ||
2743 | atomic_inc(&root->fs_info->nr_async_bios); | ||
2744 | WARN_ON(bio->bi_next); | ||
2745 | bio->bi_next = NULL; | ||
2746 | bio->bi_rw |= rw; | ||
2747 | |||
2748 | spin_lock(&device->io_lock); | ||
2749 | |||
2750 | if (device->pending_bio_tail) | ||
2751 | device->pending_bio_tail->bi_next = bio; | ||
2752 | |||
2753 | device->pending_bio_tail = bio; | ||
2754 | if (!device->pending_bios) | ||
2755 | device->pending_bios = bio; | ||
2756 | if (device->running_pending) | ||
2757 | should_queue = 0; | ||
2758 | |||
2759 | spin_unlock(&device->io_lock); | ||
2760 | |||
2761 | if (should_queue) | ||
2762 | btrfs_queue_worker(&root->fs_info->submit_workers, | ||
2763 | &device->work); | ||
2764 | return 0; | ||
2765 | } | ||
2766 | |||
2767 | int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | ||
2768 | int mirror_num, int async_submit) | ||
2769 | { | ||
2770 | struct btrfs_mapping_tree *map_tree; | ||
2771 | struct btrfs_device *dev; | ||
2772 | struct bio *first_bio = bio; | ||
2773 | u64 logical = (u64)bio->bi_sector << 9; | ||
2774 | u64 length = 0; | ||
2775 | u64 map_length; | ||
2776 | struct btrfs_multi_bio *multi = NULL; | ||
2777 | int ret; | ||
2778 | int dev_nr = 0; | ||
2779 | int total_devs = 1; | ||
2780 | |||
2781 | length = bio->bi_size; | ||
2782 | map_tree = &root->fs_info->mapping_tree; | ||
2783 | map_length = length; | ||
2784 | |||
2785 | ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, | ||
2786 | mirror_num); | ||
2787 | BUG_ON(ret); | ||
2788 | |||
2789 | total_devs = multi->num_stripes; | ||
2790 | if (map_length < length) { | ||
2791 | printk(KERN_CRIT "mapping failed logical %llu bio len %llu " | ||
2792 | "len %llu\n", (unsigned long long)logical, | ||
2793 | (unsigned long long)length, | ||
2794 | (unsigned long long)map_length); | ||
2795 | BUG(); | ||
2796 | } | ||
2797 | multi->end_io = first_bio->bi_end_io; | ||
2798 | multi->private = first_bio->bi_private; | ||
2799 | multi->orig_bio = first_bio; | ||
2800 | atomic_set(&multi->stripes_pending, multi->num_stripes); | ||
2801 | |||
2802 | while (dev_nr < total_devs) { | ||
2803 | if (total_devs > 1) { | ||
2804 | if (dev_nr < total_devs - 1) { | ||
2805 | bio = bio_clone(first_bio, GFP_NOFS); | ||
2806 | BUG_ON(!bio); | ||
2807 | } else { | ||
2808 | bio = first_bio; | ||
2809 | } | ||
2810 | bio->bi_private = multi; | ||
2811 | bio->bi_end_io = end_bio_multi_stripe; | ||
2812 | } | ||
2813 | bio->bi_sector = multi->stripes[dev_nr].physical >> 9; | ||
2814 | dev = multi->stripes[dev_nr].dev; | ||
2815 | BUG_ON(rw == WRITE && !dev->writeable); | ||
2816 | if (dev && dev->bdev) { | ||
2817 | bio->bi_bdev = dev->bdev; | ||
2818 | if (async_submit) | ||
2819 | schedule_bio(root, dev, rw, bio); | ||
2820 | else | ||
2821 | submit_bio(rw, bio); | ||
2822 | } else { | ||
2823 | bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; | ||
2824 | bio->bi_sector = logical >> 9; | ||
2825 | bio_endio(bio, -EIO); | ||
2826 | } | ||
2827 | dev_nr++; | ||
2828 | } | ||
2829 | if (total_devs == 1) | ||
2830 | kfree(multi); | ||
2831 | return 0; | ||
2832 | } | ||
2833 | |||
2834 | struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, | ||
2835 | u8 *uuid, u8 *fsid) | ||
2836 | { | ||
2837 | struct btrfs_device *device; | ||
2838 | struct btrfs_fs_devices *cur_devices; | ||
2839 | |||
2840 | cur_devices = root->fs_info->fs_devices; | ||
2841 | while (cur_devices) { | ||
2842 | if (!fsid || | ||
2843 | !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { | ||
2844 | device = __find_device(&cur_devices->devices, | ||
2845 | devid, uuid); | ||
2846 | if (device) | ||
2847 | return device; | ||
2848 | } | ||
2849 | cur_devices = cur_devices->seed; | ||
2850 | } | ||
2851 | return NULL; | ||
2852 | } | ||
2853 | |||
2854 | static struct btrfs_device *add_missing_dev(struct btrfs_root *root, | ||
2855 | u64 devid, u8 *dev_uuid) | ||
2856 | { | ||
2857 | struct btrfs_device *device; | ||
2858 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; | ||
2859 | |||
2860 | device = kzalloc(sizeof(*device), GFP_NOFS); | ||
2861 | if (!device) | ||
2862 | return NULL; | ||
2863 | list_add(&device->dev_list, | ||
2864 | &fs_devices->devices); | ||
2865 | device->barriers = 1; | ||
2866 | device->dev_root = root->fs_info->dev_root; | ||
2867 | device->devid = devid; | ||
2868 | device->work.func = pending_bios_fn; | ||
2869 | device->fs_devices = fs_devices; | ||
2870 | fs_devices->num_devices++; | ||
2871 | spin_lock_init(&device->io_lock); | ||
2872 | INIT_LIST_HEAD(&device->dev_alloc_list); | ||
2873 | memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); | ||
2874 | return device; | ||
2875 | } | ||
2876 | |||
2877 | static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, | ||
2878 | struct extent_buffer *leaf, | ||
2879 | struct btrfs_chunk *chunk) | ||
2880 | { | ||
2881 | struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; | ||
2882 | struct map_lookup *map; | ||
2883 | struct extent_map *em; | ||
2884 | u64 logical; | ||
2885 | u64 length; | ||
2886 | u64 devid; | ||
2887 | u8 uuid[BTRFS_UUID_SIZE]; | ||
2888 | int num_stripes; | ||
2889 | int ret; | ||
2890 | int i; | ||
2891 | |||
2892 | logical = key->offset; | ||
2893 | length = btrfs_chunk_length(leaf, chunk); | ||
2894 | |||
2895 | spin_lock(&map_tree->map_tree.lock); | ||
2896 | em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); | ||
2897 | spin_unlock(&map_tree->map_tree.lock); | ||
2898 | |||
2899 | /* already mapped? */ | ||
2900 | if (em && em->start <= logical && em->start + em->len > logical) { | ||
2901 | free_extent_map(em); | ||
2902 | return 0; | ||
2903 | } else if (em) { | ||
2904 | free_extent_map(em); | ||
2905 | } | ||
2906 | |||
2911 | em = alloc_extent_map(GFP_NOFS); | ||
2912 | if (!em) | ||
2913 | return -ENOMEM; | ||
2914 | num_stripes = btrfs_chunk_num_stripes(leaf, chunk); | ||
2915 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | ||
2916 | if (!map) { | ||
2917 | free_extent_map(em); | ||
2918 | return -ENOMEM; | ||
2919 | } | ||
2920 | |||
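	/*
	 * chunk mappings are not backed by a real block device; em->bdev is
	 * reused to stash the map_lookup (btrfs_rmap_block() above casts it
	 * back out the same way)
	 */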
2921 | em->bdev = (struct block_device *)map; | ||
2922 | em->start = logical; | ||
2923 | em->len = length; | ||
2924 | em->block_start = 0; | ||
2925 | em->block_len = em->len; | ||
2926 | |||
2927 | map->num_stripes = num_stripes; | ||
2928 | map->io_width = btrfs_chunk_io_width(leaf, chunk); | ||
2929 | map->io_align = btrfs_chunk_io_align(leaf, chunk); | ||
2930 | map->sector_size = btrfs_chunk_sector_size(leaf, chunk); | ||
2931 | map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); | ||
2932 | map->type = btrfs_chunk_type(leaf, chunk); | ||
2933 | map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); | ||
2934 | for (i = 0; i < num_stripes; i++) { | ||
2935 | map->stripes[i].physical = | ||
2936 | btrfs_stripe_offset_nr(leaf, chunk, i); | ||
2937 | devid = btrfs_stripe_devid_nr(leaf, chunk, i); | ||
2938 | read_extent_buffer(leaf, uuid, (unsigned long) | ||
2939 | btrfs_stripe_dev_uuid_nr(chunk, i), | ||
2940 | BTRFS_UUID_SIZE); | ||
2941 | map->stripes[i].dev = btrfs_find_device(root, devid, uuid, | ||
2942 | NULL); | ||
2943 | if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { | ||
2944 | kfree(map); | ||
2945 | free_extent_map(em); | ||
2946 | return -EIO; | ||
2947 | } | ||
2948 | if (!map->stripes[i].dev) { | ||
2949 | map->stripes[i].dev = | ||
2950 | add_missing_dev(root, devid, uuid); | ||
2951 | if (!map->stripes[i].dev) { | ||
2952 | kfree(map); | ||
2953 | free_extent_map(em); | ||
2954 | return -EIO; | ||
2955 | } | ||
2956 | } | ||
2957 | map->stripes[i].dev->in_fs_metadata = 1; | ||
2958 | } | ||
2959 | |||
2960 | spin_lock(&map_tree->map_tree.lock); | ||
2961 | ret = add_extent_mapping(&map_tree->map_tree, em); | ||
2962 | spin_unlock(&map_tree->map_tree.lock); | ||
2963 | BUG_ON(ret); | ||
2964 | free_extent_map(em); | ||
2965 | |||
2966 | return 0; | ||
2967 | } | ||
2968 | |||
2969 | static int fill_device_from_item(struct extent_buffer *leaf, | ||
2970 | struct btrfs_dev_item *dev_item, | ||
2971 | struct btrfs_device *device) | ||
2972 | { | ||
2973 | unsigned long ptr; | ||
2974 | |||
2975 | device->devid = btrfs_device_id(leaf, dev_item); | ||
2976 | device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); | ||
2977 | device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); | ||
2978 | device->type = btrfs_device_type(leaf, dev_item); | ||
2979 | device->io_align = btrfs_device_io_align(leaf, dev_item); | ||
2980 | device->io_width = btrfs_device_io_width(leaf, dev_item); | ||
2981 | device->sector_size = btrfs_device_sector_size(leaf, dev_item); | ||
2982 | |||
2983 | ptr = (unsigned long)btrfs_device_uuid(dev_item); | ||
2984 | read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); | ||
2985 | |||
2986 | return 0; | ||
2987 | } | ||
2988 | |||
2989 | static int open_seed_devices(struct btrfs_root *root, u8 *fsid) | ||
2990 | { | ||
2991 | struct btrfs_fs_devices *fs_devices; | ||
2992 | int ret; | ||
2993 | |||
2994 | mutex_lock(&uuid_mutex); | ||
2995 | |||
2996 | fs_devices = root->fs_info->fs_devices->seed; | ||
2997 | while (fs_devices) { | ||
2998 | if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { | ||
2999 | ret = 0; | ||
3000 | goto out; | ||
3001 | } | ||
3002 | fs_devices = fs_devices->seed; | ||
3003 | } | ||
3004 | |||
3005 | fs_devices = find_fsid(fsid); | ||
3006 | if (!fs_devices) { | ||
3007 | ret = -ENOENT; | ||
3008 | goto out; | ||
3009 | } | ||
3010 | |||
3011 | fs_devices = clone_fs_devices(fs_devices); | ||
3012 | if (IS_ERR(fs_devices)) { | ||
3013 | ret = PTR_ERR(fs_devices); | ||
3014 | goto out; | ||
3015 | } | ||
3016 | |||
3017 | ret = __btrfs_open_devices(fs_devices, FMODE_READ, | ||
3018 | root->fs_info->bdev_holder); | ||
3019 | if (ret) | ||
3020 | goto out; | ||
3021 | |||
3022 | if (!fs_devices->seeding) { | ||
3023 | __btrfs_close_devices(fs_devices); | ||
3024 | free_fs_devices(fs_devices); | ||
3025 | ret = -EINVAL; | ||
3026 | goto out; | ||
3027 | } | ||
3028 | |||
3029 | fs_devices->seed = root->fs_info->fs_devices->seed; | ||
3030 | root->fs_info->fs_devices->seed = fs_devices; | ||
3031 | out: | ||
3032 | mutex_unlock(&uuid_mutex); | ||
3033 | return ret; | ||
3034 | } | ||
3035 | |||
3036 | static int read_one_dev(struct btrfs_root *root, | ||
3037 | struct extent_buffer *leaf, | ||
3038 | struct btrfs_dev_item *dev_item) | ||
3039 | { | ||
3040 | struct btrfs_device *device; | ||
3041 | u64 devid; | ||
3042 | int ret; | ||
3043 | u8 fs_uuid[BTRFS_UUID_SIZE]; | ||
3044 | u8 dev_uuid[BTRFS_UUID_SIZE]; | ||
3045 | |||
3046 | devid = btrfs_device_id(leaf, dev_item); | ||
3047 | read_extent_buffer(leaf, dev_uuid, | ||
3048 | (unsigned long)btrfs_device_uuid(dev_item), | ||
3049 | BTRFS_UUID_SIZE); | ||
3050 | read_extent_buffer(leaf, fs_uuid, | ||
3051 | (unsigned long)btrfs_device_fsid(dev_item), | ||
3052 | BTRFS_UUID_SIZE); | ||
3053 | |||
3054 | if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { | ||
3055 | ret = open_seed_devices(root, fs_uuid); | ||
3056 | if (ret && !btrfs_test_opt(root, DEGRADED)) | ||
3057 | return ret; | ||
3058 | } | ||
3059 | |||
3060 | device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); | ||
3061 | if (!device || !device->bdev) { | ||
3062 | if (!btrfs_test_opt(root, DEGRADED)) | ||
3063 | return -EIO; | ||
3064 | |||
3065 | if (!device) { | ||
3066 | printk(KERN_WARNING "btrfs: devid %llu missing\n", | ||
3067 | (unsigned long long)devid); | ||
3068 | device = add_missing_dev(root, devid, dev_uuid); | ||
3069 | if (!device) | ||
3070 | return -ENOMEM; | ||
3071 | } | ||
3072 | } | ||
3073 | |||
3074 | if (device->fs_devices != root->fs_info->fs_devices) { | ||
3075 | BUG_ON(device->writeable); | ||
3076 | if (device->generation != | ||
3077 | btrfs_device_generation(leaf, dev_item)) | ||
3078 | return -EINVAL; | ||
3079 | } | ||
3080 | |||
3081 | fill_device_from_item(leaf, dev_item, device); | ||
3082 | device->dev_root = root->fs_info->dev_root; | ||
3083 | device->in_fs_metadata = 1; | ||
3084 | if (device->writeable) | ||
3085 | device->fs_devices->total_rw_bytes += device->total_bytes; | ||
3086 | ret = 0; | ||
3087 | return ret; | ||
3088 | } | ||
3089 | |||
3090 | int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) | ||
3091 | { | ||
3092 | struct btrfs_dev_item *dev_item; | ||
3093 | |||
3094 | dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, | ||
3095 | dev_item); | ||
3096 | return read_one_dev(root, buf, dev_item); | ||
3097 | } | ||
3098 | |||
3099 | int btrfs_read_sys_array(struct btrfs_root *root) | ||
3100 | { | ||
3101 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | ||
3102 | struct extent_buffer *sb; | ||
3103 | struct btrfs_disk_key *disk_key; | ||
3104 | struct btrfs_chunk *chunk; | ||
3105 | u8 *ptr; | ||
3106 | unsigned long sb_ptr; | ||
3107 | int ret = 0; | ||
3108 | u32 num_stripes; | ||
3109 | u32 array_size; | ||
3110 | u32 len = 0; | ||
3111 | u32 cur; | ||
3112 | struct btrfs_key key; | ||
3113 | |||
3114 | sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, | ||
3115 | BTRFS_SUPER_INFO_SIZE); | ||
3116 | if (!sb) | ||
3117 | return -ENOMEM; | ||
3118 | btrfs_set_buffer_uptodate(sb); | ||
3119 | write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); | ||
3120 | array_size = btrfs_super_sys_array_size(super_copy); | ||
3121 | |||
3122 | ptr = super_copy->sys_chunk_array; | ||
3123 | sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); | ||
3124 | cur = 0; | ||
3125 | |||
3126 | while (cur < array_size) { | ||
3127 | disk_key = (struct btrfs_disk_key *)ptr; | ||
3128 | btrfs_disk_key_to_cpu(&key, disk_key); | ||
3129 | |||
3130 | len = sizeof(*disk_key); ptr += len; | ||
3131 | sb_ptr += len; | ||
3132 | cur += len; | ||
3133 | |||
3134 | if (key.type == BTRFS_CHUNK_ITEM_KEY) { | ||
3135 | chunk = (struct btrfs_chunk *)sb_ptr; | ||
3136 | ret = read_one_chunk(root, &key, sb, chunk); | ||
3137 | if (ret) | ||
3138 | break; | ||
3139 | num_stripes = btrfs_chunk_num_stripes(sb, chunk); | ||
3140 | len = btrfs_chunk_item_size(num_stripes); | ||
3141 | } else { | ||
3142 | ret = -EIO; | ||
3143 | break; | ||
3144 | } | ||
3145 | ptr += len; | ||
3146 | sb_ptr += len; | ||
3147 | cur += len; | ||
3148 | } | ||
3149 | free_extent_buffer(sb); | ||
3150 | return ret; | ||
3151 | } | ||
3152 | |||
3153 | int btrfs_read_chunk_tree(struct btrfs_root *root) | ||
3154 | { | ||
3155 | struct btrfs_path *path; | ||
3156 | struct extent_buffer *leaf; | ||
3157 | struct btrfs_key key; | ||
3158 | struct btrfs_key found_key; | ||
3159 | int ret; | ||
3160 | int slot; | ||
3161 | |||
3162 | root = root->fs_info->chunk_root; | ||
3163 | |||
3164 | path = btrfs_alloc_path(); | ||
3165 | if (!path) | ||
3166 | return -ENOMEM; | ||
3167 | |||
3168 | /* first we search for all of the device items, and then we | ||
3169 | * read in all of the chunk items. This way we can create chunk | ||
3170 | * mappings that reference all of the devices that are found | ||
3171 | */ | ||
3172 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; | ||
3173 | key.offset = 0; | ||
3174 | key.type = 0; | ||
3175 | again: | ||
3176 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
3177 | while (1) { | ||
3178 | leaf = path->nodes[0]; | ||
3179 | slot = path->slots[0]; | ||
3180 | if (slot >= btrfs_header_nritems(leaf)) { | ||
3181 | ret = btrfs_next_leaf(root, path); | ||
3182 | if (ret == 0) | ||
3183 | continue; | ||
3184 | if (ret < 0) | ||
3185 | goto error; | ||
3186 | break; | ||
3187 | } | ||
3188 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
3189 | if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { | ||
3190 | if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) | ||
3191 | break; | ||
3192 | if (found_key.type == BTRFS_DEV_ITEM_KEY) { | ||
3193 | struct btrfs_dev_item *dev_item; | ||
3194 | dev_item = btrfs_item_ptr(leaf, slot, | ||
3195 | struct btrfs_dev_item); | ||
3196 | ret = read_one_dev(root, leaf, dev_item); | ||
3197 | if (ret) | ||
3198 | goto error; | ||
3199 | } | ||
3200 | } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { | ||
3201 | struct btrfs_chunk *chunk; | ||
3202 | chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); | ||
3203 | ret = read_one_chunk(root, &found_key, leaf, chunk); | ||
3204 | if (ret) | ||
3205 | goto error; | ||
3206 | } | ||
3207 | path->slots[0]++; | ||
3208 | } | ||
3209 | if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { | ||
3210 | key.objectid = 0; | ||
3211 | btrfs_release_path(root, path); | ||
3212 | goto again; | ||
3213 | } | ||
3214 | ret = 0; | ||
3215 | error: | ||
3216 | btrfs_free_path(path); | ||
3217 | return ret; | ||
3218 | } | ||
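Taken together, the two readers above run back to back at mount time: the sys array holds just enough chunks to map the chunk tree itself, after which the full tree can be walked. The real sequencing lives in disk-io.c, outside this file; a hypothetical sketch of that ordering:

	static int example_load_chunk_mappings(struct btrfs_root *root)
	{
		int ret;

		/* the sys array maps the chunks that hold the chunk tree */
		ret = btrfs_read_sys_array(root);
		if (ret)
			return ret;

		/* now every device item and chunk item can be read */
		return btrfs_read_chunk_tree(root);
	}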
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 000000000000..86c44e9ae110 --- /dev/null +++ b/fs/btrfs/volumes.h | |||
@@ -0,0 +1,162 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_VOLUMES_ | ||
20 | #define __BTRFS_VOLUMES_ | ||
21 | |||
22 | #include <linux/bio.h> | ||
23 | #include "async-thread.h" | ||
24 | |||
25 | struct buffer_head; | ||
26 | struct btrfs_device { | ||
27 | struct list_head dev_list; | ||
28 | struct list_head dev_alloc_list; | ||
29 | struct btrfs_fs_devices *fs_devices; | ||
30 | struct btrfs_root *dev_root; | ||
31 | struct bio *pending_bios; | ||
32 | struct bio *pending_bio_tail; | ||
33 | int running_pending; | ||
34 | u64 generation; | ||
35 | |||
36 | int barriers; | ||
37 | int writeable; | ||
38 | int in_fs_metadata; | ||
39 | |||
40 | spinlock_t io_lock; | ||
41 | |||
42 | struct block_device *bdev; | ||
43 | |||
44 | /* the mode sent to open_bdev_exclusive */ | ||
45 | fmode_t mode; | ||
46 | |||
47 | char *name; | ||
48 | |||
49 | /* the internal btrfs device id */ | ||
50 | u64 devid; | ||
51 | |||
52 | /* size of the device */ | ||
53 | u64 total_bytes; | ||
54 | |||
55 | /* bytes used */ | ||
56 | u64 bytes_used; | ||
57 | |||
58 | /* optimal io alignment for this device */ | ||
59 | u32 io_align; | ||
60 | |||
61 | /* optimal io width for this device */ | ||
62 | u32 io_width; | ||
63 | |||
64 | /* minimal io size for this device */ | ||
65 | u32 sector_size; | ||
66 | |||
67 | /* type and info about this device */ | ||
68 | u64 type; | ||
69 | |||
70 | /* physical drive uuid (or lvm uuid) */ | ||
71 | u8 uuid[BTRFS_UUID_SIZE]; | ||
72 | |||
73 | struct btrfs_work work; | ||
74 | }; | ||
75 | |||
76 | struct btrfs_fs_devices { | ||
77 | u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ | ||
78 | |||
79 | /* the device with this id has the most recent copy of the super */ | ||
80 | u64 latest_devid; | ||
81 | u64 latest_trans; | ||
82 | u64 num_devices; | ||
83 | u64 open_devices; | ||
84 | u64 rw_devices; | ||
85 | u64 total_rw_bytes; | ||
86 | struct block_device *latest_bdev; | ||
87 | /* all of the devices in the FS */ | ||
88 | struct list_head devices; | ||
89 | |||
90 | /* devices not currently being allocated */ | ||
91 | struct list_head alloc_list; | ||
92 | struct list_head list; | ||
93 | |||
94 | struct btrfs_fs_devices *seed; | ||
95 | int seeding; | ||
96 | |||
97 | int opened; | ||
98 | }; | ||
99 | |||
100 | struct btrfs_bio_stripe { | ||
101 | struct btrfs_device *dev; | ||
102 | u64 physical; | ||
103 | }; | ||
104 | |||
105 | struct btrfs_multi_bio { | ||
106 | atomic_t stripes_pending; | ||
107 | bio_end_io_t *end_io; | ||
108 | struct bio *orig_bio; | ||
109 | void *private; | ||
110 | atomic_t error; | ||
111 | int max_errors; | ||
112 | int num_stripes; | ||
113 | struct btrfs_bio_stripe stripes[]; | ||
114 | }; | ||
115 | |||
116 | #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ | ||
117 | (sizeof(struct btrfs_bio_stripe) * (n))) | ||
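The stripes[] flexible array means a multi-bio for n stripes is a single allocation, header plus trailing stripe records. An illustrative sketch of the pattern (the function name is hypothetical; volumes.c sizes its multi-bios the same way):

	static struct btrfs_multi_bio *example_alloc_multi(int num_stripes)
	{
		struct btrfs_multi_bio *multi;

		/* one allocation: header plus num_stripes stripe records */
		multi = kzalloc(btrfs_multi_bio_size(num_stripes), GFP_NOFS);
		if (!multi)
			return NULL;
		multi->num_stripes = num_stripes;
		atomic_set(&multi->stripes_pending, num_stripes);
		return multi;
	}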
118 | |||
119 | int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, | ||
120 | struct btrfs_device *device, | ||
121 | u64 chunk_tree, u64 chunk_objectid, | ||
122 | u64 chunk_offset, u64 start, u64 num_bytes); | ||
123 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | ||
124 | u64 logical, u64 *length, | ||
125 | struct btrfs_multi_bio **multi_ret, int mirror_num); | ||
126 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | ||
127 | u64 chunk_start, u64 physical, u64 devid, | ||
128 | u64 **logical, int *naddrs, int *stripe_len); | ||
129 | int btrfs_read_sys_array(struct btrfs_root *root); | ||
130 | int btrfs_read_chunk_tree(struct btrfs_root *root); | ||
131 | int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | ||
132 | struct btrfs_root *extent_root, u64 type); | ||
133 | void btrfs_mapping_init(struct btrfs_mapping_tree *tree); | ||
134 | void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); | ||
135 | int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | ||
136 | int mirror_num, int async_submit); | ||
137 | int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); | ||
138 | int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | ||
139 | fmode_t flags, void *holder); | ||
140 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | ||
141 | struct btrfs_fs_devices **fs_devices_ret); | ||
142 | int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); | ||
143 | int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); | ||
144 | int btrfs_add_device(struct btrfs_trans_handle *trans, | ||
145 | struct btrfs_root *root, | ||
146 | struct btrfs_device *device); | ||
147 | int btrfs_rm_device(struct btrfs_root *root, char *device_path); | ||
148 | int btrfs_cleanup_fs_uuids(void); | ||
149 | int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); | ||
150 | int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, | ||
151 | u64 logical, struct page *page); | ||
152 | int btrfs_grow_device(struct btrfs_trans_handle *trans, | ||
153 | struct btrfs_device *device, u64 new_size); | ||
154 | struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, | ||
155 | u8 *uuid, u8 *fsid); | ||
156 | int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); | ||
157 | int btrfs_init_new_device(struct btrfs_root *root, char *path); | ||
158 | int btrfs_balance(struct btrfs_root *dev_root); | ||
159 | void btrfs_unlock_volumes(void); | ||
160 | void btrfs_lock_volumes(void); | ||
161 | int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); | ||
162 | #endif | ||
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 000000000000..7f332e270894 --- /dev/null +++ b/fs/btrfs/xattr.c | |||
@@ -0,0 +1,322 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/init.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/rwsem.h> | ||
23 | #include <linux/xattr.h> | ||
24 | #include "ctree.h" | ||
25 | #include "btrfs_inode.h" | ||
26 | #include "transaction.h" | ||
27 | #include "xattr.h" | ||
28 | #include "disk-io.h" | ||
29 | |||
30 | |||
31 | ssize_t __btrfs_getxattr(struct inode *inode, const char *name, | ||
32 | void *buffer, size_t size) | ||
33 | { | ||
34 | struct btrfs_dir_item *di; | ||
35 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
36 | struct btrfs_path *path; | ||
37 | struct extent_buffer *leaf; | ||
38 | int ret = 0; | ||
39 | unsigned long data_ptr; | ||
40 | |||
41 | path = btrfs_alloc_path(); | ||
42 | if (!path) | ||
43 | return -ENOMEM; | ||
44 | |||
45 | /* lookup the xattr by name */ | ||
46 | di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, | ||
47 | strlen(name), 0); | ||
48 | if (!di || IS_ERR(di)) { | ||
49 | ret = -ENODATA; | ||
50 | goto out; | ||
51 | } | ||
52 | |||
53 | leaf = path->nodes[0]; | ||
54 | /* if size is 0, that means we want the size of the attr */ | ||
55 | if (!size) { | ||
56 | ret = btrfs_dir_data_len(leaf, di); | ||
57 | goto out; | ||
58 | } | ||
59 | |||
60 | /* now get the data out of our dir_item */ | ||
61 | if (btrfs_dir_data_len(leaf, di) > size) { | ||
62 | ret = -ERANGE; | ||
63 | goto out; | ||
64 | } | ||
65 | data_ptr = (unsigned long)((char *)(di + 1) + | ||
66 | btrfs_dir_name_len(leaf, di)); | ||
67 | read_extent_buffer(leaf, buffer, data_ptr, | ||
68 | btrfs_dir_data_len(leaf, di)); | ||
69 | ret = btrfs_dir_data_len(leaf, di); | ||
70 | |||
71 | out: | ||
72 | btrfs_free_path(path); | ||
73 | return ret; | ||
74 | } | ||
75 | |||
76 | int __btrfs_setxattr(struct inode *inode, const char *name, | ||
77 | const void *value, size_t size, int flags) | ||
78 | { | ||
79 | struct btrfs_dir_item *di; | ||
80 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
81 | struct btrfs_trans_handle *trans; | ||
82 | struct btrfs_path *path; | ||
83 | int ret = 0, mod = 0; | ||
84 | |||
85 | path = btrfs_alloc_path(); | ||
86 | if (!path) | ||
87 | return -ENOMEM; | ||
88 | |||
89 | trans = btrfs_start_transaction(root, 1); | ||
90 | btrfs_set_trans_block_group(trans, inode); | ||
91 | |||
92 | /* first let's see if we already have this xattr */ | ||
93 | di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, | ||
94 | strlen(name), -1); | ||
95 | if (IS_ERR(di)) { | ||
96 | ret = PTR_ERR(di); | ||
97 | goto out; | ||
98 | } | ||
99 | |||
100 | /* ok we already have this xattr, let's remove it */ | ||
101 | if (di) { | ||
102 | /* if the caller asked for create-only, bail out */ | ||
103 | if (flags & XATTR_CREATE) { | ||
104 | ret = -EEXIST; | ||
105 | goto out; | ||
106 | } | ||
107 | |||
108 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | ||
109 | if (ret) | ||
110 | goto out; | ||
111 | btrfs_release_path(root, path); | ||
112 | |||
113 | /* if we don't have a value then we are removing the xattr */ | ||
114 | if (!value) { | ||
115 | mod = 1; | ||
116 | goto out; | ||
117 | } | ||
118 | } else { | ||
119 | btrfs_release_path(root, path); | ||
120 | |||
121 | if (flags & XATTR_REPLACE) { | ||
122 | /* we couldn't find the attr to replace */ | ||
123 | ret = -ENODATA; | ||
124 | goto out; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | /* ok we have to create a completely new xattr */ | ||
129 | ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), | ||
130 | value, size, inode->i_ino); | ||
131 | if (ret) | ||
132 | goto out; | ||
133 | mod = 1; | ||
134 | |||
135 | out: | ||
136 | if (mod) { | ||
137 | inode->i_ctime = CURRENT_TIME; | ||
138 | ret = btrfs_update_inode(trans, root, inode); | ||
139 | } | ||
140 | |||
141 | btrfs_end_transaction(trans, root); | ||
142 | btrfs_free_path(path); | ||
143 | return ret; | ||
144 | } | ||
145 | |||
146 | ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) | ||
147 | { | ||
148 | struct btrfs_key key, found_key; | ||
149 | struct inode *inode = dentry->d_inode; | ||
150 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
151 | struct btrfs_path *path; | ||
153 | struct extent_buffer *leaf; | ||
154 | struct btrfs_dir_item *di; | ||
155 | int ret = 0, slot, advance; | ||
156 | size_t total_size = 0, size_left = size; | ||
157 | unsigned long name_ptr; | ||
158 | size_t name_len; | ||
159 | u32 nritems; | ||
160 | |||
161 | /* | ||
162 | * ok we want all objects associated with this id. | ||
163 | * NOTE: we set key.offset = 0; because we want to start with the | ||
164 | * first xattr that we find and walk forward | ||
165 | */ | ||
166 | key.objectid = inode->i_ino; | ||
167 | btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); | ||
168 | key.offset = 0; | ||
169 | |||
170 | path = btrfs_alloc_path(); | ||
171 | if (!path) | ||
172 | return -ENOMEM; | ||
173 | path->reada = 2; | ||
174 | |||
175 | /* search for our xattrs */ | ||
176 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
177 | if (ret < 0) | ||
178 | goto err; | ||
179 | ret = 0; | ||
180 | advance = 0; | ||
181 | while (1) { | ||
182 | leaf = path->nodes[0]; | ||
183 | nritems = btrfs_header_nritems(leaf); | ||
184 | slot = path->slots[0]; | ||
185 | |||
186 | /* this is where we start walking through the path */ | ||
187 | if (advance || slot >= nritems) { | ||
188 | /* | ||
189 | * if we've reached the last slot in this leaf we need | ||
190 | * to go to the next leaf and reset everything | ||
191 | */ | ||
192 | if (slot >= nritems-1) { | ||
193 | ret = btrfs_next_leaf(root, path); | ||
194 | if (ret) | ||
195 | break; | ||
196 | leaf = path->nodes[0]; | ||
197 | nritems = btrfs_header_nritems(leaf); | ||
198 | slot = path->slots[0]; | ||
199 | } else { | ||
200 | /* | ||
201 | * just walking through the slots on this leaf | ||
202 | */ | ||
203 | slot++; | ||
204 | path->slots[0]++; | ||
205 | } | ||
206 | } | ||
207 | advance = 1; | ||
208 | |||
210 | btrfs_item_key_to_cpu(leaf, &found_key, slot); | ||
211 | |||
212 | /* check to make sure this item is what we want */ | ||
213 | if (found_key.objectid != key.objectid) | ||
214 | break; | ||
215 | if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) | ||
216 | break; | ||
217 | |||
218 | di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); | ||
219 | |||
220 | name_len = btrfs_dir_name_len(leaf, di); | ||
221 | total_size += name_len + 1; | ||
222 | |||
223 | /* we are just looking for how big our buffer needs to be */ | ||
224 | if (!size) | ||
225 | continue; | ||
226 | |||
227 | if (!buffer || (name_len + 1) > size_left) { | ||
228 | ret = -ERANGE; | ||
229 | goto err; | ||
230 | } | ||
231 | |||
232 | name_ptr = (unsigned long)(di + 1); | ||
233 | read_extent_buffer(leaf, buffer, name_ptr, name_len); | ||
234 | buffer[name_len] = '\0'; | ||
235 | |||
236 | size_left -= name_len + 1; | ||
237 | buffer += name_len + 1; | ||
238 | } | ||
239 | ret = total_size; | ||
240 | |||
241 | err: | ||
242 | btrfs_free_path(path); | ||
243 | |||
244 | return ret; | ||
245 | } | ||
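btrfs_listxattr() follows the standard listxattr convention: a zero size asks for the required buffer size, and on success the buffer holds attribute names back to back, each '\0'-terminated. A hypothetical two-pass caller sketch:

	static ssize_t example_listxattr(struct dentry *dentry)
	{
		ssize_t size, ret;
		char *buf;

		/* pass 1: size == 0 returns how big the buffer must be */
		size = btrfs_listxattr(dentry, NULL, 0);
		if (size <= 0)
			return size;

		buf = kmalloc(size, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		/* pass 2: fill buf with "user.foo\0user.bar\0..." */
		ret = btrfs_listxattr(dentry, buf, size);
		kfree(buf);
		return ret;
	}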
246 | |||
247 | /* | ||
248 | * List of handlers for synthetic system.* attributes. All real ondisk | ||
249 | * attributes are handled directly. | ||
250 | */ | ||
251 | struct xattr_handler *btrfs_xattr_handlers[] = { | ||
252 | #ifdef CONFIG_FS_POSIX_ACL | ||
253 | &btrfs_xattr_acl_access_handler, | ||
254 | &btrfs_xattr_acl_default_handler, | ||
255 | #endif | ||
256 | NULL, | ||
257 | }; | ||
258 | |||
259 | /* | ||
260 | * Check if the attribute is in a supported namespace. | ||
261 | * | ||
262 | * This is applied after the check for the synthetic attributes in the system | ||
263 | * namespace. | ||
264 | */ | ||
265 | static bool btrfs_is_valid_xattr(const char *name) | ||
266 | { | ||
267 | return !strncmp(name, XATTR_SECURITY_PREFIX, | ||
268 | XATTR_SECURITY_PREFIX_LEN) || | ||
269 | !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || | ||
270 | !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || | ||
271 | !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); | ||
272 | } | ||
273 | |||
274 | ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, | ||
275 | void *buffer, size_t size) | ||
276 | { | ||
277 | /* | ||
278 | * If this is a request for a synthetic attribute in the system.* | ||
279 | * namespace use the generic infrastructure to resolve a handler | ||
280 | * for it via sb->s_xattr. | ||
281 | */ | ||
282 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
283 | return generic_getxattr(dentry, name, buffer, size); | ||
284 | |||
285 | if (!btrfs_is_valid_xattr(name)) | ||
286 | return -EOPNOTSUPP; | ||
287 | return __btrfs_getxattr(dentry->d_inode, name, buffer, size); | ||
288 | } | ||
289 | |||
290 | int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, | ||
291 | size_t size, int flags) | ||
292 | { | ||
293 | /* | ||
294 | * If this is a request for a synthetic attribute in the system.* | ||
295 | * namespace use the generic infrastructure to resolve a handler | ||
296 | * for it via sb->s_xattr. | ||
297 | */ | ||
298 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
299 | return generic_setxattr(dentry, name, value, size, flags); | ||
300 | |||
301 | if (!btrfs_is_valid_xattr(name)) | ||
302 | return -EOPNOTSUPP; | ||
303 | |||
304 | if (size == 0) | ||
305 | value = ""; /* empty EA, do not remove */ | ||
306 | return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); | ||
307 | } | ||
308 | |||
309 | int btrfs_removexattr(struct dentry *dentry, const char *name) | ||
310 | { | ||
311 | /* | ||
312 | * If this is a request for a synthetic attribute in the system.* | ||
313 | * namespace use the generic infrastructure to resolve a handler | ||
314 | * for it via sb->s_xattr. | ||
315 | */ | ||
316 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
317 | return generic_removexattr(dentry, name); | ||
318 | |||
319 | if (!btrfs_is_valid_xattr(name)) | ||
320 | return -EOPNOTSUPP; | ||
321 | return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); | ||
322 | } | ||
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 000000000000..5b1d08f8e68d --- /dev/null +++ b/fs/btrfs/xattr.h | |||
@@ -0,0 +1,39 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __XATTR__ | ||
20 | #define __XATTR__ | ||
21 | |||
22 | #include <linux/xattr.h> | ||
23 | |||
24 | extern struct xattr_handler btrfs_xattr_acl_access_handler; | ||
25 | extern struct xattr_handler btrfs_xattr_acl_default_handler; | ||
26 | extern struct xattr_handler *btrfs_xattr_handlers[]; | ||
27 | |||
28 | extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, | ||
29 | void *buffer, size_t size); | ||
30 | extern int __btrfs_setxattr(struct inode *inode, const char *name, | ||
31 | const void *value, size_t size, int flags); | ||
32 | |||
33 | extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, | ||
34 | void *buffer, size_t size); | ||
35 | extern int btrfs_setxattr(struct dentry *dentry, const char *name, | ||
36 | const void *value, size_t size, int flags); | ||
37 | extern int btrfs_removexattr(struct dentry *dentry, const char *name); | ||
38 | |||
39 | #endif /* __XATTR__ */ | ||
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 000000000000..ecfbce836d32 --- /dev/null +++ b/fs/btrfs/zlib.c | |||
@@ -0,0 +1,632 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | * | ||
18 | * Based on jffs2 zlib code: | ||
19 | * Copyright © 2001-2007 Red Hat, Inc. | ||
20 | * Created by David Woodhouse <dwmw2@infradead.org> | ||
21 | */ | ||
22 | |||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/zlib.h> | ||
26 | #include <linux/zutil.h> | ||
27 | #include <linux/vmalloc.h> | ||
28 | #include <linux/init.h> | ||
29 | #include <linux/err.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/pagemap.h> | ||
32 | #include <linux/bio.h> | ||
33 | #include "compression.h" | ||
34 | |||
35 | /* Plan: call deflate() with avail_in == *sourcelen, | ||
36 | avail_out = *dstlen - 12 and flush == Z_FINISH. | ||
37 | If it doesn't manage to finish, call it again with | ||
38 | avail_in == 0 and avail_out set to the remaining 12 | ||
39 | bytes for it to clean up. | ||
40 | Q: Is 12 bytes sufficient? | ||
41 | */ | ||
42 | #define STREAM_END_SPACE 12 | ||
43 | |||
44 | struct workspace { | ||
45 | z_stream inf_strm; | ||
46 | z_stream def_strm; | ||
47 | char *buf; | ||
48 | struct list_head list; | ||
49 | }; | ||
50 | |||
51 | static LIST_HEAD(idle_workspace); | ||
52 | static DEFINE_SPINLOCK(workspace_lock); | ||
53 | static unsigned long num_workspace; | ||
54 | static atomic_t alloc_workspace = ATOMIC_INIT(0); | ||
55 | static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); | ||
56 | |||
57 | /* | ||
58 | * this finds an available zlib workspace or allocates a new one | ||
59 | * an ERR_PTR is returned if things go bad. | ||
60 | */ | ||
61 | static struct workspace *find_zlib_workspace(void) | ||
62 | { | ||
63 | struct workspace *workspace; | ||
64 | int ret; | ||
65 | int cpus = num_online_cpus(); | ||
66 | |||
67 | again: | ||
68 | spin_lock(&workspace_lock); | ||
69 | if (!list_empty(&idle_workspace)) { | ||
70 | workspace = list_entry(idle_workspace.next, struct workspace, | ||
71 | list); | ||
72 | list_del(&workspace->list); | ||
73 | num_workspace--; | ||
74 | spin_unlock(&workspace_lock); | ||
75 | return workspace; | ||
76 | |||
77 | } | ||
78 | spin_unlock(&workspace_lock); | ||
79 | if (atomic_read(&alloc_workspace) > cpus) { | ||
80 | DEFINE_WAIT(wait); | ||
81 | prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
82 | if (atomic_read(&alloc_workspace) > cpus) | ||
83 | schedule(); | ||
84 | finish_wait(&workspace_wait, &wait); | ||
85 | goto again; | ||
86 | } | ||
87 | atomic_inc(&alloc_workspace); | ||
88 | workspace = kzalloc(sizeof(*workspace), GFP_NOFS); | ||
89 | if (!workspace) { | ||
90 | ret = -ENOMEM; | ||
91 | goto fail; | ||
92 | } | ||
93 | |||
94 | workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); | ||
95 | if (!workspace->def_strm.workspace) { | ||
96 | ret = -ENOMEM; | ||
97 | goto fail; | ||
98 | } | ||
99 | workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); | ||
100 | if (!workspace->inf_strm.workspace) { | ||
101 | ret = -ENOMEM; | ||
102 | goto fail_inflate; | ||
103 | } | ||
104 | workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); | ||
105 | if (!workspace->buf) { | ||
106 | ret = -ENOMEM; | ||
107 | goto fail_kmalloc; | ||
108 | } | ||
109 | return workspace; | ||
110 | |||
111 | fail_kmalloc: | ||
112 | vfree(workspace->inf_strm.workspace); | ||
113 | fail_inflate: | ||
114 | vfree(workspace->def_strm.workspace); | ||
115 | fail: | ||
116 | kfree(workspace); | ||
117 | atomic_dec(&alloc_workspace); | ||
118 | wake_up(&workspace_wait); | ||
119 | return ERR_PTR(ret); | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | * put a workspace struct back on the list or free it if we have enough | ||
124 | * idle ones sitting around | ||
125 | */ | ||
126 | static int free_workspace(struct workspace *workspace) | ||
127 | { | ||
128 | spin_lock(&workspace_lock); | ||
129 | if (num_workspace < num_online_cpus()) { | ||
130 | list_add_tail(&workspace->list, &idle_workspace); | ||
131 | num_workspace++; | ||
132 | spin_unlock(&workspace_lock); | ||
133 | if (waitqueue_active(&workspace_wait)) | ||
134 | wake_up(&workspace_wait); | ||
135 | return 0; | ||
136 | } | ||
137 | spin_unlock(&workspace_lock); | ||
138 | vfree(workspace->def_strm.workspace); | ||
139 | vfree(workspace->inf_strm.workspace); | ||
140 | kfree(workspace->buf); | ||
141 | kfree(workspace); | ||
142 | |||
143 | atomic_dec(&alloc_workspace); | ||
144 | if (waitqueue_active(&workspace_wait)) | ||
145 | wake_up(&workspace_wait); | ||
146 | return 0; | ||
147 | } | ||
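Every compression and decompression entry point below brackets its work with this pair, so at most about num_online_cpus() workspaces are ever allocated and idle ones are reused rather than freed. A minimal usage sketch:

	static int example_with_workspace(void)
	{
		struct workspace *workspace;

		workspace = find_zlib_workspace();
		if (IS_ERR(workspace))
			return PTR_ERR(workspace);

		/* ... run zlib against workspace->def_strm, inf_strm, buf ... */

		free_workspace(workspace);
		return 0;
	}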
148 | |||
149 | /* | ||
150 | * cleanup function for module exit | ||
151 | */ | ||
152 | static void free_workspaces(void) | ||
153 | { | ||
154 | struct workspace *workspace; | ||
155 | while (!list_empty(&idle_workspace)) { | ||
156 | workspace = list_entry(idle_workspace.next, struct workspace, | ||
157 | list); | ||
158 | list_del(&workspace->list); | ||
159 | vfree(workspace->def_strm.workspace); | ||
160 | vfree(workspace->inf_strm.workspace); | ||
161 | kfree(workspace->buf); | ||
162 | kfree(workspace); | ||
163 | atomic_dec(&alloc_workspace); | ||
164 | } | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * given an address space and start/len, compress the bytes. | ||
169 | * | ||
170 | * pages are allocated to hold the compressed result and stored | ||
171 | * in 'pages' | ||
172 | * | ||
173 | * out_pages is used to return the number of pages allocated. There | ||
174 | * may be pages allocated even if we return an error | ||
175 | * | ||
176 | * total_in is used to return the number of bytes actually read. It | ||
177 | * may be smaller than len if we had to exit early because we | ||
178 | * ran out of room in the pages array or because we cross the | ||
179 | * max_out threshold. | ||
180 | * | ||
181 | * total_out is used to return the total number of compressed bytes | ||
182 | * | ||
183 | * max_out tells us the max number of bytes that we're allowed to | ||
184 | * stuff into pages | ||
185 | */ | ||
186 | int btrfs_zlib_compress_pages(struct address_space *mapping, | ||
187 | u64 start, unsigned long len, | ||
188 | struct page **pages, | ||
189 | unsigned long nr_dest_pages, | ||
190 | unsigned long *out_pages, | ||
191 | unsigned long *total_in, | ||
192 | unsigned long *total_out, | ||
193 | unsigned long max_out) | ||
194 | { | ||
195 | int ret; | ||
196 | struct workspace *workspace; | ||
197 | char *data_in; | ||
198 | char *cpage_out; | ||
199 | int nr_pages = 0; | ||
200 | struct page *in_page = NULL; | ||
201 | struct page *out_page = NULL; | ||
204 | unsigned long bytes_left; | ||
205 | |||
206 | *out_pages = 0; | ||
207 | *total_out = 0; | ||
208 | *total_in = 0; | ||
209 | |||
210 | workspace = find_zlib_workspace(); | ||
211 | if (IS_ERR(workspace)) | ||
212 | return -1; | ||
213 | |||
214 | if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { | ||
215 | printk(KERN_WARNING "deflateInit failed\n"); | ||
216 | ret = -1; | ||
217 | goto out; | ||
218 | } | ||
219 | |||
220 | workspace->def_strm.total_in = 0; | ||
221 | workspace->def_strm.total_out = 0; | ||
222 | |||
223 | in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); | ||
224 | data_in = kmap(in_page); | ||
225 | |||
226 | out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
227 | cpage_out = kmap(out_page); | ||
228 | pages[0] = out_page; | ||
229 | nr_pages = 1; | ||
230 | |||
231 | workspace->def_strm.next_in = data_in; | ||
232 | workspace->def_strm.next_out = cpage_out; | ||
233 | workspace->def_strm.avail_out = PAGE_CACHE_SIZE; | ||
234 | workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); | ||
235 | |||
239 | while (workspace->def_strm.total_in < len) { | ||
240 | ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); | ||
241 | if (ret != Z_OK) { | ||
242 | printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", | ||
243 | ret); | ||
244 | zlib_deflateEnd(&workspace->def_strm); | ||
245 | ret = -1; | ||
246 | goto out; | ||
247 | } | ||
248 | |||
249 | /* we're making it bigger, give up */ | ||
250 | if (workspace->def_strm.total_in > 8192 && | ||
251 | workspace->def_strm.total_in < | ||
252 | workspace->def_strm.total_out) { | ||
253 | ret = -1; | ||
254 | goto out; | ||
255 | } | ||
256 | /* we need another page for writing out. Test this | ||
257 | * before the total_in so we will pull in a new page for | ||
258 | * the stream end if required | ||
259 | */ | ||
260 | if (workspace->def_strm.avail_out == 0) { | ||
261 | kunmap(out_page); | ||
262 | if (nr_pages == nr_dest_pages) { | ||
263 | out_page = NULL; | ||
264 | ret = -1; | ||
265 | goto out; | ||
266 | } | ||
267 | out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
268 | cpage_out = kmap(out_page); | ||
269 | pages[nr_pages] = out_page; | ||
270 | nr_pages++; | ||
271 | workspace->def_strm.avail_out = PAGE_CACHE_SIZE; | ||
272 | workspace->def_strm.next_out = cpage_out; | ||
273 | } | ||
274 | /* we're all done */ | ||
275 | if (workspace->def_strm.total_in >= len) | ||
276 | break; | ||
277 | |||
278 | /* we've read in a full page, get a new one */ | ||
279 | if (workspace->def_strm.avail_in == 0) { | ||
280 | if (workspace->def_strm.total_out > max_out) | ||
281 | break; | ||
282 | |||
283 | bytes_left = len - workspace->def_strm.total_in; | ||
284 | kunmap(in_page); | ||
285 | page_cache_release(in_page); | ||
286 | |||
287 | start += PAGE_CACHE_SIZE; | ||
288 | in_page = find_get_page(mapping, | ||
289 | start >> PAGE_CACHE_SHIFT); | ||
290 | data_in = kmap(in_page); | ||
291 | workspace->def_strm.avail_in = min(bytes_left, | ||
292 | PAGE_CACHE_SIZE); | ||
293 | workspace->def_strm.next_in = data_in; | ||
294 | } | ||
295 | } | ||
296 | workspace->def_strm.avail_in = 0; | ||
297 | ret = zlib_deflate(&workspace->def_strm, Z_FINISH); | ||
298 | zlib_deflateEnd(&workspace->def_strm); | ||
299 | |||
300 | if (ret != Z_STREAM_END) { | ||
301 | ret = -1; | ||
302 | goto out; | ||
303 | } | ||
304 | |||
305 | if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { | ||
306 | ret = -1; | ||
307 | goto out; | ||
308 | } | ||
309 | |||
310 | ret = 0; | ||
311 | *total_out = workspace->def_strm.total_out; | ||
312 | *total_in = workspace->def_strm.total_in; | ||
313 | out: | ||
314 | *out_pages = nr_pages; | ||
315 | if (out_page) | ||
316 | kunmap(out_page); | ||
317 | |||
318 | if (in_page) { | ||
319 | kunmap(in_page); | ||
320 | page_cache_release(in_page); | ||
321 | } | ||
322 | free_workspace(workspace); | ||
323 | return ret; | ||
324 | } | ||
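A hypothetical caller sketch; the 32-page array and 128K range are illustrative only. Per the comment above the function, output pages may have been allocated even when an error is returned, so the caller must release them itself:

	static int example_compress_range(struct address_space *mapping, u64 start)
	{
		struct page *pages[32];
		unsigned long nr_pages, total_in, total_out;
		unsigned long len = 128 * 1024;
		int ret;

		ret = btrfs_zlib_compress_pages(mapping, start, len,
						pages, 32, &nr_pages,
						&total_in, &total_out,
						len - PAGE_CACHE_SIZE);
		if (ret < 0) {
			/* error: drop whatever output pages were handed back */
			while (nr_pages--)
				page_cache_release(pages[nr_pages]);
			return ret;
		}
		/* success: total_out compressed bytes in pages[0..nr_pages-1] */
		return 0;
	}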
325 | |||
326 | /* | ||
327 | * pages_in is an array of pages with compressed data. | ||
328 | * | ||
329 | * disk_start is the starting logical offset of this array in the file | ||
330 | * | ||
331 | * bvec is a bio_vec of pages from the file that we want to decompress into | ||
332 | * | ||
333 | * vcnt is the count of pages in the biovec | ||
334 | * | ||
335 | * srclen is the number of bytes in pages_in | ||
336 | * | ||
337 | * The basic idea is that we have a bio that was created by readpages. | ||
338 | * The pages in the bio are for the uncompressed data, and they may not | ||
339 | * be contiguous. They all correspond to the range of bytes covered by | ||
340 | * the compressed extent. | ||
341 | */ | ||
342 | int btrfs_zlib_decompress_biovec(struct page **pages_in, | ||
343 | u64 disk_start, | ||
344 | struct bio_vec *bvec, | ||
345 | int vcnt, | ||
346 | size_t srclen) | ||
347 | { | ||
348 | int ret = 0; | ||
349 | int wbits = MAX_WBITS; | ||
350 | struct workspace *workspace; | ||
351 | char *data_in; | ||
352 | size_t total_out = 0; | ||
353 | unsigned long page_bytes_left; | ||
354 | unsigned long page_in_index = 0; | ||
355 | unsigned long page_out_index = 0; | ||
356 | struct page *page_out; | ||
357 | unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / | ||
358 | PAGE_CACHE_SIZE; | ||
359 | unsigned long buf_start; | ||
360 | unsigned long buf_offset; | ||
361 | unsigned long bytes; | ||
362 | unsigned long working_bytes; | ||
363 | unsigned long pg_offset; | ||
364 | unsigned long start_byte; | ||
365 | unsigned long current_buf_start; | ||
366 | char *kaddr; | ||
367 | |||
368 | workspace = find_zlib_workspace(); | ||
369 | if (IS_ERR(workspace)) | ||
370 | return PTR_ERR(workspace); | ||
371 | |||
372 | data_in = kmap(pages_in[page_in_index]); | ||
373 | workspace->inf_strm.next_in = data_in; | ||
374 | workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); | ||
375 | workspace->inf_strm.total_in = 0; | ||
376 | |||
377 | workspace->inf_strm.total_out = 0; | ||
378 | workspace->inf_strm.next_out = workspace->buf; | ||
379 | workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; | ||
380 | page_out = bvec[page_out_index].bv_page; | ||
381 | page_bytes_left = PAGE_CACHE_SIZE; | ||
382 | pg_offset = 0; | ||
383 | |||
384 | /* If it's deflate, and it's got no preset dictionary, then | ||
385 | we can tell zlib to skip the adler32 check. */ | ||
386 | if (srclen > 2 && !(data_in[1] & PRESET_DICT) && | ||
387 | ((data_in[0] & 0x0f) == Z_DEFLATED) && | ||
388 | !(((data_in[0]<<8) + data_in[1]) % 31)) { | ||
389 | |||
390 | wbits = -((data_in[0] >> 4) + 8); | ||
391 | workspace->inf_strm.next_in += 2; | ||
392 | workspace->inf_strm.avail_in -= 2; | ||
393 | } | ||
394 | |||
395 | if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { | ||
396 | printk(KERN_WARNING "inflateInit failed\n"); | ||
397 | ret = -1; | ||
398 | goto out; | ||
399 | } | ||
400 | while (workspace->inf_strm.total_in < srclen) { | ||
401 | ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); | ||
402 | if (ret != Z_OK && ret != Z_STREAM_END) | ||
403 | break; | ||
404 | /* | ||
405 | * buf_start is the byte offset, within the total inflated | ||
406 | * output, of the start of our workspace buffer | ||
407 | */ | ||
408 | buf_start = total_out; | ||
409 | |||
410 | /* total_out is the offset of the byte just past our workspace buffer */ | ||
411 | total_out = workspace->inf_strm.total_out; | ||
412 | |||
413 | working_bytes = total_out - buf_start; | ||
414 | |||
415 | /* | ||
416 | * start byte is the first byte of the page we're currently | ||
417 | * copying into relative to the start of the compressed data. | ||
418 | */ | ||
419 | start_byte = page_offset(page_out) - disk_start; | ||
420 | |||
421 | if (working_bytes == 0) { | ||
422 | /* we didn't make progress in this inflate | ||
423 | * call, we're done | ||
424 | */ | ||
425 | if (ret != Z_STREAM_END) | ||
426 | ret = -1; | ||
427 | break; | ||
428 | } | ||
429 | |||
430 | /* we haven't yet hit data corresponding to this page */ | ||
431 | if (total_out <= start_byte) | ||
432 | goto next; | ||
433 | |||
434 | /* | ||
435 | * the start of the data we care about is offset into | ||
436 | * the middle of our working buffer | ||
437 | */ | ||
438 | if (total_out > start_byte && buf_start < start_byte) { | ||
439 | buf_offset = start_byte - buf_start; | ||
440 | working_bytes -= buf_offset; | ||
441 | } else { | ||
442 | buf_offset = 0; | ||
443 | } | ||
444 | current_buf_start = buf_start; | ||
445 | |||
446 | /* copy bytes from the working buffer into the pages */ | ||
447 | while (working_bytes > 0) { | ||
448 | bytes = min(PAGE_CACHE_SIZE - pg_offset, | ||
449 | PAGE_CACHE_SIZE - buf_offset); | ||
450 | bytes = min(bytes, working_bytes); | ||
451 | kaddr = kmap_atomic(page_out, KM_USER0); | ||
452 | memcpy(kaddr + pg_offset, workspace->buf + buf_offset, | ||
453 | bytes); | ||
454 | kunmap_atomic(kaddr, KM_USER0); | ||
455 | flush_dcache_page(page_out); | ||
456 | |||
457 | pg_offset += bytes; | ||
458 | page_bytes_left -= bytes; | ||
459 | buf_offset += bytes; | ||
460 | working_bytes -= bytes; | ||
461 | current_buf_start += bytes; | ||
462 | |||
463 | /* check if we need to pick another page */ | ||
464 | if (page_bytes_left == 0) { | ||
465 | page_out_index++; | ||
466 | if (page_out_index >= vcnt) { | ||
467 | ret = 0; | ||
468 | goto done; | ||
469 | } | ||
470 | |||
471 | page_out = bvec[page_out_index].bv_page; | ||
472 | pg_offset = 0; | ||
473 | page_bytes_left = PAGE_CACHE_SIZE; | ||
474 | start_byte = page_offset(page_out) - disk_start; | ||
475 | |||
476 | /* | ||
477 | * make sure our new page is covered by this | ||
478 | * working buffer | ||
479 | */ | ||
480 | if (total_out <= start_byte) | ||
481 | goto next; | ||
482 | |||
483 | /* the next page in the biovec might not | ||
484 | * be adjacent to the last page, but it | ||
485 | * might still be found inside this working | ||
486 | * buffer. bump our offset pointer | ||
487 | */ | ||
488 | if (total_out > start_byte && | ||
489 | current_buf_start < start_byte) { | ||
490 | buf_offset = start_byte - buf_start; | ||
491 | working_bytes = total_out - start_byte; | ||
492 | current_buf_start = buf_start + | ||
493 | buf_offset; | ||
494 | } | ||
495 | } | ||
496 | } | ||
497 | next: | ||
498 | workspace->inf_strm.next_out = workspace->buf; | ||
499 | workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; | ||
500 | |||
501 | if (workspace->inf_strm.avail_in == 0) { | ||
502 | unsigned long tmp; | ||
503 | kunmap(pages_in[page_in_index]); | ||
504 | page_in_index++; | ||
505 | if (page_in_index >= total_pages_in) { | ||
506 | data_in = NULL; | ||
507 | break; | ||
508 | } | ||
509 | data_in = kmap(pages_in[page_in_index]); | ||
510 | workspace->inf_strm.next_in = data_in; | ||
511 | tmp = srclen - workspace->inf_strm.total_in; | ||
512 | workspace->inf_strm.avail_in = min(tmp, | ||
513 | PAGE_CACHE_SIZE); | ||
514 | } | ||
515 | } | ||
516 | if (ret != Z_STREAM_END) | ||
517 | ret = -1; | ||
518 | else | ||
519 | ret = 0; | ||
520 | done: | ||
521 | zlib_inflateEnd(&workspace->inf_strm); | ||
522 | if (data_in) | ||
523 | kunmap(pages_in[page_in_index]); | ||
524 | out: | ||
525 | free_workspace(workspace); | ||
526 | return ret; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * a less complex decompression routine. Our compressed data fits in a | ||
531 | * single page, and we want to read a single page out of it. | ||
532 | * start_byte tells us the offset into the uncompressed data we're interested in | ||
533 | */ | ||
534 | int btrfs_zlib_decompress(unsigned char *data_in, | ||
535 | struct page *dest_page, | ||
536 | unsigned long start_byte, | ||
537 | size_t srclen, size_t destlen) | ||
538 | { | ||
539 | int ret = 0; | ||
540 | int wbits = MAX_WBITS; | ||
541 | struct workspace *workspace; | ||
542 | unsigned long bytes_left = destlen; | ||
543 | unsigned long total_out = 0; | ||
544 | char *kaddr; | ||
545 | |||
546 | if (destlen > PAGE_CACHE_SIZE) | ||
547 | return -ENOMEM; | ||
548 | |||
549 | workspace = find_zlib_workspace(); | ||
550 | if (!workspace) | ||
551 | return -ENOMEM; | ||
552 | |||
553 | workspace->inf_strm.next_in = data_in; | ||
554 | workspace->inf_strm.avail_in = srclen; | ||
555 | workspace->inf_strm.total_in = 0; | ||
556 | |||
557 | workspace->inf_strm.next_out = workspace->buf; | ||
558 | workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; | ||
559 | workspace->inf_strm.total_out = 0; | ||
560 | /* If it's deflate, and it's got no preset dictionary, then | ||
561 | we can tell zlib to skip the adler32 check. */ | ||
562 | if (srclen > 2 && !(data_in[1] & PRESET_DICT) && | ||
563 | ((data_in[0] & 0x0f) == Z_DEFLATED) && | ||
564 | !(((data_in[0]<<8) + data_in[1]) % 31)) { | ||
565 | |||
566 | wbits = -((data_in[0] >> 4) + 8); | ||
567 | workspace->inf_strm.next_in += 2; | ||
568 | workspace->inf_strm.avail_in -= 2; | ||
569 | } | ||
570 | |||
571 | if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { | ||
572 | printk(KERN_WARNING "inflateInit failed\n"); | ||
573 | ret = -1; | ||
574 | goto out; | ||
575 | } | ||
576 | |||
577 | while (bytes_left > 0) { | ||
578 | unsigned long buf_start; | ||
579 | unsigned long buf_offset; | ||
580 | unsigned long bytes; | ||
581 | unsigned long pg_offset = 0; | ||
582 | |||
583 | ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); | ||
584 | if (ret != Z_OK && ret != Z_STREAM_END) | ||
585 | break; | ||
586 | |||
587 | buf_start = total_out; | ||
588 | total_out = workspace->inf_strm.total_out; | ||
589 | |||
590 | if (total_out == buf_start) { | ||
591 | ret = -1; | ||
592 | break; | ||
593 | } | ||
594 | |||
595 | if (total_out <= start_byte) | ||
596 | goto next; | ||
597 | |||
598 | if (total_out > start_byte && buf_start < start_byte) | ||
599 | buf_offset = start_byte - buf_start; | ||
600 | else | ||
601 | buf_offset = 0; | ||
602 | |||
603 | bytes = min(PAGE_CACHE_SIZE - pg_offset, | ||
604 | PAGE_CACHE_SIZE - buf_offset); | ||
605 | bytes = min(bytes, bytes_left); | ||
606 | |||
607 | kaddr = kmap_atomic(dest_page, KM_USER0); | ||
608 | memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); | ||
609 | kunmap_atomic(kaddr, KM_USER0); | ||
610 | |||
611 | pg_offset += bytes; | ||
612 | bytes_left -= bytes; | ||
613 | next: | ||
614 | workspace->inf_strm.next_out = workspace->buf; | ||
615 | workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; | ||
616 | } | ||
617 | |||
618 | if (ret != Z_STREAM_END && bytes_left != 0) | ||
619 | ret = -1; | ||
620 | else | ||
621 | ret = 0; | ||
622 | |||
623 | zlib_inflateEnd(&workspace->inf_strm); | ||
624 | out: | ||
625 | free_workspace(workspace); | ||
626 | return ret; | ||
627 | } | ||
628 | |||
629 | void btrfs_zlib_exit(void) | ||
630 | { | ||
631 | free_workspaces(); | ||
632 | } | ||
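
The header-skipping test in btrfs_zlib_decompress() above can be exercised outside the kernel. A minimal userspace sketch follows; raw_deflate_wbits() is a made-up helper name, not kernel API, and the sketch assumes only the RFC 1950 layout of the two-byte zlib header (CMF/FLG bytes).

#include <stdio.h>

#define Z_DEFLATED  8      /* compression method nibble, RFC 1950 */
#define PRESET_DICT 0x20   /* FDICT bit in the FLG byte */

/*
 * Return the negative windowBits to hand to zlib's inflateInit2() for
 * raw deflate when the buffer starts with a plain zlib header and no
 * preset dictionary, or 0 when the header cannot be skipped.
 */
static int raw_deflate_wbits(const unsigned char *buf, size_t len)
{
	if (len <= 2)
		return 0;
	if ((buf[0] & 0x0f) != Z_DEFLATED)	/* CMF: method must be deflate */
		return 0;
	if (buf[1] & PRESET_DICT)		/* FLG: preset dictionary present */
		return 0;
	if (((buf[0] << 8) + buf[1]) % 31)	/* FCHECK: header check failed */
		return 0;
	return -((buf[0] >> 4) + 8);		/* CINFO: log2(window size) - 8 */
}

int main(void)
{
	const unsigned char hdr[] = { 0x78, 0x9c };	/* common zlib header */

	printf("wbits = %d\n", raw_deflate_wbits(hdr, sizeof(hdr)));	/* -15 */
	return 0;
}

With a valid header and no preset dictionary, the caller can skip the two header bytes, use a negative windowBits, and let zlib bypass the adler32 check, exactly as the function above does.
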
diff --git a/fs/buffer.c b/fs/buffer.c index a13f09b696f7..b6e8b8632e2f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev) | |||
203 | * happen on bdev until thaw_bdev() is called. | 203 | * happen on bdev until thaw_bdev() is called. |
204 | * If a superblock is found on this device, we take the s_umount semaphore | 204 | * If a superblock is found on this device, we take the s_umount semaphore |
205 | * on it to make sure nobody unmounts until the snapshot creation is done. | 205 | * on it to make sure nobody unmounts until the snapshot creation is done. |
206 | * The reference counter (bd_fsfreeze_count) guarantees that only the last | ||
207 | * unfreeze process can actually unfreeze the frozen filesystem when | ||
208 | * multiple freeze requests arrive simultaneously. It counts up in | ||
209 | * freeze_bdev() and counts down in thaw_bdev(). When it reaches 0, | ||
210 | * thaw_bdev() actually unfreezes the filesystem. | ||
206 | */ | 211 | */ |
207 | struct super_block *freeze_bdev(struct block_device *bdev) | 212 | struct super_block *freeze_bdev(struct block_device *bdev) |
208 | { | 213 | { |
209 | struct super_block *sb; | 214 | struct super_block *sb; |
215 | int error = 0; | ||
216 | |||
217 | mutex_lock(&bdev->bd_fsfreeze_mutex); | ||
218 | if (bdev->bd_fsfreeze_count > 0) { | ||
219 | bdev->bd_fsfreeze_count++; | ||
220 | sb = get_super(bdev); | ||
221 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | ||
222 | return sb; | ||
223 | } | ||
224 | bdev->bd_fsfreeze_count++; | ||
210 | 225 | ||
211 | down(&bdev->bd_mount_sem); | 226 | down(&bdev->bd_mount_sem); |
212 | sb = get_super(bdev); | 227 | sb = get_super(bdev); |
@@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev) | |||
221 | 236 | ||
222 | sync_blockdev(sb->s_bdev); | 237 | sync_blockdev(sb->s_bdev); |
223 | 238 | ||
224 | if (sb->s_op->write_super_lockfs) | 239 | if (sb->s_op->freeze_fs) { |
225 | sb->s_op->write_super_lockfs(sb); | 240 | error = sb->s_op->freeze_fs(sb); |
241 | if (error) { | ||
242 | printk(KERN_ERR | ||
243 | "VFS:Filesystem freeze failed\n"); | ||
244 | sb->s_frozen = SB_UNFROZEN; | ||
245 | drop_super(sb); | ||
246 | up(&bdev->bd_mount_sem); | ||
247 | bdev->bd_fsfreeze_count--; | ||
248 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | ||
249 | return ERR_PTR(error); | ||
250 | } | ||
251 | } | ||
226 | } | 252 | } |
227 | 253 | ||
228 | sync_blockdev(bdev); | 254 | sync_blockdev(bdev); |
255 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | ||
256 | |||
229 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ | 257 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ |
230 | } | 258 | } |
231 | EXPORT_SYMBOL(freeze_bdev); | 259 | EXPORT_SYMBOL(freeze_bdev); |
@@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev); | |||
237 | * | 265 | * |
238 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). | 266 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). |
239 | */ | 267 | */ |
240 | void thaw_bdev(struct block_device *bdev, struct super_block *sb) | 268 | int thaw_bdev(struct block_device *bdev, struct super_block *sb) |
241 | { | 269 | { |
270 | int error = 0; | ||
271 | |||
272 | mutex_lock(&bdev->bd_fsfreeze_mutex); | ||
273 | if (!bdev->bd_fsfreeze_count) { | ||
274 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | ||
275 | return -EINVAL; | ||
276 | } | ||
277 | |||
278 | bdev->bd_fsfreeze_count--; | ||
279 | if (bdev->bd_fsfreeze_count > 0) { | ||
280 | if (sb) | ||
281 | drop_super(sb); | ||
282 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | ||
283 | return 0; | ||
284 | } | ||
285 | |||
242 | if (sb) { | 286 | if (sb) { |
243 | BUG_ON(sb->s_bdev != bdev); | 287 | BUG_ON(sb->s_bdev != bdev); |
244 | 288 | if (!(sb->s_flags & MS_RDONLY)) { | |
245 | if (sb->s_op->unlockfs) | 289 | if (sb->s_op->unfreeze_fs) { |
246 | sb->s_op->unlockfs(sb); | 290 | error = sb->s_op->unfreeze_fs(sb); |
247 | sb->s_frozen = SB_UNFROZEN; | 291 | if (error) { |
248 | smp_wmb(); | 292 | printk(KERN_ERR |
249 | wake_up(&sb->s_wait_unfrozen); | 293 | "VFS: Filesystem thaw failed\n");
294 | sb->s_frozen = SB_FREEZE_TRANS; | ||
295 | bdev->bd_fsfreeze_count++; | ||
296 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | ||
297 | return error; | ||
298 | } | ||
299 | } | ||
300 | sb->s_frozen = SB_UNFROZEN; | ||
301 | smp_wmb(); | ||
302 | wake_up(&sb->s_wait_unfrozen); | ||
303 | } | ||
250 | drop_super(sb); | 304 | drop_super(sb); |
251 | } | 305 | } |
252 | 306 | ||
253 | up(&bdev->bd_mount_sem); | 307 | up(&bdev->bd_mount_sem); |
308 | mutex_unlock(&bdev->bd_fsfreeze_mutex); | ||
309 | return 0; | ||
254 | } | 310 | } |
255 | EXPORT_SYMBOL(thaw_bdev); | 311 | EXPORT_SYMBOL(thaw_bdev); |
256 | 312 | ||
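
The counter rules documented above — the first freeze actually freezes, the last thaw actually thaws, a stray thaw fails with -EINVAL — can be modeled stand-alone. A minimal sketch assuming nothing beyond the counter logic itself:

#include <stdio.h>

static int fsfreeze_count;	/* models bdev->bd_fsfreeze_count */

static void model_freeze(void)
{
	if (fsfreeze_count++ == 0)
		printf("freeze: filesystem actually frozen\n");
	else
		printf("freeze: already frozen, count now %d\n", fsfreeze_count);
}

static int model_thaw(void)
{
	if (!fsfreeze_count)
		return -22;	/* -EINVAL: nothing to thaw */
	if (--fsfreeze_count == 0)
		printf("thaw: filesystem actually thawed\n");
	else
		printf("thaw: still frozen, count now %d\n", fsfreeze_count);
	return 0;
}

int main(void)
{
	model_freeze();		/* does the real freeze */
	model_freeze();		/* only bumps the counter */
	model_thaw();		/* only drops the counter */
	model_thaw();		/* does the real thaw */
	printf("extra thaw -> %d\n", model_thaw());	/* -22 */
	return 0;
}
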
@@ -2022,7 +2078,6 @@ int block_write_begin(struct file *file, struct address_space *mapping, | |||
2022 | if (pos + len > inode->i_size) | 2078 | if (pos + len > inode->i_size) |
2023 | vmtruncate(inode, inode->i_size); | 2079 | vmtruncate(inode, inode->i_size); |
2024 | } | 2080 | } |
2025 | goto out; | ||
2026 | } | 2081 | } |
2027 | 2082 | ||
2028 | out: | 2083 | out: |
diff --git a/fs/char_dev.c b/fs/char_dev.c index 700697a72618..38f71222a552 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c | |||
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, | |||
120 | cd->major = major; | 120 | cd->major = major; |
121 | cd->baseminor = baseminor; | 121 | cd->baseminor = baseminor; |
122 | cd->minorct = minorct; | 122 | cd->minorct = minorct; |
123 | strncpy(cd->name,name, 64); | 123 | strlcpy(cd->name, name, sizeof(cd->name)); |
124 | 124 | ||
125 | i = major_to_index(major); | 125 | i = major_to_index(major); |
126 | 126 | ||
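
The strncpy() to strlcpy() conversion above matters because strncpy() leaves the destination unterminated whenever the source is at least as long as the buffer. A minimal userspace sketch, with a local my_strlcpy() since glibc does not provide one:

#include <stdio.h>
#include <string.h>

/* strlcpy() semantics: always NUL-terminate, return strlen(src) */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = (len >= size) ? size - 1 : len;

		memcpy(dst, src, n);
		dst[n] = '\0';
	}
	return len;
}

int main(void)
{
	char a[8], b[8];
	const char *name = "very-long-device-name";

	strncpy(a, name, sizeof(a));	/* a is NOT NUL-terminated */
	my_strlcpy(b, name, sizeof(b));	/* b == "very-lo", terminated */
	printf("%s\n", b);
	(void)a;
	return 0;
}
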
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 81b7771c6465..43c96ce29614 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c | |||
@@ -11,7 +11,9 @@ | |||
11 | 11 | ||
12 | #include "coda_int.h" | 12 | #include "coda_int.h" |
13 | 13 | ||
14 | #ifdef CONFIG_SYSCTL | ||
14 | static struct ctl_table_header *fs_table_header; | 15 | static struct ctl_table_header *fs_table_header; |
16 | #endif | ||
15 | 17 | ||
16 | static ctl_table coda_table[] = { | 18 | static ctl_table coda_table[] = { |
17 | { | 19 | { |
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = { | |||
41 | {} | 43 | {} |
42 | }; | 44 | }; |
43 | 45 | ||
46 | #ifdef CONFIG_SYSCTL | ||
44 | static ctl_table fs_table[] = { | 47 | static ctl_table fs_table[] = { |
45 | { | 48 | { |
46 | .ctl_name = CTL_UNNUMBERED, | 49 | .ctl_name = CTL_UNNUMBERED, |
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = { | |||
50 | }, | 53 | }, |
51 | {} | 54 | {} |
52 | }; | 55 | }; |
53 | 56 | #endif | |
54 | 57 | ||
55 | void coda_sysctl_init(void) | 58 | void coda_sysctl_init(void) |
56 | { | 59 | { |
diff --git a/fs/compat.c b/fs/compat.c index d1ece79b6411..30f2faa22f5c 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign | |||
1187 | ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); | 1187 | ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); |
1188 | 1188 | ||
1189 | out: | 1189 | out: |
1190 | if (ret > 0) | ||
1191 | add_rchar(current, ret); | ||
1192 | inc_syscr(current); | ||
1190 | fput(file); | 1193 | fput(file); |
1191 | return ret; | 1194 | return ret; |
1192 | } | 1195 | } |
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig | |||
1210 | ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); | 1213 | ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); |
1211 | 1214 | ||
1212 | out: | 1215 | out: |
1216 | if (ret > 0) | ||
1217 | add_wchar(current, ret); | ||
1218 | inc_syscw(current); | ||
1213 | fput(file); | 1219 | fput(file); |
1214 | return ret; | 1220 | return ret; |
1215 | } | 1221 | } |
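
The add_rchar()/add_wchar() and inc_syscr()/inc_syscw() calls added above feed the per-task I/O counters exported through /proc/<pid>/io on kernels built with task I/O accounting. A quick way to watch those counters from userspace, assuming only that procfs is mounted:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/io", "r");
	char line[128];

	if (!f) {
		perror("fopen");	/* kernel without task I/O accounting? */
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* rchar:, wchar:, syscr:, syscw:, ... */
	fclose(f);
	return 0;
}
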
diff --git a/fs/dcache.c b/fs/dcache.c index e88c23b85a32..4547f66884a0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry) | |||
1567 | spin_unlock(&dcache_lock); | 1567 | spin_unlock(&dcache_lock); |
1568 | } | 1568 | } |
1569 | 1569 | ||
1570 | #define do_switch(x,y) do { \ | ||
1571 | __typeof__ (x) __tmp = x; \ | ||
1572 | x = y; y = __tmp; } while (0) | ||
1573 | |||
1574 | /* | 1570 | /* |
1575 | * When switching names, the actual string doesn't strictly have to | 1571 | * When switching names, the actual string doesn't strictly have to |
1576 | * be preserved in the target - because we're dropping the target | 1572 | * be preserved in the target - because we're dropping the target |
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target) | |||
1589 | /* | 1585 | /* |
1590 | * Both external: swap the pointers | 1586 | * Both external: swap the pointers |
1591 | */ | 1587 | */ |
1592 | do_switch(target->d_name.name, dentry->d_name.name); | 1588 | swap(target->d_name.name, dentry->d_name.name); |
1593 | } else { | 1589 | } else { |
1594 | /* | 1590 | /* |
1595 | * dentry:internal, target:external. Steal target's | 1591 | * dentry:internal, target:external. Steal target's |
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target) | |||
1620 | return; | 1616 | return; |
1621 | } | 1617 | } |
1622 | } | 1618 | } |
1623 | do_switch(dentry->d_name.len, target->d_name.len); | 1619 | swap(dentry->d_name.len, target->d_name.len); |
1624 | } | 1620 | } |
1625 | 1621 | ||
1626 | /* | 1622 | /* |
@@ -1680,7 +1676,7 @@ already_unhashed: | |||
1680 | 1676 | ||
1681 | /* Switch the names.. */ | 1677 | /* Switch the names.. */ |
1682 | switch_names(dentry, target); | 1678 | switch_names(dentry, target); |
1683 | do_switch(dentry->d_name.hash, target->d_name.hash); | 1679 | swap(dentry->d_name.hash, target->d_name.hash); |
1684 | 1680 | ||
1685 | /* ... and switch the parents */ | 1681 | /* ... and switch the parents */ |
1686 | if (IS_ROOT(dentry)) { | 1682 | if (IS_ROOT(dentry)) { |
@@ -1688,7 +1684,7 @@ already_unhashed: | |||
1688 | target->d_parent = target; | 1684 | target->d_parent = target; |
1689 | INIT_LIST_HEAD(&target->d_u.d_child); | 1685 | INIT_LIST_HEAD(&target->d_u.d_child); |
1690 | } else { | 1686 | } else { |
1691 | do_switch(dentry->d_parent, target->d_parent); | 1687 | swap(dentry->d_parent, target->d_parent); |
1692 | 1688 | ||
1693 | /* And add them back to the (new) parent lists */ | 1689 | /* And add them back to the (new) parent lists */ |
1694 | list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); | 1690 | list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); |
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) | |||
1789 | struct dentry *dparent, *aparent; | 1785 | struct dentry *dparent, *aparent; |
1790 | 1786 | ||
1791 | switch_names(dentry, anon); | 1787 | switch_names(dentry, anon); |
1792 | do_switch(dentry->d_name.hash, anon->d_name.hash); | 1788 | swap(dentry->d_name.hash, anon->d_name.hash); |
1793 | 1789 | ||
1794 | dparent = dentry->d_parent; | 1790 | dparent = dentry->d_parent; |
1795 | aparent = anon->d_parent; | 1791 | aparent = anon->d_parent; |
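
The removed do_switch() was an open-coded copy of the kernel's generic swap() macro from <linux/kernel.h>; a stand-alone sketch of the same __typeof__-based pattern:

#include <stdio.h>

/* same pattern as the kernel's swap() in <linux/kernel.h> */
#define swap(a, b) \
	do { __typeof__(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	unsigned int x = 1, y = 2;

	swap(x, y);
	printf("x=%u y=%u\n", x, y);	/* x=2 y=1 */
	return 0;
}
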
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 159a5efd6a8a..33a90120f6ad 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c | |||
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode, | |||
294 | } | 294 | } |
295 | EXPORT_SYMBOL_GPL(debugfs_create_x32); | 295 | EXPORT_SYMBOL_GPL(debugfs_create_x32); |
296 | 296 | ||
297 | |||
298 | static int debugfs_size_t_set(void *data, u64 val) | ||
299 | { | ||
300 | *(size_t *)data = val; | ||
301 | return 0; | ||
302 | } | ||
303 | static int debugfs_size_t_get(void *data, u64 *val) | ||
304 | { | ||
305 | *val = *(size_t *)data; | ||
306 | return 0; | ||
307 | } | ||
308 | DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, | ||
309 | "%llu\n"); /* %llu and %zu are more or less the same */ | ||
310 | |||
311 | /** | ||
312 | * debugfs_create_size_t - create a debugfs file that is used to read and write a size_t value | ||
313 | * @name: a pointer to a string containing the name of the file to create. | ||
314 | * @mode: the permission that the file should have | ||
315 | * @parent: a pointer to the parent dentry for this file. This should be a | ||
316 | * directory dentry if set. If this parameter is %NULL, then the | ||
317 | * file will be created in the root of the debugfs filesystem. | ||
318 | * @value: a pointer to the variable that the file should read from and | ||
319 | * write to. | ||
320 | */ | ||
321 | struct dentry *debugfs_create_size_t(const char *name, mode_t mode, | ||
322 | struct dentry *parent, size_t *value) | ||
323 | { | ||
324 | return debugfs_create_file(name, mode, parent, value, &fops_size_t); | ||
325 | } | ||
326 | EXPORT_SYMBOL_GPL(debugfs_create_size_t); | ||
327 | |||
328 | |||
297 | static ssize_t read_file_bool(struct file *file, char __user *user_buf, | 329 | static ssize_t read_file_bool(struct file *file, char __user *user_buf, |
298 | size_t count, loff_t *ppos) | 330 | size_t count, loff_t *ppos) |
299 | { | 331 | { |
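
A sketch of how a module might use the new debugfs_create_size_t() helper; the "demo" names and the 0644 mode are illustrative, and the call pattern simply mirrors the other debugfs_create_*() helpers:

#include <linux/module.h>
#include <linux/debugfs.h>

static struct dentry *demo_dir;
static size_t demo_bytes;

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("demo", NULL);
	if (!demo_dir)
		return -ENOMEM;
	/* shows up as /sys/kernel/debug/demo/bytes */
	debugfs_create_size_t("bytes", 0644, demo_dir, &demo_bytes);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
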
diff --git a/fs/direct-io.c b/fs/direct-io.c index af0558dbe8b7..b6d43908ff7a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1209 | retval = direct_io_worker(rw, iocb, inode, iov, offset, | 1209 | retval = direct_io_worker(rw, iocb, inode, iov, offset, |
1210 | nr_segs, blkbits, get_block, end_io, dio); | 1210 | nr_segs, blkbits, get_block, end_io, dio); |
1211 | 1211 | ||
1212 | /* | ||
1213 | * In case of error, an extending write may have instantiated a few | ||
1214 | * blocks outside i_size. Trim these off again for DIO_LOCKING. | ||
1215 | * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this on | ||
1216 | * their own. | ||
1217 | */ | ||
1218 | if (unlikely(retval < 0 && (rw & WRITE))) { | ||
1219 | loff_t isize = i_size_read(inode); | ||
1220 | |||
1221 | if (end > isize && dio_lock_type == DIO_LOCKING) | ||
1222 | vmtruncate(inode, isize); | ||
1223 | } | ||
1224 | |||
1212 | if (rw == READ && dio_lock_type == DIO_LOCKING) | 1225 | if (rw == READ && dio_lock_type == DIO_LOCKING) |
1213 | release_i_mutex = 0; | 1226 | release_i_mutex = 0; |
1214 | 1227 | ||
diff --git a/fs/dquot.c b/fs/dquot.c index 61bfff64e5af..48c0571f831d 100644 --- a/fs/dquot.c +++ b/fs/dquot.c | |||
@@ -2090,10 +2090,12 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) | |||
2090 | } | 2090 | } |
2091 | if (di->dqb_valid & QIF_BTIME) { | 2091 | if (di->dqb_valid & QIF_BTIME) { |
2092 | dm->dqb_btime = di->dqb_btime; | 2092 | dm->dqb_btime = di->dqb_btime; |
2093 | check_blim = 1; | ||
2093 | __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); | 2094 | __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); |
2094 | } | 2095 | } |
2095 | if (di->dqb_valid & QIF_ITIME) { | 2096 | if (di->dqb_valid & QIF_ITIME) { |
2096 | dm->dqb_itime = di->dqb_itime; | 2097 | dm->dqb_itime = di->dqb_itime; |
2098 | check_ilim = 1; | ||
2097 | __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); | 2099 | __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); |
2098 | } | 2100 | } |
2099 | 2101 | ||
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 6046239465a1..c01e043670e2 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c | |||
@@ -175,8 +175,8 @@ out: | |||
175 | * | 175 | * |
176 | * Returns zero on success; non-zero on error. | 176 | * Returns zero on success; non-zero on error. |
177 | */ | 177 | */ |
178 | static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, | 178 | int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, |
179 | loff_t offset) | 179 | loff_t offset) |
180 | { | 180 | { |
181 | int rc = 0; | 181 | int rc = 0; |
182 | char dst[MD5_DIGEST_SIZE]; | 182 | char dst[MD5_DIGEST_SIZE]; |
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags( | |||
924 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; | 924 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; |
925 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) | 925 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) |
926 | crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; | 926 | crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; |
927 | if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { | ||
928 | crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; | ||
929 | if (mount_crypt_stat->flags | ||
930 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) | ||
931 | crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; | ||
932 | else if (mount_crypt_stat->flags | ||
933 | & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) | ||
934 | crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; | ||
935 | } | ||
927 | } | 936 | } |
928 | 937 | ||
929 | static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( | 938 | static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( |
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem { | |||
1060 | static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { | 1069 | static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { |
1061 | {0x00000001, ECRYPTFS_ENABLE_HMAC}, | 1070 | {0x00000001, ECRYPTFS_ENABLE_HMAC}, |
1062 | {0x00000002, ECRYPTFS_ENCRYPTED}, | 1071 | {0x00000002, ECRYPTFS_ENCRYPTED}, |
1063 | {0x00000004, ECRYPTFS_METADATA_IN_XATTR} | 1072 | {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, |
1073 | {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} | ||
1064 | }; | 1074 | }; |
1065 | 1075 | ||
1066 | /** | 1076 | /** |
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = { | |||
1149 | 1159 | ||
1150 | /** | 1160 | /** |
1151 | * ecryptfs_code_for_cipher_string | 1161 | * ecryptfs_code_for_cipher_string |
1152 | * @crypt_stat: The cryptographic context | 1162 | * @cipher_name: The string alias for the cipher |
1163 | * @key_bytes: Length of key in bytes; used for AES code selection | ||
1153 | * | 1164 | * |
1154 | * Returns zero on no match, or the cipher code on match | 1165 | * Returns zero on no match, or the cipher code on match |
1155 | */ | 1166 | */ |
1156 | u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) | 1167 | u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) |
1157 | { | 1168 | { |
1158 | int i; | 1169 | int i; |
1159 | u8 code = 0; | 1170 | u8 code = 0; |
1160 | struct ecryptfs_cipher_code_str_map_elem *map = | 1171 | struct ecryptfs_cipher_code_str_map_elem *map = |
1161 | ecryptfs_cipher_code_str_map; | 1172 | ecryptfs_cipher_code_str_map; |
1162 | 1173 | ||
1163 | if (strcmp(crypt_stat->cipher, "aes") == 0) { | 1174 | if (strcmp(cipher_name, "aes") == 0) { |
1164 | switch (crypt_stat->key_size) { | 1175 | switch (key_bytes) { |
1165 | case 16: | 1176 | case 16: |
1166 | code = RFC2440_CIPHER_AES_128; | 1177 | code = RFC2440_CIPHER_AES_128; |
1167 | break; | 1178 | break; |
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) | |||
1173 | } | 1184 | } |
1174 | } else { | 1185 | } else { |
1175 | for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) | 1186 | for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) |
1176 | if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ | 1187 | if (strcmp(cipher_name, map[i].cipher_str) == 0) { |
1177 | code = map[i].cipher_code; | 1188 | code = map[i].cipher_code; |
1178 | break; | 1189 | break; |
1179 | } | 1190 | } |
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data, | |||
1212 | &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); | 1223 | &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); |
1213 | int rc; | 1224 | int rc; |
1214 | 1225 | ||
1226 | if (crypt_stat->extent_size == 0) | ||
1227 | crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; | ||
1215 | rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, | 1228 | rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, |
1216 | ecryptfs_inode); | 1229 | ecryptfs_inode); |
1217 | if (rc) { | 1230 | if (rc) { |
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data, | |||
1221 | } | 1234 | } |
1222 | if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { | 1235 | if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { |
1223 | rc = -EINVAL; | 1236 | rc = -EINVAL; |
1224 | ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n"); | ||
1225 | } | 1237 | } |
1226 | out: | 1238 | out: |
1227 | return rc; | 1239 | return rc; |
@@ -1628,95 +1640,95 @@ out: | |||
1628 | } | 1640 | } |
1629 | 1641 | ||
1630 | /** | 1642 | /** |
1631 | * ecryptfs_encode_filename - converts a plaintext file name to cipher text | 1643 | * ecryptfs_encrypt_filename - encrypt filename |
1632 | * @crypt_stat: The crypt_stat struct associated with the file anem to encode | ||
1633 | * @name: The plaintext name | ||
1634 | * @length: The length of the plaintext | ||
1635 | * @encoded_name: The encypted name | ||
1636 | * | 1644 | * |
1637 | * Encrypts and encodes a filename into something that constitutes a | 1645 | * CBC-encrypts the filename. We do not want to encrypt the same |
1638 | * valid filename for a filesystem, with printable characters. | 1646 | * filename with the same key and IV, which may happen with hard |
1647 | * links, so we prepend random bits to each filename. | ||
1639 | * | 1648 | * |
1640 | * We assume that we have a properly initialized crypto context, | 1649 | * Returns zero on success; non-zero otherwise |
1641 | * pointed to by crypt_stat->tfm. | ||
1642 | * | ||
1643 | * TODO: Implement filename decoding and decryption here, in place of | ||
1644 | * memcpy. We are keeping the framework around for now to (1) | ||
1645 | * facilitate testing of the components needed to implement filename | ||
1646 | * encryption and (2) to provide a code base from which other | ||
1647 | * developers in the community can easily implement this feature. | ||
1648 | * | ||
1649 | * Returns the length of encoded filename; negative if error | ||
1650 | */ | 1650 | */ |
1651 | int | 1651 | static int |
1652 | ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, | 1652 | ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, |
1653 | const char *name, int length, char **encoded_name) | 1653 | struct ecryptfs_crypt_stat *crypt_stat, |
1654 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat) | ||
1654 | { | 1655 | { |
1655 | int error = 0; | 1656 | int rc = 0; |
1656 | 1657 | ||
1657 | (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); | 1658 | filename->encrypted_filename = NULL; |
1658 | if (!(*encoded_name)) { | 1659 | filename->encrypted_filename_size = 0; |
1659 | error = -ENOMEM; | 1660 | if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) |
1661 | || (mount_crypt_stat && (mount_crypt_stat->flags | ||
1662 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { | ||
1663 | size_t packet_size; | ||
1664 | size_t remaining_bytes; | ||
1665 | |||
1666 | rc = ecryptfs_write_tag_70_packet( | ||
1667 | NULL, NULL, | ||
1668 | &filename->encrypted_filename_size, | ||
1669 | mount_crypt_stat, NULL, | ||
1670 | filename->filename_size); | ||
1671 | if (rc) { | ||
1672 | printk(KERN_ERR "%s: Error attempting to get packet " | ||
1673 | "size for tag 72; rc = [%d]\n", __func__, | ||
1674 | rc); | ||
1675 | filename->encrypted_filename_size = 0; | ||
1676 | goto out; | ||
1677 | } | ||
1678 | filename->encrypted_filename = | ||
1679 | kmalloc(filename->encrypted_filename_size, GFP_KERNEL); | ||
1680 | if (!filename->encrypted_filename) { | ||
1681 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
1682 | "to kmalloc [%zd] bytes\n", __func__, | ||
1683 | filename->encrypted_filename_size); | ||
1684 | rc = -ENOMEM; | ||
1685 | goto out; | ||
1686 | } | ||
1687 | remaining_bytes = filename->encrypted_filename_size; | ||
1688 | rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, | ||
1689 | &remaining_bytes, | ||
1690 | &packet_size, | ||
1691 | mount_crypt_stat, | ||
1692 | filename->filename, | ||
1693 | filename->filename_size); | ||
1694 | if (rc) { | ||
1695 | printk(KERN_ERR "%s: Error attempting to generate " | ||
1696 | "tag 70 packet; rc = [%d]\n", __func__, | ||
1697 | rc); | ||
1698 | kfree(filename->encrypted_filename); | ||
1699 | filename->encrypted_filename = NULL; | ||
1700 | filename->encrypted_filename_size = 0; | ||
1701 | goto out; | ||
1702 | } | ||
1703 | filename->encrypted_filename_size = packet_size; | ||
1704 | } else { | ||
1705 | printk(KERN_ERR "%s: No support for requested filename " | ||
1706 | "encryption method in this release\n", __func__); | ||
1707 | rc = -ENOTSUPP; | ||
1660 | goto out; | 1708 | goto out; |
1661 | } | 1709 | } |
1662 | /* TODO: Filename encryption is a scheduled feature for a | ||
1663 | * future version of eCryptfs. This function is here only for | ||
1664 | * the purpose of providing a framework for other developers | ||
1665 | * to easily implement filename encryption. Hint: Replace this | ||
1666 | * memcpy() with a call to encrypt and encode the | ||
1667 | * filename, the set the length accordingly. */ | ||
1668 | memcpy((void *)(*encoded_name), (void *)name, length); | ||
1669 | (*encoded_name)[length] = '\0'; | ||
1670 | error = length + 1; | ||
1671 | out: | 1710 | out: |
1672 | return error; | 1711 | return rc; |
1673 | } | 1712 | } |
1674 | 1713 | ||
1675 | /** | 1714 | static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, |
1676 | * ecryptfs_decode_filename - converts the cipher text name to plaintext | 1715 | const char *name, size_t name_size) |
1677 | * @crypt_stat: The crypt_stat struct associated with the file | ||
1678 | * @name: The filename in cipher text | ||
1679 | * @length: The length of the cipher text name | ||
1680 | * @decrypted_name: The plaintext name | ||
1681 | * | ||
1682 | * Decodes and decrypts the filename. | ||
1683 | * | ||
1684 | * We assume that we have a properly initialized crypto context, | ||
1685 | * pointed to by crypt_stat->tfm. | ||
1686 | * | ||
1687 | * TODO: Implement filename decoding and decryption here, in place of | ||
1688 | * memcpy. We are keeping the framework around for now to (1) | ||
1689 | * facilitate testing of the components needed to implement filename | ||
1690 | * encryption and (2) to provide a code base from which other | ||
1691 | * developers in the community can easily implement this feature. | ||
1692 | * | ||
1693 | * Returns the length of decoded filename; negative if error | ||
1694 | */ | ||
1695 | int | ||
1696 | ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, | ||
1697 | const char *name, int length, char **decrypted_name) | ||
1698 | { | 1716 | { |
1699 | int error = 0; | 1717 | int rc = 0; |
1700 | 1718 | ||
1701 | (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); | 1719 | (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); |
1702 | if (!(*decrypted_name)) { | 1720 | if (!(*copied_name)) { |
1703 | error = -ENOMEM; | 1721 | rc = -ENOMEM; |
1704 | goto out; | 1722 | goto out; |
1705 | } | 1723 | } |
1706 | /* TODO: Filename encryption is a scheduled feature for a | 1724 | memcpy((void *)(*copied_name), (void *)name, name_size); |
1707 | * future version of eCryptfs. This function is here only for | 1725 | (*copied_name)[(name_size)] = '\0'; /* Only for convenience |
1708 | * the purpose of providing a framework for other developers | ||
1709 | * to easily implement filename encryption. Hint: Replace this | ||
1710 | * memcpy() with a call to decode and decrypt the | ||
1711 | * filename, the set the length accordingly. */ | ||
1712 | memcpy((void *)(*decrypted_name), (void *)name, length); | ||
1713 | (*decrypted_name)[length + 1] = '\0'; /* Only for convenience | ||
1714 | * in printing out the | 1726 | * in printing out the |
1715 | * string in debug | 1727 | * string in debug |
1716 | * messages */ | 1728 | * messages */ |
1717 | error = length; | 1729 | (*copied_name_size) = (name_size + 1); |
1718 | out: | 1730 | out: |
1719 | return error; | 1731 | return rc; |
1720 | } | 1732 | } |
1721 | 1733 | ||
1722 | /** | 1734 | /** |
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, | |||
1740 | *key_tfm = NULL; | 1752 | *key_tfm = NULL; |
1741 | if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { | 1753 | if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { |
1742 | rc = -EINVAL; | 1754 | rc = -EINVAL; |
1743 | printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " | 1755 | printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " |
1744 | "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); | 1756 | "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); |
1745 | goto out; | 1757 | goto out; |
1746 | } | 1758 | } |
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, | |||
1765 | get_random_bytes(dummy_key, *key_size); | 1777 | get_random_bytes(dummy_key, *key_size); |
1766 | rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); | 1778 | rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); |
1767 | if (rc) { | 1779 | if (rc) { |
1768 | printk(KERN_ERR "Error attempting to set key of size [%Zd] for " | 1780 | printk(KERN_ERR "Error attempting to set key of size [%zd] for " |
1769 | "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); | 1781 | "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); |
1770 | rc = -EINVAL; | 1782 | rc = -EINVAL; |
1771 | goto out; | 1783 | goto out; |
@@ -1910,3 +1922,341 @@ out: | |||
1910 | mutex_unlock(&key_tfm_list_mutex); | 1922 | mutex_unlock(&key_tfm_list_mutex); |
1911 | return rc; | 1923 | return rc; |
1912 | } | 1924 | } |
1925 | |||
1926 | /* 64 characters forming a 6-bit target field */ | ||
1927 | static unsigned char *portable_filename_chars = ("-.0123456789ABCD" | ||
1928 | "EFGHIJKLMNOPQRST" | ||
1929 | "UVWXYZabcdefghij" | ||
1930 | "klmnopqrstuvwxyz"); | ||
1931 | |||
1932 | /* We could either apply an offset on every reverse-map lookup or just | ||
1933 | * pad some 0x00's at the front here */ | ||
1934 | static const unsigned char filename_rev_map[] = { | ||
1935 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ | ||
1936 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ | ||
1937 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ | ||
1938 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ | ||
1939 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ | ||
1940 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ | ||
1941 | 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ | ||
1942 | 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */ | ||
1943 | 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */ | ||
1944 | 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */ | ||
1945 | 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */ | ||
1946 | 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ | ||
1947 | 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ | ||
1948 | 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ | ||
1949 | 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ | ||
1950 | 0x3D, 0x3E, 0x3F | ||
1951 | }; | ||
1952 | |||
1953 | /** | ||
1954 | * ecryptfs_encode_for_filename | ||
1955 | * @dst: Destination location for encoded filename | ||
1956 | * @dst_size: Size of the encoded filename in bytes | ||
1957 | * @src: Source location for the filename to encode | ||
1958 | * @src_size: Size of the source in bytes | ||
1959 | */ | ||
1960 | void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, | ||
1961 | unsigned char *src, size_t src_size) | ||
1962 | { | ||
1963 | size_t num_blocks; | ||
1964 | size_t block_num = 0; | ||
1965 | size_t dst_offset = 0; | ||
1966 | unsigned char last_block[3]; | ||
1967 | |||
1968 | if (src_size == 0) { | ||
1969 | (*dst_size) = 0; | ||
1970 | goto out; | ||
1971 | } | ||
1972 | num_blocks = (src_size / 3); | ||
1973 | if ((src_size % 3) == 0) { | ||
1974 | memcpy(last_block, (&src[src_size - 3]), 3); | ||
1975 | } else { | ||
1976 | num_blocks++; | ||
1977 | last_block[2] = 0x00; | ||
1978 | switch (src_size % 3) { | ||
1979 | case 1: | ||
1980 | last_block[0] = src[src_size - 1]; | ||
1981 | last_block[1] = 0x00; | ||
1982 | break; | ||
1983 | case 2: | ||
1984 | last_block[0] = src[src_size - 2]; | ||
1985 | last_block[1] = src[src_size - 1]; | ||
1986 | } | ||
1987 | } | ||
1988 | (*dst_size) = (num_blocks * 4); | ||
1989 | if (!dst) | ||
1990 | goto out; | ||
1991 | while (block_num < num_blocks) { | ||
1992 | unsigned char *src_block; | ||
1993 | unsigned char dst_block[4]; | ||
1994 | |||
1995 | if (block_num == (num_blocks - 1)) | ||
1996 | src_block = last_block; | ||
1997 | else | ||
1998 | src_block = &src[block_num * 3]; | ||
1999 | dst_block[0] = ((src_block[0] >> 2) & 0x3F); | ||
2000 | dst_block[1] = (((src_block[0] << 4) & 0x30) | ||
2001 | | ((src_block[1] >> 4) & 0x0F)); | ||
2002 | dst_block[2] = (((src_block[1] << 2) & 0x3C) | ||
2003 | | ((src_block[2] >> 6) & 0x03)); | ||
2004 | dst_block[3] = (src_block[2] & 0x3F); | ||
2005 | dst[dst_offset++] = portable_filename_chars[dst_block[0]]; | ||
2006 | dst[dst_offset++] = portable_filename_chars[dst_block[1]]; | ||
2007 | dst[dst_offset++] = portable_filename_chars[dst_block[2]]; | ||
2008 | dst[dst_offset++] = portable_filename_chars[dst_block[3]]; | ||
2009 | block_num++; | ||
2010 | } | ||
2011 | out: | ||
2012 | return; | ||
2013 | } | ||
2014 | |||
2015 | /** | ||
2016 | * ecryptfs_decode_from_filename | ||
2017 | * @dst: If NULL, this function only sets @dst_size and returns. If | ||
2018 | * non-NULL, this function decodes the encoded octets in @src | ||
2019 | * into the memory that @dst points to. | ||
2020 | * @dst_size: Set to the size of the decoded string. | ||
2021 | * @src: The encoded set of octets to decode. | ||
2022 | * @src_size: The size of the encoded set of octets to decode. | ||
2023 | */ | ||
2024 | static void | ||
2025 | ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, | ||
2026 | const unsigned char *src, size_t src_size) | ||
2027 | { | ||
2028 | u8 current_bit_offset = 0; | ||
2029 | size_t src_byte_offset = 0; | ||
2030 | size_t dst_byte_offset = 0; | ||
2031 | |||
2032 | if (dst == NULL) { | ||
2033 | /* Not exact; conservatively long. Every block of 4 | ||
2034 | * encoded characters decodes into a block of 3 | ||
2035 | * decoded characters. This segment of code provides | ||
2036 | * the caller with the maximum amount of allocated | ||
2037 | * space that @dst will need to point to in a | ||
2038 | * subsequent call. */ | ||
2039 | (*dst_size) = (((src_size + 1) * 3) / 4); | ||
2040 | goto out; | ||
2041 | } | ||
2042 | while (src_byte_offset < src_size) { | ||
2043 | unsigned char src_byte = | ||
2044 | filename_rev_map[(int)src[src_byte_offset]]; | ||
2045 | |||
2046 | switch (current_bit_offset) { | ||
2047 | case 0: | ||
2048 | dst[dst_byte_offset] = (src_byte << 2); | ||
2049 | current_bit_offset = 6; | ||
2050 | break; | ||
2051 | case 6: | ||
2052 | dst[dst_byte_offset++] |= (src_byte >> 4); | ||
2053 | dst[dst_byte_offset] = ((src_byte & 0xF) | ||
2054 | << 4); | ||
2055 | current_bit_offset = 4; | ||
2056 | break; | ||
2057 | case 4: | ||
2058 | dst[dst_byte_offset++] |= (src_byte >> 2); | ||
2059 | dst[dst_byte_offset] = (src_byte << 6); | ||
2060 | current_bit_offset = 2; | ||
2061 | break; | ||
2062 | case 2: | ||
2063 | dst[dst_byte_offset++] |= (src_byte); | ||
2064 | dst[dst_byte_offset] = 0; | ||
2065 | current_bit_offset = 0; | ||
2066 | break; | ||
2067 | } | ||
2068 | src_byte_offset++; | ||
2069 | } | ||
2070 | (*dst_size) = dst_byte_offset; | ||
2071 | out: | ||
2072 | return; | ||
2073 | } | ||
2074 | |||
2075 | /** | ||
2076 | * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text | ||
2077 | * @crypt_stat: The crypt_stat struct associated with the file name to encode | ||
2078 | * @name: The plaintext name | ||
2079 | * @name_size: The length of the plaintext name | ||
2080 | * @encoded_name: The encrypted and encoded name | ||
2081 | * | ||
2082 | * Encrypts and encodes a filename into something that constitutes a | ||
2083 | * valid filename for a filesystem, with printable characters. | ||
2084 | * | ||
2085 | * We assume that we have a properly initialized crypto context, | ||
2086 | * pointed to by crypt_stat->tfm. | ||
2087 | * | ||
2088 | * Returns zero on success; non-zero otherwise | ||
2089 | */ | ||
2090 | int ecryptfs_encrypt_and_encode_filename( | ||
2091 | char **encoded_name, | ||
2092 | size_t *encoded_name_size, | ||
2093 | struct ecryptfs_crypt_stat *crypt_stat, | ||
2094 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
2095 | const char *name, size_t name_size) | ||
2096 | { | ||
2097 | size_t encoded_name_no_prefix_size; | ||
2098 | int rc = 0; | ||
2099 | |||
2100 | (*encoded_name) = NULL; | ||
2101 | (*encoded_name_size) = 0; | ||
2102 | if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) | ||
2103 | || (mount_crypt_stat && (mount_crypt_stat->flags | ||
2104 | & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { | ||
2105 | struct ecryptfs_filename *filename; | ||
2106 | |||
2107 | filename = kzalloc(sizeof(*filename), GFP_KERNEL); | ||
2108 | if (!filename) { | ||
2109 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
2110 | "to kzalloc [%zd] bytes\n", __func__, | ||
2111 | sizeof(*filename)); | ||
2112 | rc = -ENOMEM; | ||
2113 | goto out; | ||
2114 | } | ||
2115 | filename->filename = (char *)name; | ||
2116 | filename->filename_size = name_size; | ||
2117 | rc = ecryptfs_encrypt_filename(filename, crypt_stat, | ||
2118 | mount_crypt_stat); | ||
2119 | if (rc) { | ||
2120 | printk(KERN_ERR "%s: Error attempting to encrypt " | ||
2121 | "filename; rc = [%d]\n", __func__, rc); | ||
2122 | kfree(filename); | ||
2123 | goto out; | ||
2124 | } | ||
2125 | ecryptfs_encode_for_filename( | ||
2126 | NULL, &encoded_name_no_prefix_size, | ||
2127 | filename->encrypted_filename, | ||
2128 | filename->encrypted_filename_size); | ||
2129 | if ((crypt_stat && (crypt_stat->flags | ||
2130 | & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) | ||
2131 | || (mount_crypt_stat | ||
2132 | && (mount_crypt_stat->flags | ||
2133 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) | ||
2134 | (*encoded_name_size) = | ||
2135 | (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE | ||
2136 | + encoded_name_no_prefix_size); | ||
2137 | else | ||
2138 | (*encoded_name_size) = | ||
2139 | (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE | ||
2140 | + encoded_name_no_prefix_size); | ||
2141 | (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); | ||
2142 | if (!(*encoded_name)) { | ||
2143 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
2144 | "to kzalloc [%zd] bytes\n", __func__, | ||
2145 | (*encoded_name_size)); | ||
2146 | rc = -ENOMEM; | ||
2147 | kfree(filename->encrypted_filename); | ||
2148 | kfree(filename); | ||
2149 | goto out; | ||
2150 | } | ||
2151 | if ((crypt_stat && (crypt_stat->flags | ||
2152 | & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) | ||
2153 | || (mount_crypt_stat | ||
2154 | && (mount_crypt_stat->flags | ||
2155 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { | ||
2156 | memcpy((*encoded_name), | ||
2157 | ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, | ||
2158 | ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); | ||
2159 | ecryptfs_encode_for_filename( | ||
2160 | ((*encoded_name) | ||
2161 | + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE), | ||
2162 | &encoded_name_no_prefix_size, | ||
2163 | filename->encrypted_filename, | ||
2164 | filename->encrypted_filename_size); | ||
2165 | (*encoded_name_size) = | ||
2166 | (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE | ||
2167 | + encoded_name_no_prefix_size); | ||
2168 | (*encoded_name)[(*encoded_name_size)] = '\0'; | ||
2169 | (*encoded_name_size)++; | ||
2170 | } else { | ||
2171 | rc = -ENOTSUPP; | ||
2172 | } | ||
2173 | if (rc) { | ||
2174 | printk(KERN_ERR "%s: Error attempting to encode " | ||
2175 | "encrypted filename; rc = [%d]\n", __func__, | ||
2176 | rc); | ||
2177 | kfree((*encoded_name)); | ||
2178 | (*encoded_name) = NULL; | ||
2179 | (*encoded_name_size) = 0; | ||
2180 | } | ||
2181 | kfree(filename->encrypted_filename); | ||
2182 | kfree(filename); | ||
2183 | } else { | ||
2184 | rc = ecryptfs_copy_filename(encoded_name, | ||
2185 | encoded_name_size, | ||
2186 | name, name_size); | ||
2187 | } | ||
2188 | out: | ||
2189 | return rc; | ||
2190 | } | ||
2191 | |||
2192 | /** | ||
2193 | * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext | ||
2194 | * @plaintext_name: The plaintext name | ||
2195 | * @plaintext_name_size: The plaintext name size | ||
2196 | * @ecryptfs_dir_dentry: eCryptfs directory dentry | ||
2197 | * @name: The filename in cipher text | ||
2198 | * @name_size: The cipher text name size | ||
2199 | * | ||
2200 | * Decrypts and decodes the filename. | ||
2201 | * | ||
2202 | * Returns zero on success; non-zero otherwise | ||
2203 | */ | ||
2204 | int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, | ||
2205 | size_t *plaintext_name_size, | ||
2206 | struct dentry *ecryptfs_dir_dentry, | ||
2207 | const char *name, size_t name_size) | ||
2208 | { | ||
2209 | char *decoded_name; | ||
2210 | size_t decoded_name_size; | ||
2211 | size_t packet_size; | ||
2212 | int rc = 0; | ||
2213 | |||
2214 | if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) | ||
2215 | && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, | ||
2216 | ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { | ||
2217 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | ||
2218 | &ecryptfs_superblock_to_private( | ||
2219 | ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; | ||
2220 | const char *orig_name = name; | ||
2221 | size_t orig_name_size = name_size; | ||
2222 | |||
2223 | name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; | ||
2224 | name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; | ||
2225 | ecryptfs_decode_from_filename(NULL, &decoded_name_size, | ||
2226 | name, name_size); | ||
2227 | decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); | ||
2228 | if (!decoded_name) { | ||
2229 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
2230 | "to kmalloc [%zd] bytes\n", __func__, | ||
2231 | decoded_name_size); | ||
2232 | rc = -ENOMEM; | ||
2233 | goto out; | ||
2234 | } | ||
2235 | ecryptfs_decode_from_filename(decoded_name, &decoded_name_size, | ||
2236 | name, name_size); | ||
2237 | rc = ecryptfs_parse_tag_70_packet(plaintext_name, | ||
2238 | plaintext_name_size, | ||
2239 | &packet_size, | ||
2240 | mount_crypt_stat, | ||
2241 | decoded_name, | ||
2242 | decoded_name_size); | ||
2243 | if (rc) { | ||
2244 | printk(KERN_INFO "%s: Could not parse tag 70 packet " | ||
2245 | "from filename; copying through filename " | ||
2246 | "as-is\n", __func__); | ||
2247 | rc = ecryptfs_copy_filename(plaintext_name, | ||
2248 | plaintext_name_size, | ||
2249 | orig_name, orig_name_size); | ||
2250 | goto out_free; | ||
2251 | } | ||
2252 | } else { | ||
2253 | rc = ecryptfs_copy_filename(plaintext_name, | ||
2254 | plaintext_name_size, | ||
2255 | name, name_size); | ||
2256 | goto out; | ||
2257 | } | ||
2258 | out_free: | ||
2259 | kfree(decoded_name); | ||
2260 | out: | ||
2261 | return rc; | ||
2262 | } | ||
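
ecryptfs_encode_for_filename() above maps each 3 bytes of ciphertext to 4 characters from a 64-character filename-safe alphabet, much like base64 with a different charset. A stand-alone sketch of the forward mapping, with tail-block padding simplified relative to the kernel code:

#include <stdio.h>
#include <string.h>

static const char charset[] =
	"-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz";

/* dst must hold 4 * ceil(n / 3) characters plus a trailing NUL */
static void encode_filename(char *dst, const unsigned char *src, size_t n)
{
	size_t i;

	for (i = 0; i < n; i += 3) {
		unsigned char b[3] = { 0, 0, 0 };

		memcpy(b, src + i, (n - i < 3) ? n - i : 3);
		*dst++ = charset[(b[0] >> 2) & 0x3f];
		*dst++ = charset[((b[0] << 4) & 0x30) | ((b[1] >> 4) & 0x0f)];
		*dst++ = charset[((b[1] << 2) & 0x3c) | ((b[2] >> 6) & 0x03)];
		*dst++ = charset[b[2] & 0x3f];
	}
	*dst = '\0';
}

int main(void)
{
	const unsigned char raw[] = { 0xde, 0xad, 0xbe, 0xef };
	char out[16];

	encode_filename(out, raw, sizeof(raw));
	printf("%s\n", out);	/* 4 input bytes -> 8 output characters */
	return 0;
}

The filename_rev_map table above is the inverse of this mapping: it takes each encoded ASCII character back to its 6-bit value, which ecryptfs_decode_from_filename() then repacks into bytes.
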
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index a75026d35d16..c11fc95714ab 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h | |||
@@ -51,12 +51,16 @@ | |||
51 | #define ECRYPTFS_VERSIONING_XATTR 0x00000010 | 51 | #define ECRYPTFS_VERSIONING_XATTR 0x00000010 |
52 | #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 | 52 | #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 |
53 | #define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 | 53 | #define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 |
54 | #define ECRYPTFS_VERSIONING_HMAC 0x00000080 | ||
55 | #define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 | ||
56 | #define ECRYPTFS_VERSIONING_GCM 0x00000200 | ||
54 | #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | 57 | #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ |
55 | | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | 58 | | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ |
56 | | ECRYPTFS_VERSIONING_PUBKEY \ | 59 | | ECRYPTFS_VERSIONING_PUBKEY \ |
57 | | ECRYPTFS_VERSIONING_XATTR \ | 60 | | ECRYPTFS_VERSIONING_XATTR \ |
58 | | ECRYPTFS_VERSIONING_MULTKEY \ | 61 | | ECRYPTFS_VERSIONING_MULTKEY \ |
59 | | ECRYPTFS_VERSIONING_DEVMISC) | 62 | | ECRYPTFS_VERSIONING_DEVMISC \ |
63 | | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) | ||
60 | #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 | 64 | #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 |
61 | #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH | 65 | #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH |
62 | #define ECRYPTFS_SALT_SIZE 8 | 66 | #define ECRYPTFS_SALT_SIZE 8 |
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key) | |||
199 | #define ECRYPTFS_DEFAULT_CIPHER "aes" | 203 | #define ECRYPTFS_DEFAULT_CIPHER "aes" |
200 | #define ECRYPTFS_DEFAULT_KEY_BYTES 16 | 204 | #define ECRYPTFS_DEFAULT_KEY_BYTES 16 |
201 | #define ECRYPTFS_DEFAULT_HASH "md5" | 205 | #define ECRYPTFS_DEFAULT_HASH "md5" |
206 | #define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH | ||
202 | #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 | 207 | #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 |
203 | #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C | 208 | #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C |
204 | #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED | 209 | #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED |
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key) | |||
206 | #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 | 211 | #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 |
207 | #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 | 212 | #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 |
208 | #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 | 213 | #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 |
214 | #define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename | ||
215 | * as dentry name */ | ||
216 | #define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in | ||
217 | * metadata */ | ||
218 | #define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as | ||
219 | * dentry name */ | ||
220 | #define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as | ||
221 | * metadata */ | ||
222 | /* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= | ||
223 | * ECRYPTFS_MAX_IV_BYTES */ | ||
224 | #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 | ||
225 | #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ | ||
209 | #define MD5_DIGEST_SIZE 16 | 226 | #define MD5_DIGEST_SIZE 16 |
227 | #define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE | ||
228 | #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." | ||
229 | #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 | ||
230 | #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." | ||
231 | #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 | ||
232 | #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) | ||
210 | 233 | ||
211 | struct ecryptfs_key_sig { | 234 | struct ecryptfs_key_sig { |
212 | struct list_head crypt_stat_list; | 235 | struct list_head crypt_stat_list; |
213 | char keysig[ECRYPTFS_SIG_SIZE_HEX]; | 236 | char keysig[ECRYPTFS_SIG_SIZE_HEX]; |
214 | }; | 237 | }; |
215 | 238 | ||
239 | struct ecryptfs_filename { | ||
240 | struct list_head crypt_stat_list; | ||
241 | #define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 | ||
242 | u32 flags; | ||
243 | u32 seq_no; | ||
244 | char *filename; | ||
245 | char *encrypted_filename; | ||
246 | size_t filename_size; | ||
247 | size_t encrypted_filename_size; | ||
248 | char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; | ||
249 | char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; | ||
250 | }; | ||
251 | |||
216 | /** | 252 | /** |
217 | * This is the primary struct associated with each encrypted file. | 253 | * This is the primary struct associated with each encrypted file. |
218 | * | 254 | * |
219 | * TODO: cache align/pack? | 255 | * TODO: cache align/pack? |
220 | */ | 256 | */ |
221 | struct ecryptfs_crypt_stat { | 257 | struct ecryptfs_crypt_stat { |
222 | #define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 | 258 | #define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 |
223 | #define ECRYPTFS_POLICY_APPLIED 0x00000002 | 259 | #define ECRYPTFS_POLICY_APPLIED 0x00000002 |
224 | #define ECRYPTFS_NEW_FILE 0x00000004 | 260 | #define ECRYPTFS_NEW_FILE 0x00000004 |
225 | #define ECRYPTFS_ENCRYPTED 0x00000008 | 261 | #define ECRYPTFS_ENCRYPTED 0x00000008 |
226 | #define ECRYPTFS_SECURITY_WARNING 0x00000010 | 262 | #define ECRYPTFS_SECURITY_WARNING 0x00000010 |
227 | #define ECRYPTFS_ENABLE_HMAC 0x00000020 | 263 | #define ECRYPTFS_ENABLE_HMAC 0x00000020 |
228 | #define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 | 264 | #define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 |
229 | #define ECRYPTFS_KEY_VALID 0x00000080 | 265 | #define ECRYPTFS_KEY_VALID 0x00000080 |
230 | #define ECRYPTFS_METADATA_IN_XATTR 0x00000100 | 266 | #define ECRYPTFS_METADATA_IN_XATTR 0x00000100 |
231 | #define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 | 267 | #define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 |
232 | #define ECRYPTFS_KEY_SET 0x00000400 | 268 | #define ECRYPTFS_KEY_SET 0x00000400 |
269 | #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 | ||
270 | #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 | ||
271 | #define ECRYPTFS_ENCFN_USE_FEK 0x00002000 | ||
233 | u32 flags; | 272 | u32 flags; |
234 | unsigned int file_version; | 273 | unsigned int file_version; |
235 | size_t iv_bytes; | 274 | size_t iv_bytes; |
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat { | |||
332 | #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 | 371 | #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 |
333 | #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 | 372 | #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 |
334 | #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 | 373 | #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 |
374 | #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 | ||
375 | #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 | ||
376 | #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 | ||
335 | u32 flags; | 377 | u32 flags; |
336 | struct list_head global_auth_tok_list; | 378 | struct list_head global_auth_tok_list; |
337 | struct mutex global_auth_tok_list_mutex; | 379 | struct mutex global_auth_tok_list_mutex; |
338 | size_t num_global_auth_toks; | 380 | size_t num_global_auth_toks; |
339 | size_t global_default_cipher_key_size; | 381 | size_t global_default_cipher_key_size; |
382 | size_t global_default_fn_cipher_key_bytes; | ||
340 | unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE | 383 | unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE |
341 | + 1]; | 384 | + 1]; |
385 | unsigned char global_default_fn_cipher_name[ | ||
386 | ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; | ||
387 | char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; | ||
342 | }; | 388 | }; |
343 | 389 | ||
344 | /* superblock private data. */ | 390 | /* superblock private data. */ |
@@ -571,13 +617,22 @@ struct ecryptfs_open_req { | |||
571 | int ecryptfs_interpose(struct dentry *hidden_dentry, | 617 | int ecryptfs_interpose(struct dentry *hidden_dentry, |
572 | struct dentry *this_dentry, struct super_block *sb, | 618 | struct dentry *this_dentry, struct super_block *sb, |
573 | u32 flags); | 619 | u32 flags); |
620 | int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, | ||
621 | struct dentry *lower_dentry, | ||
622 | struct ecryptfs_crypt_stat *crypt_stat, | ||
623 | struct inode *ecryptfs_dir_inode, | ||
624 | struct nameidata *ecryptfs_nd); | ||
625 | int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, | ||
626 | size_t *decrypted_name_size, | ||
627 | struct dentry *ecryptfs_dentry, | ||
628 | const char *name, size_t name_size); | ||
574 | int ecryptfs_fill_zeros(struct file *file, loff_t new_length); | 629 | int ecryptfs_fill_zeros(struct file *file, loff_t new_length); |
575 | int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, | 630 | int ecryptfs_encrypt_and_encode_filename( |
576 | const char *name, int length, | 631 | char **encoded_name, |
577 | char **decrypted_name); | 632 | size_t *encoded_name_size, |
578 | int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, | 633 | struct ecryptfs_crypt_stat *crypt_stat, |
579 | const char *name, int length, | 634 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, |
580 | char **encoded_name); | 635 | const char *name, size_t name_size); |
581 | struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); | 636 | struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); |
582 | void ecryptfs_dump_hex(char *data, int bytes); | 637 | void ecryptfs_dump_hex(char *data, int bytes); |
583 | int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, | 638 | int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, |
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data, | |||
599 | struct inode *ecryptfs_inode); | 654 | struct inode *ecryptfs_inode); |
600 | int ecryptfs_read_and_validate_xattr_region(char *page_virt, | 655 | int ecryptfs_read_and_validate_xattr_region(char *page_virt, |
601 | struct dentry *ecryptfs_dentry); | 656 | struct dentry *ecryptfs_dentry); |
602 | u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); | 657 | u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); |
603 | int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); | 658 | int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); |
604 | void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); | 659 | void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); |
605 | int ecryptfs_generate_key_packet_set(char *dest_base, | 660 | int ecryptfs_generate_key_packet_set(char *dest_base, |
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file, | |||
694 | struct vfsmount *lower_mnt, | 749 | struct vfsmount *lower_mnt, |
695 | const struct cred *cred); | 750 | const struct cred *cred); |
696 | int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); | 751 | int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); |
752 | int | ||
753 | ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, | ||
754 | size_t *packet_size, | ||
755 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
756 | char *filename, size_t filename_size); | ||
757 | int | ||
758 | ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, | ||
759 | size_t *packet_size, | ||
760 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
761 | char *data, size_t max_packet_size); | ||
762 | int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, | ||
763 | loff_t offset); | ||
697 | 764 | ||
698 | #endif /* #ifndef ECRYPTFS_KERNEL_H */ | 765 | #endif /* #ifndef ECRYPTFS_KERNEL_H */ |
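The new ECRYPTFS_ENCRYPT_FILENAMES / ECRYPTFS_ENCFN_* inode flags and their ECRYPTFS_GLOBAL_* mount-wide counterparts are what the lookup and readdir paths below consult to decide whether a name needs the tag 70 treatment. A minimal sketch of that decision, assuming only the flag definitions above (the helper itself is hypothetical; it mirrors the test added to ecryptfs_lookup() in inode.c):

/* Sketch only -- not part of the patch. Filename encryption applies
 * when either the inode's crypt_stat or the mount-wide context asks
 * for it; a negative dentry has no crypt_stat yet, so the mount-wide
 * flags must be able to stand on their own. */
static inline int ecryptfs_filenames_encrypted(
	struct ecryptfs_crypt_stat *crypt_stat,
	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
	if (crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
		return 1;
	if (mount_crypt_stat
	    && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))
		return 1;
	return 0;
}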
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 713834371229..9e944057001b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback { | |||
77 | 77 | ||
78 | /* Inspired by generic filldir in fs/readdir.c */ | 78 | /* Inspired by generic filldir in fs/readdir.c */ |
79 | static int | 79 | static int |
80 | ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, | 80 | ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, |
81 | u64 ino, unsigned int d_type) | 81 | loff_t offset, u64 ino, unsigned int d_type) |
82 | { | 82 | { |
83 | struct ecryptfs_crypt_stat *crypt_stat; | ||
84 | struct ecryptfs_getdents_callback *buf = | 83 | struct ecryptfs_getdents_callback *buf = |
85 | (struct ecryptfs_getdents_callback *)dirent; | 84 | (struct ecryptfs_getdents_callback *)dirent; |
85 | size_t name_size; | ||
86 | char *name; | ||
86 | int rc; | 87 | int rc; |
87 | int decoded_length; | ||
88 | char *decoded_name; | ||
89 | 88 | ||
90 | crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat; | ||
91 | buf->filldir_called++; | 89 | buf->filldir_called++; |
92 | decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, | 90 | rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, |
93 | &decoded_name); | 91 | buf->dentry, lower_name, |
94 | if (decoded_length < 0) { | 92 | lower_namelen); |
95 | rc = decoded_length; | 93 | if (rc) { |
94 | printk(KERN_ERR "%s: Error attempting to decode and decrypt " | ||
95 | "filename [%s]; rc = [%d]\n", __func__, lower_name, | ||
96 | rc); | ||
96 | goto out; | 97 | goto out; |
97 | } | 98 | } |
98 | rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, | 99 | rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); |
99 | ino, d_type); | 100 | kfree(name); |
100 | kfree(decoded_name); | ||
101 | if (rc >= 0) | 101 | if (rc >= 0) |
102 | buf->entries_written++; | 102 | buf->entries_written++; |
103 | out: | 103 | out: |
@@ -106,8 +106,8 @@ out: | |||
106 | 106 | ||
107 | /** | 107 | /** |
108 | * ecryptfs_readdir | 108 | * ecryptfs_readdir |
109 | * @file: The ecryptfs file struct | 109 | * @file: The eCryptfs directory file |
110 | * @dirent: Directory entry | 110 | * @dirent: Directory entry handle |
111 | * @filldir: The filldir callback function | 111 | * @filldir: The filldir callback function |
112 | */ | 112 | */ |
113 | static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) | 113 | static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) |
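ecryptfs_filldir() is a classic filldir adapter: ecryptfs_readdir() hands it to vfs_readdir() on the lower file, and each lower (encoded) name is translated before being forwarded to the real callback. A condensed sketch of the caller, with offset bookkeeping and error propagation omitted, to show how the ecryptfs_getdents_callback fields are wired:

static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
{
	struct ecryptfs_getdents_callback buf;
	struct file *lower_file = ecryptfs_file_to_lower(file);
	int rc;

	memset(&buf, 0, sizeof(buf));
	buf.dirent = dirent;              /* opaque handle for real filldir */
	buf.dentry = file->f_path.dentry; /* lets the adapter reach crypt state */
	buf.filldir = filldir;            /* invoked after name translation */
	rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
	/* ... f_pos and error handling elided ... */
	return rc;
}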
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 0111906a8877..5697899a168d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c | |||
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir) | |||
52 | /** | 52 | /** |
53 | * ecryptfs_create_underlying_file | 53 | * ecryptfs_create_underlying_file |
54 | * @lower_dir_inode: inode of the parent in the lower fs of the new file | 54 | * @lower_dir_inode: inode of the parent in the lower fs of the new file |
55 | * @lower_dentry: New file's dentry in the lower fs | 55 | * @dentry: New file's dentry |
56 | * @ecryptfs_dentry: New file's dentry in ecryptfs | ||
57 | * @mode: The mode of the new file | 56 | * @mode: The mode of the new file |
58 | * @nd: nameidata of ecryptfs' parent's dentry & vfsmount | 57 | * @nd: nameidata of ecryptfs' parent's dentry & vfsmount |
59 | * | 58 | * |
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, | |||
228 | { | 227 | { |
229 | int rc; | 228 | int rc; |
230 | 229 | ||
231 | /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens | 230 | /* ecryptfs_do_create() calls ecryptfs_interpose() */ |
232 | * the crypt_stat->lower_file (persistent file) */ | ||
233 | rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); | 231 | rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); |
234 | if (unlikely(rc)) { | 232 | if (unlikely(rc)) { |
235 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" | 233 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" |
@@ -244,141 +242,91 @@ out: | |||
244 | } | 242 | } |
245 | 243 | ||
246 | /** | 244 | /** |
247 | * ecryptfs_lookup | 245 | * ecryptfs_lookup_and_interpose_lower - Perform a lookup |
248 | * @dir: inode | ||
249 | * @dentry: The dentry | ||
250 | * @nd: nameidata, may be NULL | ||
251 | * | ||
252 | * Find a file on disk. If the file does not exist, then we'll add it to the | ||
253 | * dentry cache and continue on to read it from the disk. | ||
254 | */ | 246 | */ |
255 | static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, | 247 | int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, |
256 | struct nameidata *nd) | 248 | struct dentry *lower_dentry, |
249 | struct ecryptfs_crypt_stat *crypt_stat, | ||
250 | struct inode *ecryptfs_dir_inode, | ||
251 | struct nameidata *ecryptfs_nd) | ||
257 | { | 252 | { |
258 | int rc = 0; | ||
259 | struct dentry *lower_dir_dentry; | 253 | struct dentry *lower_dir_dentry; |
260 | struct dentry *lower_dentry; | ||
261 | struct vfsmount *lower_mnt; | 254 | struct vfsmount *lower_mnt; |
262 | char *encoded_name; | 255 | struct inode *lower_inode; |
263 | int encoded_namelen; | ||
264 | struct ecryptfs_crypt_stat *crypt_stat = NULL; | ||
265 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat; | 256 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat; |
266 | char *page_virt = NULL; | 257 | char *page_virt = NULL; |
267 | struct inode *lower_inode; | ||
268 | u64 file_size; | 258 | u64 file_size; |
259 | int rc = 0; | ||
269 | 260 | ||
270 | lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); | 261 | lower_dir_dentry = lower_dentry->d_parent; |
271 | dentry->d_op = &ecryptfs_dops; | 262 | lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( |
272 | if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) | 263 | ecryptfs_dentry->d_parent)); |
273 | || (dentry->d_name.len == 2 | ||
274 | && !strcmp(dentry->d_name.name, ".."))) { | ||
275 | d_drop(dentry); | ||
276 | goto out; | ||
277 | } | ||
278 | encoded_namelen = ecryptfs_encode_filename(crypt_stat, | ||
279 | dentry->d_name.name, | ||
280 | dentry->d_name.len, | ||
281 | &encoded_name); | ||
282 | if (encoded_namelen < 0) { | ||
283 | rc = encoded_namelen; | ||
284 | d_drop(dentry); | ||
285 | goto out; | ||
286 | } | ||
287 | ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen " | ||
288 | "= [%d]\n", encoded_name, encoded_namelen); | ||
289 | lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry, | ||
290 | encoded_namelen - 1); | ||
291 | kfree(encoded_name); | ||
292 | if (IS_ERR(lower_dentry)) { | ||
293 | ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n"); | ||
294 | rc = PTR_ERR(lower_dentry); | ||
295 | d_drop(dentry); | ||
296 | goto out; | ||
297 | } | ||
298 | lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); | ||
299 | ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->" | ||
300 | "d_name.name = [%s]\n", lower_dentry, | ||
301 | lower_dentry->d_name.name); | ||
302 | lower_inode = lower_dentry->d_inode; | 264 | lower_inode = lower_dentry->d_inode; |
303 | fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); | 265 | fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); |
304 | BUG_ON(!atomic_read(&lower_dentry->d_count)); | 266 | BUG_ON(!atomic_read(&lower_dentry->d_count)); |
305 | ecryptfs_set_dentry_private(dentry, | 267 | ecryptfs_set_dentry_private(ecryptfs_dentry, |
306 | kmem_cache_alloc(ecryptfs_dentry_info_cache, | 268 | kmem_cache_alloc(ecryptfs_dentry_info_cache, |
307 | GFP_KERNEL)); | 269 | GFP_KERNEL)); |
308 | if (!ecryptfs_dentry_to_private(dentry)) { | 270 | if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { |
309 | rc = -ENOMEM; | 271 | rc = -ENOMEM; |
310 | ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " | 272 | printk(KERN_ERR "%s: Out of memory whilst attempting " |
311 | "to allocate ecryptfs_dentry_info struct\n"); | 273 | "to allocate ecryptfs_dentry_info struct\n", |
274 | __func__); | ||
312 | goto out_dput; | 275 | goto out_dput; |
313 | } | 276 | } |
314 | ecryptfs_set_dentry_lower(dentry, lower_dentry); | 277 | ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); |
315 | ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); | 278 | ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); |
316 | if (!lower_dentry->d_inode) { | 279 | if (!lower_dentry->d_inode) { |
317 | /* We want to add because we couldn't find in lower */ | 280 | /* We want to add because we couldn't find in lower */ |
318 | d_add(dentry, NULL); | 281 | d_add(ecryptfs_dentry, NULL); |
319 | goto out; | 282 | goto out; |
320 | } | 283 | } |
321 | rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, | 284 | rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, |
322 | ECRYPTFS_INTERPOSE_FLAG_D_ADD); | 285 | ecryptfs_dir_inode->i_sb, 1); |
323 | if (rc) { | 286 | if (rc) { |
324 | ecryptfs_printk(KERN_ERR, "Error interposing\n"); | 287 | printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", |
288 | __func__, rc); | ||
325 | goto out; | 289 | goto out; |
326 | } | 290 | } |
327 | if (S_ISDIR(lower_inode->i_mode)) { | 291 | if (S_ISDIR(lower_inode->i_mode)) |
328 | ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); | ||
329 | goto out; | 292 | goto out; |
330 | } | 293 | if (S_ISLNK(lower_inode->i_mode)) |
331 | if (S_ISLNK(lower_inode->i_mode)) { | ||
332 | ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n"); | ||
333 | goto out; | 294 | goto out; |
334 | } | 295 | if (special_file(lower_inode->i_mode)) |
335 | if (special_file(lower_inode->i_mode)) { | ||
336 | ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n"); | ||
337 | goto out; | 296 | goto out; |
338 | } | 297 | if (!ecryptfs_nd) |
339 | if (!nd) { | ||
340 | ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave" | ||
341 | "as we *think* we are about to unlink\n"); | ||
342 | goto out; | 298 | goto out; |
343 | } | ||
344 | /* Released in this function */ | 299 | /* Released in this function */ |
345 | page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, | 300 | page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); |
346 | GFP_USER); | ||
347 | if (!page_virt) { | 301 | if (!page_virt) { |
302 | printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n", | ||
303 | __func__); | ||
348 | rc = -ENOMEM; | 304 | rc = -ENOMEM; |
349 | ecryptfs_printk(KERN_ERR, | ||
350 | "Cannot ecryptfs_kmalloc a page\n"); | ||
351 | goto out; | 305 | goto out; |
352 | } | 306 | } |
353 | crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; | 307 | if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { |
354 | if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) | 308 | rc = ecryptfs_init_persistent_file(ecryptfs_dentry); |
355 | ecryptfs_set_default_sizes(crypt_stat); | ||
356 | if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { | ||
357 | rc = ecryptfs_init_persistent_file(dentry); | ||
358 | if (rc) { | 309 | if (rc) { |
359 | printk(KERN_ERR "%s: Error attempting to initialize " | 310 | printk(KERN_ERR "%s: Error attempting to initialize " |
360 | "the persistent file for the dentry with name " | 311 | "the persistent file for the dentry with name " |
361 | "[%s]; rc = [%d]\n", __func__, | 312 | "[%s]; rc = [%d]\n", __func__, |
362 | dentry->d_name.name, rc); | 313 | ecryptfs_dentry->d_name.name, rc); |
363 | goto out; | 314 | goto out_free_kmem; |
364 | } | 315 | } |
365 | } | 316 | } |
366 | rc = ecryptfs_read_and_validate_header_region(page_virt, | 317 | rc = ecryptfs_read_and_validate_header_region(page_virt, |
367 | dentry->d_inode); | 318 | ecryptfs_dentry->d_inode); |
368 | if (rc) { | 319 | if (rc) { |
369 | rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); | 320 | rc = ecryptfs_read_and_validate_xattr_region(page_virt, |
321 | ecryptfs_dentry); | ||
370 | if (rc) { | 322 | if (rc) { |
371 | printk(KERN_DEBUG "Valid metadata not found in header " | ||
372 | "region or xattr region; treating file as " | ||
373 | "unencrypted\n"); | ||
374 | rc = 0; | 323 | rc = 0; |
375 | kmem_cache_free(ecryptfs_header_cache_2, page_virt); | 324 | goto out_free_kmem; |
376 | goto out; | ||
377 | } | 325 | } |
378 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; | 326 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; |
379 | } | 327 | } |
380 | mount_crypt_stat = &ecryptfs_superblock_to_private( | 328 | mount_crypt_stat = &ecryptfs_superblock_to_private( |
381 | dentry->d_sb)->mount_crypt_stat; | 329 | ecryptfs_dentry->d_sb)->mount_crypt_stat; |
382 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { | 330 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { |
383 | if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) | 331 | if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) |
384 | file_size = (crypt_stat->num_header_bytes_at_front | 332 | file_size = (crypt_stat->num_header_bytes_at_front |
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, | |||
388 | } else { | 336 | } else { |
389 | file_size = get_unaligned_be64(page_virt); | 337 | file_size = get_unaligned_be64(page_virt); |
390 | } | 338 | } |
391 | i_size_write(dentry->d_inode, (loff_t)file_size); | 339 | i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size); |
340 | out_free_kmem: | ||
392 | kmem_cache_free(ecryptfs_header_cache_2, page_virt); | 341 | kmem_cache_free(ecryptfs_header_cache_2, page_virt); |
393 | goto out; | 342 | goto out; |
394 | |||
395 | out_dput: | 343 | out_dput: |
396 | dput(lower_dentry); | 344 | dput(lower_dentry); |
397 | d_drop(dentry); | 345 | d_drop(ecryptfs_dentry); |
346 | out: | ||
347 | return rc; | ||
348 | } | ||
349 | |||
350 | /** | ||
351 | * ecryptfs_lookup | ||
352 | * @ecryptfs_dir_inode: The eCryptfs directory inode | ||
353 | * @ecryptfs_dentry: The eCryptfs dentry that we are looking up | ||
354 | * @ecryptfs_nd: nameidata; may be NULL | ||
355 | * | ||
356 | * Look up a name in the lower filesystem. If nothing is found there, a | ||
357 | * negative dentry is added; otherwise the lower inode is interposed. | ||
358 | */ | ||
359 | static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, | ||
360 | struct dentry *ecryptfs_dentry, | ||
361 | struct nameidata *ecryptfs_nd) | ||
362 | { | ||
363 | char *encrypted_and_encoded_name = NULL; | ||
364 | size_t encrypted_and_encoded_name_size; | ||
365 | struct ecryptfs_crypt_stat *crypt_stat = NULL; | ||
366 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; | ||
367 | struct ecryptfs_inode_info *inode_info; | ||
368 | struct dentry *lower_dir_dentry, *lower_dentry; | ||
369 | int rc = 0; | ||
370 | |||
371 | ecryptfs_dentry->d_op = &ecryptfs_dops; | ||
372 | if ((ecryptfs_dentry->d_name.len == 1 | ||
373 | && !strcmp(ecryptfs_dentry->d_name.name, ".")) | ||
374 | || (ecryptfs_dentry->d_name.len == 2 | ||
375 | && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { | ||
376 | goto out_d_drop; | ||
377 | } | ||
378 | lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); | ||
379 | lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, | ||
380 | lower_dir_dentry, | ||
381 | ecryptfs_dentry->d_name.len); | ||
382 | if (IS_ERR(lower_dentry)) { | ||
383 | rc = PTR_ERR(lower_dentry); | ||
384 | printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " | ||
385 | "lower_dentry = [%s]\n", __func__, rc, | ||
386 | ecryptfs_dentry->d_name.name); | ||
387 | goto out_d_drop; | ||
388 | } | ||
389 | if (lower_dentry->d_inode) | ||
390 | goto lookup_and_interpose; | ||
391 | inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); | ||
392 | if (inode_info) { | ||
393 | crypt_stat = &inode_info->crypt_stat; | ||
394 | /* TODO: lock for crypt_stat comparison */ | ||
395 | if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) | ||
396 | ecryptfs_set_default_sizes(crypt_stat); | ||
397 | } | ||
398 | if (crypt_stat) | ||
399 | mount_crypt_stat = crypt_stat->mount_crypt_stat; | ||
400 | else | ||
401 | mount_crypt_stat = &ecryptfs_superblock_to_private( | ||
402 | ecryptfs_dentry->d_sb)->mount_crypt_stat; | ||
403 | if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) | ||
404 | && !(mount_crypt_stat && (mount_crypt_stat->flags | ||
405 | & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) | ||
406 | goto lookup_and_interpose; | ||
407 | dput(lower_dentry); | ||
408 | rc = ecryptfs_encrypt_and_encode_filename( | ||
409 | &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, | ||
410 | crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, | ||
411 | ecryptfs_dentry->d_name.len); | ||
412 | if (rc) { | ||
413 | printk(KERN_ERR "%s: Error attempting to encrypt and encode " | ||
414 | "filename; rc = [%d]\n", __func__, rc); | ||
415 | goto out_d_drop; | ||
416 | } | ||
417 | lower_dentry = lookup_one_len(encrypted_and_encoded_name, | ||
418 | lower_dir_dentry, | ||
419 | encrypted_and_encoded_name_size - 1); | ||
420 | if (IS_ERR(lower_dentry)) { | ||
421 | rc = PTR_ERR(lower_dentry); | ||
422 | printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " | ||
423 | "lower_dentry = [%s]\n", __func__, rc, | ||
424 | encrypted_and_encoded_name); | ||
425 | goto out_d_drop; | ||
426 | } | ||
427 | lookup_and_interpose: | ||
428 | rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, | ||
429 | crypt_stat, ecryptfs_dir_inode, | ||
430 | ecryptfs_nd); | ||
431 | goto out; | ||
432 | out_d_drop: | ||
433 | d_drop(ecryptfs_dentry); | ||
398 | out: | 434 | out: |
435 | kfree(encrypted_and_encoded_name); | ||
399 | return ERR_PTR(rc); | 436 | return ERR_PTR(rc); |
400 | } | 437 | } |
401 | 438 | ||
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, | |||
466 | struct dentry *lower_dentry; | 503 | struct dentry *lower_dentry; |
467 | struct dentry *lower_dir_dentry; | 504 | struct dentry *lower_dir_dentry; |
468 | char *encoded_symname; | 505 | char *encoded_symname; |
469 | int encoded_symlen; | 506 | size_t encoded_symlen; |
470 | struct ecryptfs_crypt_stat *crypt_stat = NULL; | 507 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; |
471 | 508 | ||
472 | lower_dentry = ecryptfs_dentry_to_lower(dentry); | 509 | lower_dentry = ecryptfs_dentry_to_lower(dentry); |
473 | dget(lower_dentry); | 510 | dget(lower_dentry); |
474 | lower_dir_dentry = lock_parent(lower_dentry); | 511 | lower_dir_dentry = lock_parent(lower_dentry); |
475 | encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, | 512 | mount_crypt_stat = &ecryptfs_superblock_to_private( |
476 | strlen(symname), | 513 | dir->i_sb)->mount_crypt_stat; |
477 | &encoded_symname); | 514 | rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, |
478 | if (encoded_symlen < 0) { | 515 | &encoded_symlen, |
479 | rc = encoded_symlen; | 516 | NULL, |
517 | mount_crypt_stat, symname, | ||
518 | strlen(symname)); | ||
519 | if (rc) | ||
480 | goto out_lock; | 520 | goto out_lock; |
481 | } | ||
482 | rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, | 521 | rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, |
483 | encoded_symname); | 522 | encoded_symname); |
484 | kfree(encoded_symname); | 523 | kfree(encoded_symname); |
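Symlink targets get the same treatment as names, with one wrinkle: at creation time the eCryptfs inode does not exist yet, so there is no per-inode crypt_stat and the mount-wide context alone decides whether the target is encrypted. The call from the hunk above, annotated:

rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
					  &encoded_symlen,
					  NULL, /* no inode, no crypt_stat */
					  mount_crypt_stat,
					  symname, strlen(symname));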
@@ -602,52 +641,54 @@ out_lock: | |||
602 | } | 641 | } |
603 | 642 | ||
604 | static int | 643 | static int |
605 | ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) | 644 | ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) |
606 | { | 645 | { |
607 | int rc; | ||
608 | struct dentry *lower_dentry; | ||
609 | char *decoded_name; | ||
610 | char *lower_buf; | 646 | char *lower_buf; |
611 | mm_segment_t old_fs; | 647 | struct dentry *lower_dentry; |
612 | struct ecryptfs_crypt_stat *crypt_stat; | 648 | struct ecryptfs_crypt_stat *crypt_stat; |
649 | char *plaintext_name; | ||
650 | size_t plaintext_name_size; | ||
651 | mm_segment_t old_fs; | ||
652 | int rc; | ||
613 | 653 | ||
614 | lower_dentry = ecryptfs_dentry_to_lower(dentry); | 654 | lower_dentry = ecryptfs_dentry_to_lower(dentry); |
615 | if (!lower_dentry->d_inode->i_op->readlink) { | 655 | if (!lower_dentry->d_inode->i_op->readlink) { |
616 | rc = -EINVAL; | 656 | rc = -EINVAL; |
617 | goto out; | 657 | goto out; |
618 | } | 658 | } |
659 | crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; | ||
619 | /* Released in this function */ | 660 | /* Released in this function */ |
620 | lower_buf = kmalloc(bufsiz, GFP_KERNEL); | 661 | lower_buf = kmalloc(bufsiz, GFP_KERNEL); |
621 | if (lower_buf == NULL) { | 662 | if (lower_buf == NULL) { |
622 | ecryptfs_printk(KERN_ERR, "Out of memory\n"); | 663 | printk(KERN_ERR "%s: Out of memory whilst attempting to " |
664 | "kmalloc [%d] bytes\n", __func__, bufsiz); | ||
623 | rc = -ENOMEM; | 665 | rc = -ENOMEM; |
624 | goto out; | 666 | goto out; |
625 | } | 667 | } |
626 | old_fs = get_fs(); | 668 | old_fs = get_fs(); |
627 | set_fs(get_ds()); | 669 | set_fs(get_ds()); |
628 | ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " | ||
629 | "lower_dentry->d_name.name = [%s]\n", | ||
630 | lower_dentry->d_name.name); | ||
631 | rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, | 670 | rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, |
632 | (char __user *)lower_buf, | 671 | (char __user *)lower_buf, |
633 | bufsiz); | 672 | bufsiz); |
634 | set_fs(old_fs); | 673 | set_fs(old_fs); |
635 | if (rc >= 0) { | 674 | if (rc >= 0) { |
636 | crypt_stat = NULL; | 675 | rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, |
637 | rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, | 676 | &plaintext_name_size, |
638 | &decoded_name); | 677 | dentry, lower_buf, |
639 | if (rc == -ENOMEM) | 678 | rc); |
679 | if (rc) { | ||
680 | printk(KERN_ERR "%s: Error attempting to decode and " | ||
681 | "decrypt filename; rc = [%d]\n", __func__, | ||
682 | rc); | ||
640 | goto out_free_lower_buf; | 683 | goto out_free_lower_buf; |
641 | if (rc > 0) { | ||
642 | ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes " | ||
643 | "to userspace: [%*s]\n", rc, | ||
644 | decoded_name); | ||
645 | if (copy_to_user(buf, decoded_name, rc)) | ||
646 | rc = -EFAULT; | ||
647 | } | 684 | } |
648 | kfree(decoded_name); | 685 | rc = copy_to_user(buf, plaintext_name, plaintext_name_size); |
649 | fsstack_copy_attr_atime(dentry->d_inode, | 686 | if (rc) |
650 | lower_dentry->d_inode); | 687 | rc = -EFAULT; |
688 | else | ||
689 | rc = plaintext_name_size; | ||
690 | kfree(plaintext_name); | ||
691 | fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); | ||
651 | } | 692 | } |
652 | out_free_lower_buf: | 693 | out_free_lower_buf: |
653 | kfree(lower_buf); | 694 | kfree(lower_buf); |
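The get_fs()/set_fs(get_ds()) pair around the lower ->readlink() call is the standard trick for invoking an operation that expects a __user pointer with a kernel buffer; the decrypted name, not the lower one, is what finally reaches userspace via copy_to_user(). The pattern in isolation:

mm_segment_t old_fs = get_fs();

set_fs(get_ds());	/* temporarily treat kernel addresses as "user" */
rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
					   (char __user *)lower_buf,
					   bufsiz);
set_fs(old_fs);		/* must be restored on every path */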
@@ -669,8 +710,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
669 | } | 710 | } |
670 | old_fs = get_fs(); | 711 | old_fs = get_fs(); |
671 | set_fs(get_ds()); | 712 | set_fs(get_ds()); |
672 | ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " | ||
673 | "dentry->d_name.name = [%s]\n", dentry->d_name.name); | ||
674 | rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); | 713 | rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); |
675 | set_fs(old_fs); | 714 | set_fs(old_fs); |
676 | if (rc < 0) | 715 | if (rc < 0) |
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 0d713b691941..ff539420cc6f 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c | |||
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, | |||
358 | /* verify that everything through the encrypted FEK size is present */ | 358 | /* verify that everything through the encrypted FEK size is present */ |
359 | if (message_len < 4) { | 359 | if (message_len < 4) { |
360 | rc = -EIO; | 360 | rc = -EIO; |
361 | printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " | 361 | printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " |
362 | "message length is [%d]\n", __func__, message_len, 4); | 362 | "message length is [%d]\n", __func__, message_len, 4); |
363 | goto out; | 363 | goto out; |
364 | } | 364 | } |
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, | |||
385 | i += data_len; | 385 | i += data_len; |
386 | if (message_len < (i + key_rec->enc_key_size)) { | 386 | if (message_len < (i + key_rec->enc_key_size)) { |
387 | rc = -EIO; | 387 | rc = -EIO; |
388 | printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", | 388 | printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", |
389 | __func__, message_len, (i + key_rec->enc_key_size)); | 389 | __func__, message_len, (i + key_rec->enc_key_size)); |
390 | goto out; | 390 | goto out; |
391 | } | 391 | } |
392 | if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { | 392 | if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { |
393 | rc = -EIO; | 393 | rc = -EIO; |
394 | printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " | 394 | printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " |
395 | "the maximum key size [%d]\n", __func__, | 395 | "the maximum key size [%d]\n", __func__, |
396 | key_rec->enc_key_size, | 396 | key_rec->enc_key_size, |
397 | ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); | 397 | ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); |
@@ -403,6 +403,580 @@ out: | |||
403 | } | 403 | } |
404 | 404 | ||
405 | static int | 405 | static int |
406 | ecryptfs_find_global_auth_tok_for_sig( | ||
407 | struct ecryptfs_global_auth_tok **global_auth_tok, | ||
408 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) | ||
409 | { | ||
410 | struct ecryptfs_global_auth_tok *walker; | ||
411 | int rc = 0; | ||
412 | |||
413 | (*global_auth_tok) = NULL; | ||
414 | mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
415 | list_for_each_entry(walker, | ||
416 | &mount_crypt_stat->global_auth_tok_list, | ||
417 | mount_crypt_stat_list) { | ||
418 | if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { | ||
419 | (*global_auth_tok) = walker; | ||
420 | goto out; | ||
421 | } | ||
422 | } | ||
423 | rc = -EINVAL; | ||
424 | out: | ||
425 | mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
426 | return rc; | ||
427 | } | ||
428 | |||
429 | /** | ||
430 | * ecryptfs_find_auth_tok_for_sig | ||
431 | * @auth_tok: Set to the matching auth_tok; NULL if not found | ||
432 | * @crypt_stat: inode crypt_stat crypto context | ||
433 | * @sig: Sig of auth_tok to find | ||
434 | * | ||
435 | * For now, this function simply looks at the registered auth_tok's | ||
436 | * linked off the mount_crypt_stat, so all the auth_toks that can be | ||
437 | * used must be registered at mount time. This function could | ||
438 | * potentially try a lot harder to find auth_tok's (e.g., by calling | ||
439 | * out to ecryptfsd to dynamically retrieve an auth_tok object) so | ||
440 | * that static registration of auth_tok's will no longer be necessary. | ||
441 | * | ||
442 | * Returns zero on no error; non-zero on error | ||
443 | */ | ||
444 | static int | ||
445 | ecryptfs_find_auth_tok_for_sig( | ||
446 | struct ecryptfs_auth_tok **auth_tok, | ||
447 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
448 | char *sig) | ||
449 | { | ||
450 | struct ecryptfs_global_auth_tok *global_auth_tok; | ||
451 | int rc = 0; | ||
452 | |||
453 | (*auth_tok) = NULL; | ||
454 | if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, | ||
455 | mount_crypt_stat, sig)) { | ||
456 | struct key *auth_tok_key; | ||
457 | |||
458 | rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, | ||
459 | sig); | ||
460 | } else | ||
461 | (*auth_tok) = global_auth_tok->global_auth_tok; | ||
462 | return rc; | ||
463 | } | ||
464 | |||
465 | /** | ||
466 | * write_tag_70_packet can gobble a lot of stack space. We stuff most | ||
467 | * of the function's local variables in a kmalloc'd struct to help reduce | ||
468 | * eCryptfs' overall stack usage. | ||
469 | */ | ||
470 | struct ecryptfs_write_tag_70_packet_silly_stack { | ||
471 | u8 cipher_code; | ||
472 | size_t max_packet_size; | ||
473 | size_t packet_size_len; | ||
474 | size_t block_aligned_filename_size; | ||
475 | size_t block_size; | ||
476 | size_t i; | ||
477 | size_t j; | ||
478 | size_t num_rand_bytes; | ||
479 | struct mutex *tfm_mutex; | ||
480 | char *block_aligned_filename; | ||
481 | struct ecryptfs_auth_tok *auth_tok; | ||
482 | struct scatterlist src_sg; | ||
483 | struct scatterlist dst_sg; | ||
484 | struct blkcipher_desc desc; | ||
485 | char iv[ECRYPTFS_MAX_IV_BYTES]; | ||
486 | char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; | ||
487 | char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; | ||
488 | struct hash_desc hash_desc; | ||
489 | struct scatterlist hash_sg; | ||
490 | }; | ||
491 | |||
492 | /** | ||
493 | * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK | ||
494 | * @filename: NULL-terminated filename string | ||
495 | * | ||
496 | * This is the simplest mechanism for achieving filename encryption in | ||
497 | * eCryptfs. It encrypts the given filename with the mount-wide | ||
498 | * filename encryption key (FNEK) and stores it in a packet to @dest, | ||
499 | * which the callee will encode and write directly into the dentry | ||
500 | * name. | ||
501 | */ | ||
502 | int | ||
503 | ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, | ||
504 | size_t *packet_size, | ||
505 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
506 | char *filename, size_t filename_size) | ||
507 | { | ||
508 | struct ecryptfs_write_tag_70_packet_silly_stack *s; | ||
509 | int rc = 0; | ||
510 | |||
511 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
512 | if (!s) { | ||
513 | printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " | ||
514 | "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); | ||
515 | goto out; | ||
516 | } | ||
517 | s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
518 | (*packet_size) = 0; | ||
519 | rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( | ||
520 | &s->desc.tfm, | ||
521 | &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); | ||
522 | if (unlikely(rc)) { | ||
523 | printk(KERN_ERR "Internal error whilst attempting to get " | ||
524 | "tfm and mutex for cipher name [%s]; rc = [%d]\n", | ||
525 | mount_crypt_stat->global_default_fn_cipher_name, rc); | ||
526 | goto out; | ||
527 | } | ||
528 | mutex_lock(s->tfm_mutex); | ||
529 | s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); | ||
530 | /* Plus one for the \0 separator between the random prefix | ||
531 | * and the plaintext filename */ | ||
532 | s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); | ||
533 | s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); | ||
534 | if ((s->block_aligned_filename_size % s->block_size) != 0) { | ||
535 | s->num_rand_bytes += (s->block_size | ||
536 | - (s->block_aligned_filename_size | ||
537 | % s->block_size)); | ||
538 | s->block_aligned_filename_size = (s->num_rand_bytes | ||
539 | + filename_size); | ||
540 | } | ||
541 | /* Octet 0: Tag 70 identifier | ||
542 | * Octets 1-N1: Tag 70 packet size (includes cipher identifier | ||
543 | * and block-aligned encrypted filename size) | ||
544 | * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) | ||
545 | * Octet N2-N3: Cipher identifier (1 octet) | ||
546 | * Octets N3-N4: Block-aligned encrypted filename | ||
547 | * - Consists of a minimum number of random characters, a \0 | ||
548 | * separator, and then the filename */ | ||
549 | s->max_packet_size = (1 /* Tag 70 identifier */ | ||
550 | + 3 /* Max Tag 70 packet size */ | ||
551 | + ECRYPTFS_SIG_SIZE /* FNEK sig */ | ||
552 | + 1 /* Cipher identifier */ | ||
553 | + s->block_aligned_filename_size); | ||
554 | if (dest == NULL) { | ||
555 | (*packet_size) = s->max_packet_size; | ||
556 | goto out_unlock; | ||
557 | } | ||
558 | if (s->max_packet_size > (*remaining_bytes)) { | ||
559 | printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " | ||
560 | "[%zd] available\n", __func__, s->max_packet_size, | ||
561 | (*remaining_bytes)); | ||
562 | rc = -EINVAL; | ||
563 | goto out_unlock; | ||
564 | } | ||
565 | s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, | ||
566 | GFP_KERNEL); | ||
567 | if (!s->block_aligned_filename) { | ||
568 | printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " | ||
569 | "kzalloc [%zd] bytes\n", __func__, | ||
570 | s->block_aligned_filename_size); | ||
571 | rc = -ENOMEM; | ||
572 | goto out_unlock; | ||
573 | } | ||
574 | s->i = 0; | ||
575 | dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; | ||
576 | rc = ecryptfs_write_packet_length(&dest[s->i], | ||
577 | (ECRYPTFS_SIG_SIZE | ||
578 | + 1 /* Cipher code */ | ||
579 | + s->block_aligned_filename_size), | ||
580 | &s->packet_size_len); | ||
581 | if (rc) { | ||
582 | printk(KERN_ERR "%s: Error generating tag 70 packet " | ||
583 | "header; cannot generate packet length; rc = [%d]\n", | ||
584 | __func__, rc); | ||
585 | goto out_free_unlock; | ||
586 | } | ||
587 | s->i += s->packet_size_len; | ||
588 | ecryptfs_from_hex(&dest[s->i], | ||
589 | mount_crypt_stat->global_default_fnek_sig, | ||
590 | ECRYPTFS_SIG_SIZE); | ||
591 | s->i += ECRYPTFS_SIG_SIZE; | ||
592 | s->cipher_code = ecryptfs_code_for_cipher_string( | ||
593 | mount_crypt_stat->global_default_fn_cipher_name, | ||
594 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
595 | if (s->cipher_code == 0) { | ||
596 | printk(KERN_WARNING "%s: Unable to generate code for " | ||
597 | "cipher [%s] with key bytes [%zd]\n", __func__, | ||
598 | mount_crypt_stat->global_default_fn_cipher_name, | ||
599 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
600 | rc = -EINVAL; | ||
601 | goto out_free_unlock; | ||
602 | } | ||
603 | dest[s->i++] = s->cipher_code; | ||
604 | rc = ecryptfs_find_auth_tok_for_sig( | ||
605 | &s->auth_tok, mount_crypt_stat, | ||
606 | mount_crypt_stat->global_default_fnek_sig); | ||
607 | if (rc) { | ||
608 | printk(KERN_ERR "%s: Error attempting to find auth tok for " | ||
609 | "fnek sig [%s]; rc = [%d]\n", __func__, | ||
610 | mount_crypt_stat->global_default_fnek_sig, rc); | ||
611 | goto out_free_unlock; | ||
612 | } | ||
613 | /* TODO: Support other key modules than passphrase for | ||
614 | * filename encryption */ | ||
615 | BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); | ||
616 | sg_init_one( | ||
617 | &s->hash_sg, | ||
618 | (u8 *)s->auth_tok->token.password.session_key_encryption_key, | ||
619 | s->auth_tok->token.password.session_key_encryption_key_bytes); | ||
620 | s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
621 | s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, | ||
622 | CRYPTO_ALG_ASYNC); | ||
623 | if (IS_ERR(s->hash_desc.tfm)) { | ||
624 | rc = PTR_ERR(s->hash_desc.tfm); | ||
625 | printk(KERN_ERR "%s: Error attempting to " | ||
626 | "allocate hash crypto context; rc = [%d]\n", | ||
627 | __func__, rc); | ||
628 | goto out_free_unlock; | ||
629 | } | ||
630 | rc = crypto_hash_init(&s->hash_desc); | ||
631 | if (rc) { | ||
632 | printk(KERN_ERR | ||
633 | "%s: Error initializing crypto hash; rc = [%d]\n", | ||
634 | __func__, rc); | ||
635 | goto out_release_free_unlock; | ||
636 | } | ||
637 | rc = crypto_hash_update( | ||
638 | &s->hash_desc, &s->hash_sg, | ||
639 | s->auth_tok->token.password.session_key_encryption_key_bytes); | ||
640 | if (rc) { | ||
641 | printk(KERN_ERR | ||
642 | "%s: Error updating crypto hash; rc = [%d]\n", | ||
643 | __func__, rc); | ||
644 | goto out_release_free_unlock; | ||
645 | } | ||
646 | rc = crypto_hash_final(&s->hash_desc, s->hash); | ||
647 | if (rc) { | ||
648 | printk(KERN_ERR | ||
649 | "%s: Error finalizing crypto hash; rc = [%d]\n", | ||
650 | __func__, rc); | ||
651 | goto out_release_free_unlock; | ||
652 | } | ||
653 | for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { | ||
654 | s->block_aligned_filename[s->j] = | ||
655 | s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; | ||
656 | if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) | ||
657 | == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { | ||
658 | sg_init_one(&s->hash_sg, (u8 *)s->hash, | ||
659 | ECRYPTFS_TAG_70_DIGEST_SIZE); | ||
660 | rc = crypto_hash_init(&s->hash_desc); | ||
661 | if (rc) { | ||
662 | printk(KERN_ERR | ||
663 | "%s: Error initializing crypto hash; " | ||
664 | "rc = [%d]\n", __func__, rc); | ||
665 | goto out_release_free_unlock; | ||
666 | } | ||
667 | rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, | ||
668 | ECRYPTFS_TAG_70_DIGEST_SIZE); | ||
669 | if (rc) { | ||
670 | printk(KERN_ERR | ||
671 | "%s: Error updating crypto hash; " | ||
672 | "rc = [%d]\n", __func__, rc); | ||
673 | goto out_release_free_unlock; | ||
674 | } | ||
675 | rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); | ||
676 | if (rc) { | ||
677 | printk(KERN_ERR | ||
678 | "%s: Error finalizing crypto hash; " | ||
679 | "rc = [%d]\n", __func__, rc); | ||
680 | goto out_release_free_unlock; | ||
681 | } | ||
682 | memcpy(s->hash, s->tmp_hash, | ||
683 | ECRYPTFS_TAG_70_DIGEST_SIZE); | ||
684 | } | ||
685 | if (s->block_aligned_filename[s->j] == '\0') | ||
686 | s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; | ||
687 | } | ||
688 | memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, | ||
689 | filename_size); | ||
690 | rc = virt_to_scatterlist(s->block_aligned_filename, | ||
691 | s->block_aligned_filename_size, &s->src_sg, 1); | ||
692 | if (rc != 1) { | ||
693 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
694 | "convert filename memory to scatterlist; " | ||
695 | "expected rc = 1; got rc = [%d]. " | ||
696 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
697 | s->block_aligned_filename_size); | ||
698 | goto out_release_free_unlock; | ||
699 | } | ||
700 | rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, | ||
701 | &s->dst_sg, 1); | ||
702 | if (rc != 1) { | ||
703 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
704 | "convert encrypted filename memory to scatterlist; " | ||
705 | "expected rc = 1; got rc = [%d]. " | ||
706 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
707 | s->block_aligned_filename_size); | ||
708 | goto out_release_free_unlock; | ||
709 | } | ||
710 | /* The characters in the first block effectively do the job | ||
711 | * of the IV here, so we just use 0's for the IV. Note the | ||
712 | * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES | ||
713 | * >= ECRYPTFS_MAX_IV_BYTES. */ | ||
714 | memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); | ||
715 | s->desc.info = s->iv; | ||
716 | rc = crypto_blkcipher_setkey( | ||
717 | s->desc.tfm, | ||
718 | s->auth_tok->token.password.session_key_encryption_key, | ||
719 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
720 | if (rc < 0) { | ||
721 | printk(KERN_ERR "%s: Error setting key for crypto context; " | ||
722 | "rc = [%d]. s->auth_tok->token.password.session_key_" | ||
723 | "encryption_key = [0x%p]; mount_crypt_stat->" | ||
724 | "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, | ||
725 | rc, | ||
726 | s->auth_tok->token.password.session_key_encryption_key, | ||
727 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
728 | goto out_release_free_unlock; | ||
729 | } | ||
730 | rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, | ||
731 | s->block_aligned_filename_size); | ||
732 | if (rc) { | ||
733 | printk(KERN_ERR "%s: Error attempting to encrypt filename; " | ||
734 | "rc = [%d]\n", __func__, rc); | ||
735 | goto out_release_free_unlock; | ||
736 | } | ||
737 | s->i += s->block_aligned_filename_size; | ||
738 | (*packet_size) = s->i; | ||
739 | (*remaining_bytes) -= (*packet_size); | ||
740 | out_release_free_unlock: | ||
741 | crypto_free_hash(s->hash_desc.tfm); | ||
742 | out_free_unlock: | ||
743 | memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); | ||
744 | kfree(s->block_aligned_filename); | ||
745 | out_unlock: | ||
746 | mutex_unlock(s->tfm_mutex); | ||
747 | out: | ||
748 | kfree(s); | ||
749 | return rc; | ||
750 | } | ||
751 | |||
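To make the padding arithmetic in ecryptfs_write_tag_70_packet() concrete, assume ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES is 16 and ECRYPTFS_SIG_SIZE is 8 (assumed values, not stated in this hunk), with a 9-byte filename and a 16-byte cipher block size:

/* Worked example (assumed constants: prepend minimum = 16, sig = 8):
 *   num_rand_bytes              = 16 + 1         = 17
 *   block_aligned_filename_size = 17 + 9         = 26   (26 % 16 != 0)
 *   num_rand_bytes             += 16 - (26 % 16) -> 17 + 6 = 23
 *   block_aligned_filename_size = 23 + 9         = 32
 *   max_packet_size = 1 + 3 + 8 + 1 + 32         = 45 octets
 * Note the prefix is derived by iterated hashing of the session key
 * rather than get_random_bytes(): a given filename always encrypts to
 * the same packet under a given FNEK, so lookups stay deterministic. */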
752 | struct ecryptfs_parse_tag_70_packet_silly_stack { | ||
753 | u8 cipher_code; | ||
754 | size_t max_packet_size; | ||
755 | size_t packet_size_len; | ||
756 | size_t parsed_tag_70_packet_size; | ||
757 | size_t block_aligned_filename_size; | ||
758 | size_t block_size; | ||
759 | size_t i; | ||
760 | struct mutex *tfm_mutex; | ||
761 | char *decrypted_filename; | ||
762 | struct ecryptfs_auth_tok *auth_tok; | ||
763 | struct scatterlist src_sg; | ||
764 | struct scatterlist dst_sg; | ||
765 | struct blkcipher_desc desc; | ||
766 | char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; | ||
767 | char iv[ECRYPTFS_MAX_IV_BYTES]; | ||
768 | char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; | ||
769 | }; | ||
770 | |||
771 | /** | ||
772 | * parse_tag_70_packet - Parse and process FNEK-encrypted filename packet | ||
773 | * @filename: This function kmalloc's the memory for the filename | ||
774 | * @filename_size: This function sets this to the amount of memory | ||
775 | * kmalloc'd for the filename | ||
776 | * @packet_size: This function sets this to the number of octets | ||
777 | * in the packet parsed | ||
778 | * @mount_crypt_stat: The mount-wide cryptographic context | ||
779 | * @data: The memory location containing the start of the tag 70 | ||
780 | * packet | ||
781 | * @max_packet_size: The maximum legal size of the packet to be parsed | ||
782 | * from @data | ||
783 | * | ||
784 | * Returns zero on success; non-zero otherwise | ||
785 | */ | ||
786 | int | ||
787 | ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, | ||
788 | size_t *packet_size, | ||
789 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
790 | char *data, size_t max_packet_size) | ||
791 | { | ||
792 | struct ecryptfs_parse_tag_70_packet_silly_stack *s; | ||
793 | int rc = 0; | ||
794 | |||
795 | (*packet_size) = 0; | ||
796 | (*filename_size) = 0; | ||
797 | (*filename) = NULL; | ||
798 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
799 | if (!s) { | ||
800 | printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " | ||
801 | "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); | ||
802 | goto out; | ||
803 | } | ||
804 | s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
805 | if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { | ||
806 | printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " | ||
807 | "at least [%d]\n", __func__, max_packet_size, | ||
808 | (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); | ||
809 | rc = -EINVAL; | ||
810 | goto out; | ||
811 | } | ||
812 | /* Octet 0: Tag 70 identifier | ||
813 | * Octets 1-N1: Tag 70 packet size (includes cipher identifier | ||
814 | * and block-aligned encrypted filename size) | ||
815 | * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) | ||
816 | * Octet N2-N3: Cipher identifier (1 octet) | ||
817 | * Octets N3-N4: Block-aligned encrypted filename | ||
818 | * - Consists of a minimum number of random characters, a \0 | ||
819 | * separator, and then the filename */ | ||
820 | if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { | ||
821 | printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " | ||
822 | "tag [0x%.2x]\n", __func__, | ||
823 | data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); | ||
824 | rc = -EINVAL; | ||
825 | goto out; | ||
826 | } | ||
827 | rc = ecryptfs_parse_packet_length(&data[(*packet_size)], | ||
828 | &s->parsed_tag_70_packet_size, | ||
829 | &s->packet_size_len); | ||
830 | if (rc) { | ||
831 | printk(KERN_WARNING "%s: Error parsing packet length; " | ||
832 | "rc = [%d]\n", __func__, rc); | ||
833 | goto out; | ||
834 | } | ||
835 | s->block_aligned_filename_size = (s->parsed_tag_70_packet_size | ||
836 | - ECRYPTFS_SIG_SIZE - 1); | ||
837 | if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) | ||
838 | > max_packet_size) { | ||
839 | printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet " | ||
840 | "size is [%zd]\n", __func__, max_packet_size, | ||
841 | (1 + s->packet_size_len + 1 | ||
842 | + s->block_aligned_filename_size)); | ||
843 | rc = -EINVAL; | ||
844 | goto out; | ||
845 | } | ||
846 | (*packet_size) += s->packet_size_len; | ||
847 | ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], | ||
848 | ECRYPTFS_SIG_SIZE); | ||
849 | s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; | ||
850 | (*packet_size) += ECRYPTFS_SIG_SIZE; | ||
851 | s->cipher_code = data[(*packet_size)++]; | ||
852 | rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); | ||
853 | if (rc) { | ||
854 | printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", | ||
855 | __func__, s->cipher_code); | ||
856 | goto out; | ||
857 | } | ||
858 | rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, | ||
859 | &s->tfm_mutex, | ||
860 | s->cipher_string); | ||
861 | if (unlikely(rc)) { | ||
862 | printk(KERN_ERR "Internal error whilst attempting to get " | ||
863 | "tfm and mutex for cipher name [%s]; rc = [%d]\n", | ||
864 | s->cipher_string, rc); | ||
865 | goto out; | ||
866 | } | ||
867 | mutex_lock(s->tfm_mutex); | ||
868 | rc = virt_to_scatterlist(&data[(*packet_size)], | ||
869 | s->block_aligned_filename_size, &s->src_sg, 1); | ||
870 | if (rc != 1) { | ||
871 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
872 | "convert encrypted filename memory to scatterlist; " | ||
873 | "expected rc = 1; got rc = [%d]. " | ||
874 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
875 | s->block_aligned_filename_size); | ||
876 | goto out_unlock; | ||
877 | } | ||
878 | (*packet_size) += s->block_aligned_filename_size; | ||
879 | s->decrypted_filename = kmalloc(s->block_aligned_filename_size, | ||
880 | GFP_KERNEL); | ||
881 | if (!s->decrypted_filename) { | ||
882 | printk(KERN_ERR "%s: Out of memory whilst attempting to " | ||
883 | "kmalloc [%zd] bytes\n", __func__, | ||
884 | s->block_aligned_filename_size); | ||
885 | rc = -ENOMEM; | ||
886 | goto out_unlock; | ||
887 | } | ||
888 | rc = virt_to_scatterlist(s->decrypted_filename, | ||
889 | s->block_aligned_filename_size, &s->dst_sg, 1); | ||
890 | if (rc != 1) { | ||
891 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
892 | "convert decrypted filename memory to scatterlist; " | ||
893 | "expected rc = 1; got rc = [%d]. " | ||
894 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
895 | s->block_aligned_filename_size); | ||
896 | goto out_free_unlock; | ||
897 | } | ||
898 | /* The characters in the first block effectively do the job of | ||
899 | * the IV here, so we just use 0's for the IV. Note the | ||
900 | * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES | ||
901 | * >= ECRYPTFS_MAX_IV_BYTES. */ | ||
902 | memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); | ||
903 | s->desc.info = s->iv; | ||
904 | rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, | ||
905 | s->fnek_sig_hex); | ||
906 | if (rc) { | ||
907 | printk(KERN_ERR "%s: Error attempting to find auth tok for " | ||
908 | "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, | ||
909 | rc); | ||
910 | goto out_free_unlock; | ||
911 | } | ||
912 | /* TODO: Support other key modules than passphrase for | ||
913 | * filename encryption */ | ||
914 | BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); | ||
915 | rc = crypto_blkcipher_setkey( | ||
916 | s->desc.tfm, | ||
917 | s->auth_tok->token.password.session_key_encryption_key, | ||
918 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
919 | if (rc < 0) { | ||
920 | printk(KERN_ERR "%s: Error setting key for crypto context; " | ||
921 | "rc = [%d]. s->auth_tok->token.password.session_key_" | ||
922 | "encryption_key = [0x%p]; mount_crypt_stat->" | ||
923 | "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, | ||
924 | rc, | ||
925 | s->auth_tok->token.password.session_key_encryption_key, | ||
926 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
927 | goto out_free_unlock; | ||
928 | } | ||
929 | rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, | ||
930 | s->block_aligned_filename_size); | ||
931 | if (rc) { | ||
932 | printk(KERN_ERR "%s: Error attempting to decrypt filename; " | ||
933 | "rc = [%d]\n", __func__, rc); | ||
934 | goto out_free_unlock; | ||
935 | } | ||
936 | s->i = 0; | ||
937 | while (s->decrypted_filename[s->i] != '\0' | ||
938 | && s->i < s->block_aligned_filename_size) | ||
939 | s->i++; | ||
940 | if (s->i == s->block_aligned_filename_size) { | ||
941 | printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " | ||
942 | "find valid separator between random characters and " | ||
943 | "the filename\n", __func__); | ||
944 | rc = -EINVAL; | ||
945 | goto out_free_unlock; | ||
946 | } | ||
947 | s->i++; | ||
948 | (*filename_size) = (s->block_aligned_filename_size - s->i); | ||
949 | if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { | ||
950 | printk(KERN_WARNING "%s: Filename size is [%zd], which is " | ||
951 | "invalid\n", __func__, (*filename_size)); | ||
952 | rc = -EINVAL; | ||
953 | goto out_free_unlock; | ||
954 | } | ||
955 | (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); | ||
956 | if (!(*filename)) { | ||
957 | printk(KERN_ERR "%s: Out of memory whilst attempting to " | ||
958 | "kmalloc [%zd] bytes\n", __func__, | ||
959 | ((*filename_size) + 1)); | ||
960 | rc = -ENOMEM; | ||
961 | goto out_free_unlock; | ||
962 | } | ||
963 | memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); | ||
964 | (*filename)[(*filename_size)] = '\0'; | ||
965 | out_free_unlock: | ||
966 | kfree(s->decrypted_filename); | ||
967 | out_unlock: | ||
968 | mutex_unlock(s->tfm_mutex); | ||
969 | out: | ||
970 | if (rc) { | ||
971 | (*packet_size) = 0; | ||
972 | (*filename_size) = 0; | ||
973 | (*filename) = NULL; | ||
974 | } | ||
975 | kfree(s); | ||
976 | return rc; | ||
977 | } | ||
978 | |||
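The two packet routines are designed as inverses, with the usual two-pass sizing idiom on the write side (dest == NULL first, to learn the worst-case size). A hedged usage sketch under the signatures above; the real callers are the new ecryptfs_encrypt_and_encode_filename() and ecryptfs_decode_and_decrypt_filename() wrappers, and the buffer handling here is illustrative only:

char *packet, *name_out;
size_t packet_size, name_out_size;
size_t remaining = 0;
int rc;

/* Pass 1: dest == NULL, so only the worst-case size is reported. */
rc = ecryptfs_write_tag_70_packet(NULL, &remaining, &packet_size,
				  mount_crypt_stat, "secret.txt", 10);
if (rc)
	return rc;
packet = kmalloc(packet_size, GFP_KERNEL);
if (!packet)
	return -ENOMEM;
remaining = packet_size;
/* Pass 2: actually encrypt the name and emit the packet. */
rc = ecryptfs_write_tag_70_packet(packet, &remaining, &packet_size,
				  mount_crypt_stat, "secret.txt", 10);
if (!rc)
	/* Recovers a kmalloc'd, NUL-terminated copy of the filename. */
	rc = ecryptfs_parse_tag_70_packet(&name_out, &name_out_size,
					  &packet_size, mount_crypt_stat,
					  packet, packet_size);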
979 | static int | ||
406 | ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) | 980 | ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) |
407 | { | 981 | { |
408 | int rc = 0; | 982 | int rc = 0; |
@@ -897,30 +1471,6 @@ out: | |||
897 | return rc; | 1471 | return rc; |
898 | } | 1472 | } |
899 | 1473 | ||
900 | static int | ||
901 | ecryptfs_find_global_auth_tok_for_sig( | ||
902 | struct ecryptfs_global_auth_tok **global_auth_tok, | ||
903 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) | ||
904 | { | ||
905 | struct ecryptfs_global_auth_tok *walker; | ||
906 | int rc = 0; | ||
907 | |||
908 | (*global_auth_tok) = NULL; | ||
909 | mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
910 | list_for_each_entry(walker, | ||
911 | &mount_crypt_stat->global_auth_tok_list, | ||
912 | mount_crypt_stat_list) { | ||
913 | if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { | ||
914 | (*global_auth_tok) = walker; | ||
915 | goto out; | ||
916 | } | ||
917 | } | ||
918 | rc = -EINVAL; | ||
919 | out: | ||
920 | mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
921 | return rc; | ||
922 | } | ||
923 | |||
924 | /** | 1474 | /** |
925 | * ecryptfs_verify_version | 1475 | * ecryptfs_verify_version |
926 | * @version: The version number to confirm | 1476 | * @version: The version number to confirm |
@@ -990,43 +1540,6 @@ out: | |||
990 | } | 1540 | } |
991 | 1541 | ||
992 | /** | 1542 | /** |
993 | * ecryptfs_find_auth_tok_for_sig | ||
994 | * @auth_tok: Set to the matching auth_tok; NULL if not found | ||
995 | * @crypt_stat: inode crypt_stat crypto context | ||
996 | * @sig: Sig of auth_tok to find | ||
997 | * | ||
998 | * For now, this function simply looks at the registered auth_tok's | ||
999 | * linked off the mount_crypt_stat, so all the auth_toks that can be | ||
1000 | * used must be registered at mount time. This function could | ||
1001 | * potentially try a lot harder to find auth_tok's (e.g., by calling | ||
1002 | * out to ecryptfsd to dynamically retrieve an auth_tok object) so | ||
1003 | * that static registration of auth_tok's will no longer be necessary. | ||
1004 | * | ||
1005 | * Returns zero on no error; non-zero on error | ||
1006 | */ | ||
1007 | static int | ||
1008 | ecryptfs_find_auth_tok_for_sig( | ||
1009 | struct ecryptfs_auth_tok **auth_tok, | ||
1010 | struct ecryptfs_crypt_stat *crypt_stat, char *sig) | ||
1011 | { | ||
1012 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | ||
1013 | crypt_stat->mount_crypt_stat; | ||
1014 | struct ecryptfs_global_auth_tok *global_auth_tok; | ||
1015 | int rc = 0; | ||
1016 | |||
1017 | (*auth_tok) = NULL; | ||
1018 | if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, | ||
1019 | mount_crypt_stat, sig)) { | ||
1020 | struct key *auth_tok_key; | ||
1021 | |||
1022 | rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, | ||
1023 | sig); | ||
1024 | } else | ||
1025 | (*auth_tok) = global_auth_tok->global_auth_tok; | ||
1026 | return rc; | ||
1027 | } | ||
1028 | |||
1029 | /** | ||
1030 | * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. | 1543 | * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. |
1031 | * @auth_tok: The passphrase authentication token to use to encrypt the FEK | 1544 | * @auth_tok: The passphrase authentication token to use to encrypt the FEK |
1032 | * @crypt_stat: The cryptographic context | 1545 | * @crypt_stat: The cryptographic context |
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok: | |||
1256 | rc = -EINVAL; | 1769 | rc = -EINVAL; |
1257 | goto out_wipe_list; | 1770 | goto out_wipe_list; |
1258 | } | 1771 | } |
1259 | ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, | 1772 | ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, |
1773 | crypt_stat->mount_crypt_stat, | ||
1260 | candidate_auth_tok_sig); | 1774 | candidate_auth_tok_sig); |
1261 | if (matching_auth_tok) { | 1775 | if (matching_auth_tok) { |
1262 | found_auth_tok = 1; | 1776 | found_auth_tok = 1; |
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, | |||
1336 | int rc; | 1850 | int rc; |
1337 | 1851 | ||
1338 | rc = write_tag_66_packet(auth_tok->token.private_key.signature, | 1852 | rc = write_tag_66_packet(auth_tok->token.private_key.signature, |
1339 | ecryptfs_code_for_cipher_string(crypt_stat), | 1853 | ecryptfs_code_for_cipher_string( |
1854 | crypt_stat->cipher, | ||
1855 | crypt_stat->key_size), | ||
1340 | crypt_stat, &payload, &payload_len); | 1856 | crypt_stat, &payload, &payload_len); |
1341 | if (rc) { | 1857 | if (rc) { |
1342 | ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); | 1858 | ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); |
@@ -1696,7 +2212,8 @@ encrypted_session_key_set: | |||
1696 | dest[(*packet_size)++] = 0x04; /* version 4 */ | 2212 | dest[(*packet_size)++] = 0x04; /* version 4 */ |
1697 | /* TODO: Break from RFC2440 so that arbitrary ciphers can be | 2213 | /* TODO: Break from RFC2440 so that arbitrary ciphers can be |
1698 | * specified with strings */ | 2214 | * specified with strings */ |
1699 | cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); | 2215 | cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, |
2216 | crypt_stat->key_size); | ||
1700 | if (cipher_code == 0) { | 2217 | if (cipher_code == 0) { |
1701 | ecryptfs_printk(KERN_WARNING, "Unable to generate code for " | 2218 | ecryptfs_printk(KERN_WARNING, "Unable to generate code for " |
1702 | "cipher [%s]\n", crypt_stat->cipher); | 2219 | "cipher [%s]\n", crypt_stat->cipher); |
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index fd630713c5c7..789cf2e1be1e 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c | |||
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, | |||
206 | ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, | 206 | ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, |
207 | ecryptfs_opt_ecryptfs_key_bytes, | 207 | ecryptfs_opt_ecryptfs_key_bytes, |
208 | ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, | 208 | ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, |
209 | ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; | 209 | ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, |
210 | ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, | ||
211 | ecryptfs_opt_err }; | ||
210 | 212 | ||
211 | static const match_table_t tokens = { | 213 | static const match_table_t tokens = { |
212 | {ecryptfs_opt_sig, "sig=%s"}, | 214 | {ecryptfs_opt_sig, "sig=%s"}, |
@@ -217,6 +219,9 @@ static const match_table_t tokens = { | |||
217 | {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, | 219 | {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, |
218 | {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, | 220 | {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, |
219 | {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, | 221 | {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, |
222 | {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, | ||
223 | {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, | ||
224 | {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, | ||
220 | {ecryptfs_opt_err, NULL} | 225 | {ecryptfs_opt_err, NULL} |
221 | }; | 226 | }; |
222 | 227 | ||
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
281 | int rc = 0; | 286 | int rc = 0; |
282 | int sig_set = 0; | 287 | int sig_set = 0; |
283 | int cipher_name_set = 0; | 288 | int cipher_name_set = 0; |
289 | int fn_cipher_name_set = 0; | ||
284 | int cipher_key_bytes; | 290 | int cipher_key_bytes; |
285 | int cipher_key_bytes_set = 0; | 291 | int cipher_key_bytes_set = 0; |
292 | int fn_cipher_key_bytes; | ||
293 | int fn_cipher_key_bytes_set = 0; | ||
286 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | 294 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = |
287 | &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; | 295 | &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; |
288 | substring_t args[MAX_OPT_ARGS]; | 296 | substring_t args[MAX_OPT_ARGS]; |
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
290 | char *sig_src; | 298 | char *sig_src; |
291 | char *cipher_name_dst; | 299 | char *cipher_name_dst; |
292 | char *cipher_name_src; | 300 | char *cipher_name_src; |
301 | char *fn_cipher_name_dst; | ||
302 | char *fn_cipher_name_src; | ||
303 | char *fnek_dst; | ||
304 | char *fnek_src; | ||
293 | char *cipher_key_bytes_src; | 305 | char *cipher_key_bytes_src; |
306 | char *fn_cipher_key_bytes_src; | ||
294 | 307 | ||
295 | if (!options) { | 308 | if (!options) { |
296 | rc = -EINVAL; | 309 | rc = -EINVAL; |
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
322 | global_default_cipher_name; | 335 | global_default_cipher_name; |
323 | strncpy(cipher_name_dst, cipher_name_src, | 336 | strncpy(cipher_name_dst, cipher_name_src, |
324 | ECRYPTFS_MAX_CIPHER_NAME_SIZE); | 337 | ECRYPTFS_MAX_CIPHER_NAME_SIZE); |
325 | ecryptfs_printk(KERN_DEBUG, | 338 | cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; |
326 | "The mount_crypt_stat " | ||
327 | "global_default_cipher_name set to: " | ||
328 | "[%s]\n", cipher_name_dst); | ||
329 | cipher_name_set = 1; | 339 | cipher_name_set = 1; |
330 | break; | 340 | break; |
331 | case ecryptfs_opt_ecryptfs_key_bytes: | 341 | case ecryptfs_opt_ecryptfs_key_bytes: |
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
335 | &cipher_key_bytes_src, 0); | 345 | &cipher_key_bytes_src, 0); |
336 | mount_crypt_stat->global_default_cipher_key_size = | 346 | mount_crypt_stat->global_default_cipher_key_size = |
337 | cipher_key_bytes; | 347 | cipher_key_bytes; |
338 | ecryptfs_printk(KERN_DEBUG, | ||
339 | "The mount_crypt_stat " | ||
340 | "global_default_cipher_key_size " | ||
341 | "set to: [%d]\n", mount_crypt_stat-> | ||
342 | global_default_cipher_key_size); | ||
343 | cipher_key_bytes_set = 1; | 348 | cipher_key_bytes_set = 1; |
344 | break; | 349 | break; |
345 | case ecryptfs_opt_passthrough: | 350 | case ecryptfs_opt_passthrough: |
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
356 | mount_crypt_stat->flags |= | 361 | mount_crypt_stat->flags |= |
357 | ECRYPTFS_ENCRYPTED_VIEW_ENABLED; | 362 | ECRYPTFS_ENCRYPTED_VIEW_ENABLED; |
358 | break; | 363 | break; |
364 | case ecryptfs_opt_fnek_sig: | ||
365 | fnek_src = args[0].from; | ||
366 | fnek_dst = | ||
367 | mount_crypt_stat->global_default_fnek_sig; | ||
368 | strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); | ||
369 | mount_crypt_stat->global_default_fnek_sig[ | ||
370 | ECRYPTFS_SIG_SIZE_HEX] = '\0'; | ||
371 | rc = ecryptfs_add_global_auth_tok( | ||
372 | mount_crypt_stat, | ||
373 | mount_crypt_stat->global_default_fnek_sig); | ||
374 | if (rc) { | ||
375 | printk(KERN_ERR "Error attempting to register " | ||
376 | "global fnek sig [%s]; rc = [%d]\n", | ||
377 | mount_crypt_stat->global_default_fnek_sig, | ||
378 | rc); | ||
379 | goto out; | ||
380 | } | ||
381 | mount_crypt_stat->flags |= | ||
382 | (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES | ||
383 | | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); | ||
384 | break; | ||
385 | case ecryptfs_opt_fn_cipher: | ||
386 | fn_cipher_name_src = args[0].from; | ||
387 | fn_cipher_name_dst = | ||
388 | mount_crypt_stat->global_default_fn_cipher_name; | ||
389 | strncpy(fn_cipher_name_dst, fn_cipher_name_src, | ||
390 | ECRYPTFS_MAX_CIPHER_NAME_SIZE); | ||
391 | mount_crypt_stat->global_default_fn_cipher_name[ | ||
392 | ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; | ||
393 | fn_cipher_name_set = 1; | ||
394 | break; | ||
395 | case ecryptfs_opt_fn_cipher_key_bytes: | ||
396 | fn_cipher_key_bytes_src = args[0].from; | ||
397 | fn_cipher_key_bytes = | ||
398 | (int)simple_strtol(fn_cipher_key_bytes_src, | ||
399 | &fn_cipher_key_bytes_src, 0); | ||
400 | mount_crypt_stat->global_default_fn_cipher_key_bytes = | ||
401 | fn_cipher_key_bytes; | ||
402 | fn_cipher_key_bytes_set = 1; | ||
403 | break; | ||
359 | case ecryptfs_opt_err: | 404 | case ecryptfs_opt_err: |
360 | default: | 405 | default: |
361 | ecryptfs_printk(KERN_WARNING, | 406 | printk(KERN_WARNING |
362 | "eCryptfs: unrecognized option '%s'\n", | 407 | "%s: eCryptfs: unrecognized option [%s]\n", |
363 | p); | 408 | __func__, p); |
364 | } | 409 | } |
365 | } | 410 | } |
366 | if (!sig_set) { | 411 | if (!sig_set) { |
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
374 | int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); | 419 | int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); |
375 | 420 | ||
376 | BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); | 421 | BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); |
377 | |||
378 | strcpy(mount_crypt_stat->global_default_cipher_name, | 422 | strcpy(mount_crypt_stat->global_default_cipher_name, |
379 | ECRYPTFS_DEFAULT_CIPHER); | 423 | ECRYPTFS_DEFAULT_CIPHER); |
380 | } | 424 | } |
381 | if (!cipher_key_bytes_set) { | 425 | if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) |
426 | && !fn_cipher_name_set) | ||
427 | strcpy(mount_crypt_stat->global_default_fn_cipher_name, | ||
428 | mount_crypt_stat->global_default_cipher_name); | ||
429 | if (!cipher_key_bytes_set) | ||
382 | mount_crypt_stat->global_default_cipher_key_size = 0; | 430 | mount_crypt_stat->global_default_cipher_key_size = 0; |
383 | } | 431 | if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) |
432 | && !fn_cipher_key_bytes_set) | ||
433 | mount_crypt_stat->global_default_fn_cipher_key_bytes = | ||
434 | mount_crypt_stat->global_default_cipher_key_size; | ||
384 | mutex_lock(&key_tfm_list_mutex); | 435 | mutex_lock(&key_tfm_list_mutex); |
385 | if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, | 436 | if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, |
386 | NULL)) | 437 | NULL)) { |
387 | rc = ecryptfs_add_new_key_tfm( | 438 | rc = ecryptfs_add_new_key_tfm( |
388 | NULL, mount_crypt_stat->global_default_cipher_name, | 439 | NULL, mount_crypt_stat->global_default_cipher_name, |
389 | mount_crypt_stat->global_default_cipher_key_size); | 440 | mount_crypt_stat->global_default_cipher_key_size); |
390 | mutex_unlock(&key_tfm_list_mutex); | 441 | if (rc) { |
391 | if (rc) { | 442 | printk(KERN_ERR "Error attempting to initialize " |
392 | printk(KERN_ERR "Error attempting to initialize cipher with " | 443 | "cipher with name = [%s] and key size = [%td]; " |
393 | "name = [%s] and key size = [%td]; rc = [%d]\n", | 444 | "rc = [%d]\n", |
394 | mount_crypt_stat->global_default_cipher_name, | 445 | mount_crypt_stat->global_default_cipher_name, |
395 | mount_crypt_stat->global_default_cipher_key_size, rc); | 446 | mount_crypt_stat->global_default_cipher_key_size, |
396 | rc = -EINVAL; | 447 | rc); |
397 | goto out; | 448 | rc = -EINVAL; |
449 | mutex_unlock(&key_tfm_list_mutex); | ||
450 | goto out; | ||
451 | } | ||
398 | } | 452 | } |
453 | if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) | ||
454 | && !ecryptfs_tfm_exists( | ||
455 | mount_crypt_stat->global_default_fn_cipher_name, NULL)) { | ||
456 | rc = ecryptfs_add_new_key_tfm( | ||
457 | NULL, mount_crypt_stat->global_default_fn_cipher_name, | ||
458 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
459 | if (rc) { | ||
460 | printk(KERN_ERR "Error attempting to initialize " | ||
461 | "cipher with name = [%s] and key size = [%td]; " | ||
462 | "rc = [%d]\n", | ||
463 | mount_crypt_stat->global_default_fn_cipher_name, | ||
464 | mount_crypt_stat->global_default_fn_cipher_key_bytes, | ||
465 | rc); | ||
466 | rc = -EINVAL; | ||
467 | mutex_unlock(&key_tfm_list_mutex); | ||
468 | goto out; | ||
469 | } | ||
470 | } | ||
471 | mutex_unlock(&key_tfm_list_mutex); | ||
399 | rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); | 472 | rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); |
400 | if (rc) { | 473 | if (rc) |
401 | printk(KERN_WARNING "One or more global auth toks could not " | 474 | printk(KERN_WARNING "One or more global auth toks could not " |
402 | "properly register; rc = [%d]\n", rc); | 475 | "properly register; rc = [%d]\n", rc); |
403 | } | ||
404 | out: | 476 | out: |
405 | return rc; | 477 | return rc; |
406 | } | 478 | } |
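
The main.c hunks add three mount options for filename encryption (ecryptfs_fnek_sig, ecryptfs_fn_cipher, ecryptfs_fn_key_bytes) and, before initializing the crypto transforms, default any unset filename-cipher settings to the content-cipher values. A small sketch of that defaulting rule, with illustrative struct and field names (the real ones live in ecryptfs_kernel.h):

    #include <string.h>

    /* Illustrative stand-in for the mount-wide crypt stat fields used above */
    struct mount_defaults {
        char cipher[32];
        char fn_cipher[32];
        int  key_bytes;
        int  fn_key_bytes;
    };

    /* If filename encryption is enabled but no filename cipher or key size
     * was given on the mount command line, inherit the content settings. */
    static void apply_fnek_defaults(struct mount_defaults *d, int fnek_enabled,
                                    int fn_cipher_set, int fn_key_bytes_set)
    {
        if (!fnek_enabled)
            return;
        if (!fn_cipher_set) {
            strncpy(d->fn_cipher, d->cipher, sizeof(d->fn_cipher) - 1);
            d->fn_cipher[sizeof(d->fn_cipher) - 1] = '\0';
        }
        if (!fn_key_bytes_set)
            d->fn_key_bytes = d->key_bytes;
    }
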
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 6913f727624d..96ef51489e01 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c | |||
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, | |||
193 | (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); | 193 | (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); |
194 | if (!(*daemon)) { | 194 | if (!(*daemon)) { |
195 | rc = -ENOMEM; | 195 | rc = -ENOMEM; |
196 | printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " | 196 | printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " |
197 | "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); | 197 | "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); |
198 | goto out; | 198 | goto out; |
199 | } | 199 | } |
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, | |||
435 | msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); | 435 | msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); |
436 | if (!msg_ctx->msg) { | 436 | if (!msg_ctx->msg) { |
437 | rc = -ENOMEM; | 437 | rc = -ENOMEM; |
438 | printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " | 438 | printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " |
439 | "GFP_KERNEL memory\n", __func__, msg_size); | 439 | "GFP_KERNEL memory\n", __func__, msg_size); |
440 | goto unlock; | 440 | goto unlock; |
441 | } | 441 | } |
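
The messaging.c hunks above, and the miscdev.c hunks that follow, are a pure printk format fix: %Z was a nonstandard glibc-ism, while C99 defines the lowercase z length modifier for size_t/ssize_t, which is the form the kernel documents. A userspace illustration:

    #include <stdio.h>
    #include <sys/types.h>

    int main(void)
    {
        size_t sz = sizeof(double[10]);
        ssize_t ssz = -1;

        /* 'z' is the C99 length modifier for size_t (pair with u) and
         * ssize_t (pair with d); the capital-Z spelling is non-portable. */
        printf("size = %zu, signed size = %zd\n", sz, ssz);
        return 0;
    }
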
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index efd95a0ed1ea..a67fea655f49 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c | |||
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, | |||
199 | if (!msg_ctx->msg) { | 199 | if (!msg_ctx->msg) { |
200 | rc = -ENOMEM; | 200 | rc = -ENOMEM; |
201 | printk(KERN_ERR "%s: Out of memory whilst attempting " | 201 | printk(KERN_ERR "%s: Out of memory whilst attempting " |
202 | "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, | 202 | "to kmalloc(%zd, GFP_KERNEL)\n", __func__, |
203 | (sizeof(*msg_ctx->msg) + data_size)); | 203 | (sizeof(*msg_ctx->msg) + data_size)); |
204 | goto out_unlock; | 204 | goto out_unlock; |
205 | } | 205 | } |
@@ -322,7 +322,7 @@ check_list: | |||
322 | if (count < total_length) { | 322 | if (count < total_length) { |
323 | rc = 0; | 323 | rc = 0; |
324 | printk(KERN_WARNING "%s: Only given user buffer of " | 324 | printk(KERN_WARNING "%s: Only given user buffer of " |
325 | "size [%Zd], but we need [%Zd] to read the " | 325 | "size [%zd], but we need [%zd] to read the " |
326 | "pending message\n", __func__, count, total_length); | 326 | "pending message\n", __func__, count, total_length); |
327 | goto out_unlock_msg_ctx; | 327 | goto out_unlock_msg_ctx; |
328 | } | 328 | } |
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size, | |||
376 | 376 | ||
377 | if ((sizeof(*msg) + msg->data_len) != data_size) { | 377 | if ((sizeof(*msg) + msg->data_len) != data_size) { |
378 | printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " | 378 | printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " |
379 | "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, | 379 | "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, |
380 | (sizeof(*msg) + msg->data_len), data_size); | 380 | (sizeof(*msg) + msg->data_len), data_size); |
381 | rc = -EINVAL; | 381 | rc = -EINVAL; |
382 | goto out; | 382 | goto out; |
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, | |||
421 | data = kmalloc(count, GFP_KERNEL); | 421 | data = kmalloc(count, GFP_KERNEL); |
422 | if (!data) { | 422 | if (!data) { |
423 | printk(KERN_ERR "%s: Out of memory whilst attempting to " | 423 | printk(KERN_ERR "%s: Out of memory whilst attempting to " |
424 | "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); | 424 | "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); |
425 | goto out; | 425 | goto out; |
426 | } | 426 | } |
427 | rc = copy_from_user(data, buf, count); | 427 | rc = copy_from_user(data, buf, count); |
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, | |||
436 | case ECRYPTFS_MSG_RESPONSE: | 436 | case ECRYPTFS_MSG_RESPONSE: |
437 | if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { | 437 | if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { |
438 | printk(KERN_WARNING "%s: Minimum acceptable packet " | 438 | printk(KERN_WARNING "%s: Minimum acceptable packet " |
439 | "size is [%Zd], but amount of data written is " | 439 | "size is [%zd], but amount of data written is " |
440 | "only [%Zd]. Discarding response packet.\n", | 440 | "only [%zd]. Discarding response packet.\n", |
441 | __func__, | 441 | __func__, |
442 | (1 + 4 + 1 + sizeof(struct ecryptfs_message)), | 442 | (1 + 4 + 1 + sizeof(struct ecryptfs_message)), |
443 | count); | 443 | count); |
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, | |||
455 | } | 455 | } |
456 | i += packet_size_length; | 456 | i += packet_size_length; |
457 | if ((1 + 4 + packet_size_length + packet_size) != count) { | 457 | if ((1 + 4 + packet_size_length + packet_size) != count) { |
458 | printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" | 458 | printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" |
459 | " + packet_size([%Zd]))([%Zd]) != " | 459 | " + packet_size([%zd]))([%zd]) != " |
460 | "count([%Zd]). Invalid packet format.\n", | 460 | "count([%zd]). Invalid packet format.\n", |
461 | __func__, packet_size_length, packet_size, | 461 | __func__, packet_size_length, packet_size, |
462 | (1 + packet_size_length + packet_size), count); | 462 | (1 + packet_size_length + packet_size), count); |
463 | goto out_free; | 463 | goto out_free; |
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
@@ -232,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, | |||
232 | 232 | ||
233 | static int __bprm_mm_init(struct linux_binprm *bprm) | 233 | static int __bprm_mm_init(struct linux_binprm *bprm) |
234 | { | 234 | { |
235 | int err = -ENOMEM; | 235 | int err; |
236 | struct vm_area_struct *vma = NULL; | 236 | struct vm_area_struct *vma = NULL; |
237 | struct mm_struct *mm = bprm->mm; | 237 | struct mm_struct *mm = bprm->mm; |
238 | 238 | ||
239 | bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 239 | bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
240 | if (!vma) | 240 | if (!vma) |
241 | goto err; | 241 | return -ENOMEM; |
242 | 242 | ||
243 | down_write(&mm->mmap_sem); | 243 | down_write(&mm->mmap_sem); |
244 | vma->vm_mm = mm; | 244 | vma->vm_mm = mm; |
@@ -251,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm) | |||
251 | */ | 251 | */ |
252 | vma->vm_end = STACK_TOP_MAX; | 252 | vma->vm_end = STACK_TOP_MAX; |
253 | vma->vm_start = vma->vm_end - PAGE_SIZE; | 253 | vma->vm_start = vma->vm_end - PAGE_SIZE; |
254 | |||
255 | vma->vm_flags = VM_STACK_FLAGS; | 254 | vma->vm_flags = VM_STACK_FLAGS; |
256 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 255 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
257 | err = insert_vm_struct(mm, vma); | 256 | err = insert_vm_struct(mm, vma); |
258 | if (err) { | 257 | if (err) |
259 | up_write(&mm->mmap_sem); | ||
260 | goto err; | 258 | goto err; |
261 | } | ||
262 | 259 | ||
263 | mm->stack_vm = mm->total_vm = 1; | 260 | mm->stack_vm = mm->total_vm = 1; |
264 | up_write(&mm->mmap_sem); | 261 | up_write(&mm->mmap_sem); |
265 | |||
266 | bprm->p = vma->vm_end - sizeof(void *); | 262 | bprm->p = vma->vm_end - sizeof(void *); |
267 | |||
268 | return 0; | 263 | return 0; |
269 | |||
270 | err: | 264 | err: |
271 | if (vma) { | 265 | up_write(&mm->mmap_sem); |
272 | bprm->vma = NULL; | 266 | bprm->vma = NULL; |
273 | kmem_cache_free(vm_area_cachep, vma); | 267 | kmem_cache_free(vm_area_cachep, vma); |
274 | } | ||
275 | |||
276 | return err; | 268 | return err; |
277 | } | 269 | } |
278 | 270 | ||
@@ -1694,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm) | |||
1694 | return (ret >= 2) ? 2 : ret; | 1686 | return (ret >= 2) ? 2 : ret; |
1695 | } | 1687 | } |
1696 | 1688 | ||
1697 | int do_coredump(long signr, int exit_code, struct pt_regs * regs) | 1689 | void do_coredump(long signr, int exit_code, struct pt_regs *regs) |
1698 | { | 1690 | { |
1699 | struct core_state core_state; | 1691 | struct core_state core_state; |
1700 | char corename[CORENAME_MAX_SIZE + 1]; | 1692 | char corename[CORENAME_MAX_SIZE + 1]; |
@@ -1778,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) | |||
1778 | 1770 | ||
1779 | if (ispipe) { | 1771 | if (ispipe) { |
1780 | helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); | 1772 | helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); |
1773 | if (!helper_argv) { | ||
1774 | printk(KERN_WARNING "%s failed to allocate memory\n", | ||
1775 | __func__); | ||
1776 | goto fail_unlock; | ||
1777 | } | ||
1781 | /* Terminate the string before the first option */ | 1778 | /* Terminate the string before the first option */ |
1782 | delimit = strchr(corename, ' '); | 1779 | delimit = strchr(corename, ' '); |
1783 | if (delimit) | 1780 | if (delimit) |
@@ -1845,5 +1842,5 @@ fail_unlock: | |||
1845 | put_cred(cred); | 1842 | put_cred(cred); |
1846 | coredump_finish(mm); | 1843 | coredump_finish(mm); |
1847 | fail: | 1844 | fail: |
1848 | return retval; | 1845 | return; |
1849 | } | 1846 | } |
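
Three independent fixes to fs/exec.c are folded in above: __bprm_mm_init() returns -ENOMEM directly when the vma allocation fails (so the error label no longer needs a NULL check and can unconditionally drop mmap_sem), do_coredump() becomes void because no caller ever looked at its return value, and the argv_split() result in the pipe-coredump path gains a NULL check before use. The error-path shape after the first cleanup, as a compilable sketch with illustrative names:

    #include <stdlib.h>
    #include <pthread.h>

    struct ctx { pthread_mutex_t lock; void *slot; };

    /* Allocate before taking the lock, so an allocation failure returns
     * immediately; every later failure funnels through one label that
     * drops the lock and frees the allocation. */
    static int setup(struct ctx *c, int (*install)(struct ctx *, void *))
    {
        int err;
        void *r = malloc(64);

        if (!r)
            return -1;                  /* nothing to unwind yet */
        pthread_mutex_lock(&c->lock);
        err = install(c, r);
        if (err)
            goto err_unlock;
        c->slot = r;
        pthread_mutex_unlock(&c->lock);
        return 0;

    err_unlock:
        pthread_mutex_unlock(&c->lock);
        free(r);
        return err;
    }
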
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c454d5db28a5..66321a877e74 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c | |||
@@ -565,12 +565,8 @@ got: | |||
565 | inode->i_blocks = 0; | 565 | inode->i_blocks = 0; |
566 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; | 566 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; |
567 | memset(ei->i_data, 0, sizeof(ei->i_data)); | 567 | memset(ei->i_data, 0, sizeof(ei->i_data)); |
568 | ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; | 568 | ei->i_flags = |
569 | if (S_ISLNK(mode)) | 569 | ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); |
570 | ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL); | ||
571 | /* dirsync is only applied to directories */ | ||
572 | if (!S_ISDIR(mode)) | ||
573 | ei->i_flags &= ~EXT2_DIRSYNC_FL; | ||
574 | ei->i_faddr = 0; | 570 | ei->i_faddr = 0; |
575 | ei->i_frag_no = 0; | 571 | ei->i_frag_no = 0; |
576 | ei->i_frag_size = 0; | 572 | ei->i_frag_size = 0; |
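
The ialloc.c hunk above (and the matching ioctl.c hunk below) replaces open-coded flag sanitizing with an ext2_mask_flags() helper keyed on file type: directories keep every inherited flag, regular files drop directory-only flags such as DIRSYNC and TOPDIR, and everything else keeps only NODUMP/NOATIME. A sketch of that helper under those assumptions (the flag bit values match the classic ext2 on-disk flags; the exact *_FLMASK composition is the series', paraphrased here):

    #include <sys/stat.h>

    #define EXT2_NODUMP_FL   0x00000040
    #define EXT2_NOATIME_FL  0x00000080
    #define EXT2_DIRSYNC_FL  0x00010000
    #define EXT2_TOPDIR_FL   0x00020000

    /* Flags that make sense on regular files / on everything else */
    #define EXT2_REG_FLMASK   (~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL))
    #define EXT2_OTHER_FLMASK (EXT2_NODUMP_FL | EXT2_NOATIME_FL)

    static unsigned int ext2_mask_flags(mode_t mode, unsigned int flags)
    {
        if (S_ISDIR(mode))
            return flags;                       /* directories keep all */
        else if (S_ISREG(mode))
            return flags & EXT2_REG_FLMASK;
        else
            return flags & EXT2_OTHER_FLMASK;   /* symlinks, devices, ... */
    }
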
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 02b39a5deb74..23fff2f87783 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode, | |||
498 | * ext2_splice_branch - splice the allocated branch onto inode. | 498 | * ext2_splice_branch - splice the allocated branch onto inode. |
499 | * @inode: owner | 499 | * @inode: owner |
500 | * @block: (logical) number of block we are adding | 500 | * @block: (logical) number of block we are adding |
501 | * @chain: chain of indirect blocks (with a missing link - see | ||
502 | * ext2_alloc_branch) | ||
503 | * @where: location of missing link | 501 | * @where: location of missing link |
504 | * @num: number of indirect blocks we are adding | 502 | * @num: number of indirect blocks we are adding |
505 | * @blks: number of direct blocks we are adding | 503 | * @blks: number of direct blocks we are adding |
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index de876fa793e1..7cb4badef927 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c | |||
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
50 | goto setflags_out; | 50 | goto setflags_out; |
51 | } | 51 | } |
52 | 52 | ||
53 | if (!S_ISDIR(inode->i_mode)) | 53 | flags = ext2_mask_flags(inode->i_mode, flags); |
54 | flags &= ~EXT2_DIRSYNC_FL; | ||
55 | 54 | ||
56 | mutex_lock(&inode->i_mutex); | 55 | mutex_lock(&inode->i_mutex); |
57 | /* Is it quota file? Do not allow user to mess with it */ | 56 | /* Is it quota file? Do not allow user to mess with it */ |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 647cd888ac87..da8bdeaa2e6d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c | |||
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb) | |||
132 | percpu_counter_destroy(&sbi->s_dirs_counter); | 132 | percpu_counter_destroy(&sbi->s_dirs_counter); |
133 | brelse (sbi->s_sbh); | 133 | brelse (sbi->s_sbh); |
134 | sb->s_fs_info = NULL; | 134 | sb->s_fs_info = NULL; |
135 | kfree(sbi->s_blockgroup_lock); | ||
135 | kfree(sbi); | 136 | kfree(sbi); |
136 | 137 | ||
137 | return; | 138 | return; |
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) | |||
756 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 757 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
757 | if (!sbi) | 758 | if (!sbi) |
758 | return -ENOMEM; | 759 | return -ENOMEM; |
760 | |||
761 | sbi->s_blockgroup_lock = | ||
762 | kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); | ||
763 | if (!sbi->s_blockgroup_lock) { | ||
764 | kfree(sbi); | ||
765 | return -ENOMEM; | ||
766 | } | ||
759 | sb->s_fs_info = sbi; | 767 | sb->s_fs_info = sbi; |
760 | sbi->s_sb_block = sb_block; | 768 | sbi->s_sb_block = sb_block; |
761 | 769 | ||
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) | |||
983 | printk ("EXT2-fs: not enough memory\n"); | 991 | printk ("EXT2-fs: not enough memory\n"); |
984 | goto failed_mount; | 992 | goto failed_mount; |
985 | } | 993 | } |
986 | bgl_lock_init(&sbi->s_blockgroup_lock); | 994 | bgl_lock_init(sbi->s_blockgroup_lock); |
987 | sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); | 995 | sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); |
988 | if (!sbi->s_debts) { | 996 | if (!sbi->s_debts) { |
989 | printk ("EXT2-fs: not enough memory\n"); | 997 | printk ("EXT2-fs: not enough memory\n"); |
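
The super.c hunks above move s_blockgroup_lock out of struct ext2_sb_info and behind a pointer. The motivation is size: struct blockgroup_lock embeds an array of padded spinlocks scaled by NR_CPUS, so on large-SMP configs it dominates the sb_info allocation. Allocating it separately keeps sb_info small, at the cost of the extra kzalloc/kfree pairing shown in fill_super and put_super. A toy model of the size effect (lock count and padding are illustrative, not the kernel's exact layout):

    #include <stdio.h>

    #define NR_BG_LOCKS 128                /* kernel scales this with NR_CPUS */

    struct bgl_lock { int lock; char pad[60]; };  /* cacheline-ish padding */
    struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; };

    struct sb_info_small { void *sbh; struct blockgroup_lock *bgl; /* ... */ };
    struct sb_info_big   { void *sbh; struct blockgroup_lock  bgl; /* ... */ };

    int main(void)
    {
        printf("embedded: %zu bytes, behind a pointer: %zu bytes\n",
               sizeof(struct sb_info_big), sizeof(struct sb_info_small));
        return 0;
    }
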
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index c30e149fbd2e..7d215b4d4f2e 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c | |||
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) | |||
35 | 35 | ||
36 | 36 | ||
37 | /* The old legacy hash */ | 37 | /* The old legacy hash */ |
38 | static __u32 dx_hack_hash (const char *name, int len) | 38 | static __u32 dx_hack_hash_unsigned(const char *name, int len) |
39 | { | 39 | { |
40 | __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; | 40 | __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; |
41 | const unsigned char *ucp = (const unsigned char *) name; | ||
42 | |||
43 | while (len--) { | ||
44 | hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); | ||
45 | |||
46 | if (hash & 0x80000000) | ||
47 | hash -= 0x7fffffff; | ||
48 | hash1 = hash0; | ||
49 | hash0 = hash; | ||
50 | } | ||
51 | return hash0 << 1; | ||
52 | } | ||
53 | |||
54 | static __u32 dx_hack_hash_signed(const char *name, int len) | ||
55 | { | ||
56 | __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; | ||
57 | const signed char *scp = (const signed char *) name; | ||
58 | |||
41 | while (len--) { | 59 | while (len--) { |
42 | __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); | 60 | hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); |
43 | 61 | ||
44 | if (hash & 0x80000000) hash -= 0x7fffffff; | 62 | if (hash & 0x80000000) |
63 | hash -= 0x7fffffff; | ||
45 | hash1 = hash0; | 64 | hash1 = hash0; |
46 | hash0 = hash; | 65 | hash0 = hash; |
47 | } | 66 | } |
48 | return (hash0 << 1); | 67 | return hash0 << 1; |
49 | } | 68 | } |
50 | 69 | ||
51 | static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) | 70 | static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) |
52 | { | 71 | { |
53 | __u32 pad, val; | 72 | __u32 pad, val; |
54 | int i; | 73 | int i; |
74 | const signed char *scp = (const signed char *) msg; | ||
75 | |||
76 | pad = (__u32)len | ((__u32)len << 8); | ||
77 | pad |= pad << 16; | ||
78 | |||
79 | val = pad; | ||
80 | if (len > num*4) | ||
81 | len = num * 4; | ||
82 | for (i = 0; i < len; i++) { | ||
83 | if ((i % 4) == 0) | ||
84 | val = pad; | ||
85 | val = ((int) scp[i]) + (val << 8); | ||
86 | if ((i % 4) == 3) { | ||
87 | *buf++ = val; | ||
88 | val = pad; | ||
89 | num--; | ||
90 | } | ||
91 | } | ||
92 | if (--num >= 0) | ||
93 | *buf++ = val; | ||
94 | while (--num >= 0) | ||
95 | *buf++ = pad; | ||
96 | } | ||
97 | |||
98 | static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) | ||
99 | { | ||
100 | __u32 pad, val; | ||
101 | int i; | ||
102 | const unsigned char *ucp = (const unsigned char *) msg; | ||
55 | 103 | ||
56 | pad = (__u32)len | ((__u32)len << 8); | 104 | pad = (__u32)len | ((__u32)len << 8); |
57 | pad |= pad << 16; | 105 | pad |= pad << 16; |
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) | |||
62 | for (i=0; i < len; i++) { | 110 | for (i=0; i < len; i++) { |
63 | if ((i % 4) == 0) | 111 | if ((i % 4) == 0) |
64 | val = pad; | 112 | val = pad; |
65 | val = msg[i] + (val << 8); | 113 | val = ((int) ucp[i]) + (val << 8); |
66 | if ((i % 4) == 3) { | 114 | if ((i % 4) == 3) { |
67 | *buf++ = val; | 115 | *buf++ = val; |
68 | val = pad; | 116 | val = pad; |
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | |||
95 | const char *p; | 143 | const char *p; |
96 | int i; | 144 | int i; |
97 | __u32 in[8], buf[4]; | 145 | __u32 in[8], buf[4]; |
146 | void (*str2hashbuf)(const char *, int, __u32 *, int) = | ||
147 | str2hashbuf_signed; | ||
98 | 148 | ||
99 | /* Initialize the default seed for the hash checksum functions */ | 149 | /* Initialize the default seed for the hash checksum functions */ |
100 | buf[0] = 0x67452301; | 150 | buf[0] = 0x67452301; |
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | |||
113 | } | 163 | } |
114 | 164 | ||
115 | switch (hinfo->hash_version) { | 165 | switch (hinfo->hash_version) { |
166 | case DX_HASH_LEGACY_UNSIGNED: | ||
167 | hash = dx_hack_hash_unsigned(name, len); | ||
168 | break; | ||
116 | case DX_HASH_LEGACY: | 169 | case DX_HASH_LEGACY: |
117 | hash = dx_hack_hash(name, len); | 170 | hash = dx_hack_hash_signed(name, len); |
118 | break; | 171 | break; |
172 | case DX_HASH_HALF_MD4_UNSIGNED: | ||
173 | str2hashbuf = str2hashbuf_unsigned; | ||
119 | case DX_HASH_HALF_MD4: | 174 | case DX_HASH_HALF_MD4: |
120 | p = name; | 175 | p = name; |
121 | while (len > 0) { | 176 | while (len > 0) { |
122 | str2hashbuf(p, len, in, 8); | 177 | (*str2hashbuf)(p, len, in, 8); |
123 | half_md4_transform(buf, in); | 178 | half_md4_transform(buf, in); |
124 | len -= 32; | 179 | len -= 32; |
125 | p += 32; | 180 | p += 32; |
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | |||
127 | minor_hash = buf[2]; | 182 | minor_hash = buf[2]; |
128 | hash = buf[1]; | 183 | hash = buf[1]; |
129 | break; | 184 | break; |
185 | case DX_HASH_TEA_UNSIGNED: | ||
186 | str2hashbuf = str2hashbuf_unsigned; | ||
130 | case DX_HASH_TEA: | 187 | case DX_HASH_TEA: |
131 | p = name; | 188 | p = name; |
132 | while (len > 0) { | 189 | while (len > 0) { |
133 | str2hashbuf(p, len, in, 4); | 190 | (*str2hashbuf)(p, len, in, 4); |
134 | TEA_transform(buf, in); | 191 | TEA_transform(buf, in); |
135 | len -= 16; | 192 | len -= 16; |
136 | p += 16; | 193 | p += 16; |
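
The hash.c rework above exists because the legacy htree hashes fed plain char into arithmetic: on x86 char is signed, on ARM and PowerPC it is unsigned, so any filename byte >= 0x80 hashed differently across architectures and a directory index written on one could be unreadable on the other. The patch pins both behaviors down as explicit *_signed/*_unsigned variants and (in the super.c hunk further down) records in the superblock which flavor a filesystem uses. A compilable demo of the divergence, mirroring dx_hack_hash (the multiply deliberately matches the kernel code, which tolerates signed wrap-around):

    #include <stdio.h>

    typedef unsigned int u32;

    static u32 dx_hack_hash(const char *name, int len)
    {
        u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;

        while (len--) {
            /* Plain 'char' sign-extends on x86 but not on ARM/PowerPC,
             * so *name feeds a different int into the multiply. */
            hash = hash1 + (hash0 ^ (*name++ * 7152373));
            if (hash & 0x80000000)
                hash -= 0x7fffffff;
            hash1 = hash0;
            hash0 = hash;
        }
        return hash0 << 1;
    }

    int main(void)
    {
        const char name[] = { (char)0xc3, (char)0xa9 };  /* UTF-8 "é" */

        /* Build once with -fsigned-char and once with -funsigned-char
         * to see the two values the patch distinguishes. */
        printf("hash = %08x\n", dx_hack_hash(name, 2));
        return 0;
    }
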
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 5655fbcbd11f..8de6c720e510 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c | |||
@@ -559,12 +559,8 @@ got: | |||
559 | ei->i_dir_start_lookup = 0; | 559 | ei->i_dir_start_lookup = 0; |
560 | ei->i_disksize = 0; | 560 | ei->i_disksize = 0; |
561 | 561 | ||
562 | ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; | 562 | ei->i_flags = |
563 | if (S_ISLNK(mode)) | 563 | ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); |
564 | ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); | ||
565 | /* dirsync only applies to directories */ | ||
566 | if (!S_ISDIR(mode)) | ||
567 | ei->i_flags &= ~EXT3_DIRSYNC_FL; | ||
568 | #ifdef EXT3_FRAGMENTS | 564 | #ifdef EXT3_FRAGMENTS |
569 | ei->i_faddr = 0; | 565 | ei->i_faddr = 0; |
570 | ei->i_frag_no = 0; | 566 | ei->i_frag_no = 0; |
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index b7394d05ee8e..5e86ce9a86e0 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c | |||
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, | |||
53 | goto flags_out; | 53 | goto flags_out; |
54 | } | 54 | } |
55 | 55 | ||
56 | if (!S_ISDIR(inode->i_mode)) | 56 | flags = ext3_mask_flags(inode->i_mode, flags); |
57 | flags &= ~EXT3_DIRSYNC_FL; | ||
58 | 57 | ||
59 | mutex_lock(&inode->i_mutex); | 58 | mutex_lock(&inode->i_mutex); |
60 | /* Is it quota file? Do not allow user to mess with it */ | 59 | /* Is it quota file? Do not allow user to mess with it */ |
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1dd2abe6313e..69a3d19ca9fd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c | |||
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle, | |||
74 | #define assert(test) J_ASSERT(test) | 74 | #define assert(test) J_ASSERT(test) |
75 | #endif | 75 | #endif |
76 | 76 | ||
77 | #ifndef swap | ||
78 | #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) | ||
79 | #endif | ||
80 | |||
81 | #ifdef DX_DEBUG | 77 | #ifdef DX_DEBUG |
82 | #define dxtrace(command) command | 78 | #define dxtrace(command) command |
83 | #else | 79 | #else |
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir, | |||
368 | goto fail; | 364 | goto fail; |
369 | } | 365 | } |
370 | hinfo->hash_version = root->info.hash_version; | 366 | hinfo->hash_version = root->info.hash_version; |
367 | if (hinfo->hash_version <= DX_HASH_TEA) | ||
368 | hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; | ||
371 | hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; | 369 | hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; |
372 | if (entry) | 370 | if (entry) |
373 | ext3fs_dirhash(entry->name, entry->len, hinfo); | 371 | ext3fs_dirhash(entry->name, entry->len, hinfo); |
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, | |||
636 | dir = dir_file->f_path.dentry->d_inode; | 634 | dir = dir_file->f_path.dentry->d_inode; |
637 | if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { | 635 | if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { |
638 | hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; | 636 | hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; |
637 | if (hinfo.hash_version <= DX_HASH_TEA) | ||
638 | hinfo.hash_version += | ||
639 | EXT3_SB(dir->i_sb)->s_hash_unsigned; | ||
639 | hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; | 640 | hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; |
640 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, | 641 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, |
641 | start_hash, start_minor_hash); | 642 | start_hash, start_minor_hash); |
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | |||
1156 | u32 hash2; | 1157 | u32 hash2; |
1157 | struct dx_map_entry *map; | 1158 | struct dx_map_entry *map; |
1158 | char *data1 = (*bh)->b_data, *data2; | 1159 | char *data1 = (*bh)->b_data, *data2; |
1159 | unsigned split, move, size, i; | 1160 | unsigned split, move, size; |
1160 | struct ext3_dir_entry_2 *de = NULL, *de2; | 1161 | struct ext3_dir_entry_2 *de = NULL, *de2; |
1161 | int err = 0; | 1162 | int err = 0, i; |
1162 | 1163 | ||
1163 | bh2 = ext3_append (handle, dir, &newblock, &err); | 1164 | bh2 = ext3_append (handle, dir, &newblock, &err); |
1164 | if (!(bh2)) { | 1165 | if (!(bh2)) { |
@@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | |||
1398 | 1399 | ||
1399 | /* Initialize as for dx_probe */ | 1400 | /* Initialize as for dx_probe */ |
1400 | hinfo.hash_version = root->info.hash_version; | 1401 | hinfo.hash_version = root->info.hash_version; |
1402 | if (hinfo.hash_version <= DX_HASH_TEA) | ||
1403 | hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; | ||
1401 | hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; | 1404 | hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; |
1402 | ext3fs_dirhash(name, namelen, &hinfo); | 1405 | ext3fs_dirhash(name, namelen, &hinfo); |
1403 | frame = frames; | 1406 | frame = frames; |
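
The namei.c fixups above all follow one pattern: after reading a hash version from disk, bump it by s_hash_unsigned (0 or 3). That only works because the unsigned variants are assigned codes exactly three past their signed twins, as in this sketch of the numbering used by the series:

    /* Hash version codes; the unsigned variant of code N is N + 3,
     * so "version += s_hash_unsigned" selects the right pair member. */
    #define DX_HASH_LEGACY             0
    #define DX_HASH_HALF_MD4           1
    #define DX_HASH_TEA                2
    #define DX_HASH_LEGACY_UNSIGNED    3
    #define DX_HASH_HALF_MD4_UNSIGNED  4
    #define DX_HASH_TEA_UNSIGNED       5

    static int effective_hash_version(int on_disk, int s_hash_unsigned)
    {
        if (on_disk <= DX_HASH_TEA)      /* only signed codes are stored */
            return on_disk + s_hash_unsigned;
        return on_disk;
    }
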
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c22d01467bd1..b70d90e08a3c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
@@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *, | |||
48 | unsigned long journal_devnum); | 48 | unsigned long journal_devnum); |
49 | static int ext3_create_journal(struct super_block *, struct ext3_super_block *, | 49 | static int ext3_create_journal(struct super_block *, struct ext3_super_block *, |
50 | unsigned int); | 50 | unsigned int); |
51 | static void ext3_commit_super (struct super_block * sb, | 51 | static int ext3_commit_super(struct super_block *sb, |
52 | struct ext3_super_block * es, | 52 | struct ext3_super_block *es, |
53 | int sync); | 53 | int sync); |
54 | static void ext3_mark_recovery_complete(struct super_block * sb, | 54 | static void ext3_mark_recovery_complete(struct super_block * sb, |
55 | struct ext3_super_block * es); | 55 | struct ext3_super_block * es); |
@@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, | |||
60 | char nbuf[16]); | 60 | char nbuf[16]); |
61 | static int ext3_remount (struct super_block * sb, int * flags, char * data); | 61 | static int ext3_remount (struct super_block * sb, int * flags, char * data); |
62 | static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); | 62 | static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); |
63 | static void ext3_unlockfs(struct super_block *sb); | 63 | static int ext3_unfreeze(struct super_block *sb); |
64 | static void ext3_write_super (struct super_block * sb); | 64 | static void ext3_write_super (struct super_block * sb); |
65 | static void ext3_write_super_lockfs(struct super_block *sb); | 65 | static int ext3_freeze(struct super_block *sb); |
66 | 66 | ||
67 | /* | 67 | /* |
68 | * Wrappers for journal_start/end. | 68 | * Wrappers for journal_start/end. |
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb) | |||
439 | ext3_blkdev_remove(sbi); | 439 | ext3_blkdev_remove(sbi); |
440 | } | 440 | } |
441 | sb->s_fs_info = NULL; | 441 | sb->s_fs_info = NULL; |
442 | kfree(sbi->s_blockgroup_lock); | ||
442 | kfree(sbi); | 443 | kfree(sbi); |
443 | return; | 444 | return; |
444 | } | 445 | } |
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, | |||
682 | ext3_nfs_get_inode); | 683 | ext3_nfs_get_inode); |
683 | } | 684 | } |
684 | 685 | ||
686 | /* | ||
687 | * Try to release metadata pages (indirect blocks, directories) which are | ||
688 | * mapped via the block device. Since these pages could have journal heads | ||
689 | * which would prevent try_to_free_buffers() from freeing them, we must use | ||
690 | * jbd layer's try_to_free_buffers() function to release them. | ||
691 | */ | ||
692 | static int bdev_try_to_free_page(struct super_block *sb, struct page *page, | ||
693 | gfp_t wait) | ||
694 | { | ||
695 | journal_t *journal = EXT3_SB(sb)->s_journal; | ||
696 | |||
697 | WARN_ON(PageChecked(page)); | ||
698 | if (!page_has_buffers(page)) | ||
699 | return 0; | ||
700 | if (journal) | ||
701 | return journal_try_to_free_buffers(journal, page, | ||
702 | wait & ~__GFP_WAIT); | ||
703 | return try_to_free_buffers(page); | ||
704 | } | ||
705 | |||
685 | #ifdef CONFIG_QUOTA | 706 | #ifdef CONFIG_QUOTA |
686 | #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") | 707 | #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") |
687 | #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) | 708 | #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) |
@@ -738,8 +759,8 @@ static const struct super_operations ext3_sops = { | |||
738 | .put_super = ext3_put_super, | 759 | .put_super = ext3_put_super, |
739 | .write_super = ext3_write_super, | 760 | .write_super = ext3_write_super, |
740 | .sync_fs = ext3_sync_fs, | 761 | .sync_fs = ext3_sync_fs, |
741 | .write_super_lockfs = ext3_write_super_lockfs, | 762 | .freeze_fs = ext3_freeze, |
742 | .unlockfs = ext3_unlockfs, | 763 | .unfreeze_fs = ext3_unfreeze, |
743 | .statfs = ext3_statfs, | 764 | .statfs = ext3_statfs, |
744 | .remount_fs = ext3_remount, | 765 | .remount_fs = ext3_remount, |
745 | .clear_inode = ext3_clear_inode, | 766 | .clear_inode = ext3_clear_inode, |
@@ -748,6 +769,7 @@ static const struct super_operations ext3_sops = { | |||
748 | .quota_read = ext3_quota_read, | 769 | .quota_read = ext3_quota_read, |
749 | .quota_write = ext3_quota_write, | 770 | .quota_write = ext3_quota_write, |
750 | #endif | 771 | #endif |
772 | .bdev_try_to_free_page = bdev_try_to_free_page, | ||
751 | }; | 773 | }; |
752 | 774 | ||
753 | static const struct export_operations ext3_export_ops = { | 775 | static const struct export_operations ext3_export_ops = { |
@@ -1546,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
1546 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 1568 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
1547 | if (!sbi) | 1569 | if (!sbi) |
1548 | return -ENOMEM; | 1570 | return -ENOMEM; |
1571 | |||
1572 | sbi->s_blockgroup_lock = | ||
1573 | kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); | ||
1574 | if (!sbi->s_blockgroup_lock) { | ||
1575 | kfree(sbi); | ||
1576 | return -ENOMEM; | ||
1577 | } | ||
1549 | sb->s_fs_info = sbi; | 1578 | sb->s_fs_info = sbi; |
1550 | sbi->s_mount_opt = 0; | 1579 | sbi->s_mount_opt = 0; |
1551 | sbi->s_resuid = EXT3_DEF_RESUID; | 1580 | sbi->s_resuid = EXT3_DEF_RESUID; |
@@ -1742,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
1742 | for (i=0; i < 4; i++) | 1771 | for (i=0; i < 4; i++) |
1743 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | 1772 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); |
1744 | sbi->s_def_hash_version = es->s_def_hash_version; | 1773 | sbi->s_def_hash_version = es->s_def_hash_version; |
1774 | i = le32_to_cpu(es->s_flags); | ||
1775 | if (i & EXT2_FLAGS_UNSIGNED_HASH) | ||
1776 | sbi->s_hash_unsigned = 3; | ||
1777 | else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { | ||
1778 | #ifdef __CHAR_UNSIGNED__ | ||
1779 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); | ||
1780 | sbi->s_hash_unsigned = 3; | ||
1781 | #else | ||
1782 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); | ||
1783 | #endif | ||
1784 | sb->s_dirt = 1; | ||
1785 | } | ||
1745 | 1786 | ||
1746 | if (sbi->s_blocks_per_group > blocksize * 8) { | 1787 | if (sbi->s_blocks_per_group > blocksize * 8) { |
1747 | printk (KERN_ERR | 1788 | printk (KERN_ERR |
@@ -1786,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
1786 | goto failed_mount; | 1827 | goto failed_mount; |
1787 | } | 1828 | } |
1788 | 1829 | ||
1789 | bgl_lock_init(&sbi->s_blockgroup_lock); | 1830 | bgl_lock_init(sbi->s_blockgroup_lock); |
1790 | 1831 | ||
1791 | for (i = 0; i < db_count; i++) { | 1832 | for (i = 0; i < db_count; i++) { |
1792 | block = descriptor_loc(sb, logic_sb_block, i); | 1833 | block = descriptor_loc(sb, logic_sb_block, i); |
@@ -2270,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb, | |||
2270 | return 0; | 2311 | return 0; |
2271 | } | 2312 | } |
2272 | 2313 | ||
2273 | static void ext3_commit_super (struct super_block * sb, | 2314 | static int ext3_commit_super(struct super_block *sb, |
2274 | struct ext3_super_block * es, | 2315 | struct ext3_super_block *es, |
2275 | int sync) | 2316 | int sync) |
2276 | { | 2317 | { |
2277 | struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; | 2318 | struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; |
2319 | int error = 0; | ||
2278 | 2320 | ||
2279 | if (!sbh) | 2321 | if (!sbh) |
2280 | return; | 2322 | return error; |
2281 | es->s_wtime = cpu_to_le32(get_seconds()); | 2323 | es->s_wtime = cpu_to_le32(get_seconds()); |
2282 | es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); | 2324 | es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); |
2283 | es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); | 2325 | es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); |
2284 | BUFFER_TRACE(sbh, "marking dirty"); | 2326 | BUFFER_TRACE(sbh, "marking dirty"); |
2285 | mark_buffer_dirty(sbh); | 2327 | mark_buffer_dirty(sbh); |
2286 | if (sync) | 2328 | if (sync) |
2287 | sync_dirty_buffer(sbh); | 2329 | error = sync_dirty_buffer(sbh); |
2330 | return error; | ||
2288 | } | 2331 | } |
2289 | 2332 | ||
2290 | 2333 | ||
@@ -2398,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait) | |||
2398 | * LVM calls this function before a (read-only) snapshot is created. This | 2441 | * LVM calls this function before a (read-only) snapshot is created. This |
2399 | * gives us a chance to flush the journal completely and mark the fs clean. | 2442 | * gives us a chance to flush the journal completely and mark the fs clean. |
2400 | */ | 2443 | */ |
2401 | static void ext3_write_super_lockfs(struct super_block *sb) | 2444 | static int ext3_freeze(struct super_block *sb) |
2402 | { | 2445 | { |
2446 | int error = 0; | ||
2447 | journal_t *journal; | ||
2403 | sb->s_dirt = 0; | 2448 | sb->s_dirt = 0; |
2404 | 2449 | ||
2405 | if (!(sb->s_flags & MS_RDONLY)) { | 2450 | if (!(sb->s_flags & MS_RDONLY)) { |
2406 | journal_t *journal = EXT3_SB(sb)->s_journal; | 2451 | journal = EXT3_SB(sb)->s_journal; |
2407 | 2452 | ||
2408 | /* Now we set up the journal barrier. */ | 2453 | /* Now we set up the journal barrier. */ |
2409 | journal_lock_updates(journal); | 2454 | journal_lock_updates(journal); |
@@ -2412,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb) | |||
2412 | * We don't want to clear needs_recovery flag when we failed | 2457 | * We don't want to clear needs_recovery flag when we failed |
2413 | * to flush the journal. | 2458 | * to flush the journal. |
2414 | */ | 2459 | */ |
2415 | if (journal_flush(journal) < 0) | 2460 | error = journal_flush(journal); |
2416 | return; | 2461 | if (error < 0) |
2462 | goto out; | ||
2417 | 2463 | ||
2418 | /* Journal blocked and flushed, clear needs_recovery flag. */ | 2464 | /* Journal blocked and flushed, clear needs_recovery flag. */ |
2419 | EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); | 2465 | EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); |
2420 | ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); | 2466 | error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); |
2467 | if (error) | ||
2468 | goto out; | ||
2421 | } | 2469 | } |
2470 | return 0; | ||
2471 | |||
2472 | out: | ||
2473 | journal_unlock_updates(journal); | ||
2474 | return error; | ||
2422 | } | 2475 | } |
2423 | 2476 | ||
2424 | /* | 2477 | /* |
2425 | * Called by LVM after the snapshot is done. We need to reset the RECOVER | 2478 | * Called by LVM after the snapshot is done. We need to reset the RECOVER |
2426 | * flag here, even though the filesystem is not technically dirty yet. | 2479 | * flag here, even though the filesystem is not technically dirty yet. |
2427 | */ | 2480 | */ |
2428 | static void ext3_unlockfs(struct super_block *sb) | 2481 | static int ext3_unfreeze(struct super_block *sb) |
2429 | { | 2482 | { |
2430 | if (!(sb->s_flags & MS_RDONLY)) { | 2483 | if (!(sb->s_flags & MS_RDONLY)) { |
2431 | lock_super(sb); | 2484 | lock_super(sb); |
@@ -2435,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb) | |||
2435 | unlock_super(sb); | 2488 | unlock_super(sb); |
2436 | journal_unlock_updates(EXT3_SB(sb)->s_journal); | 2489 | journal_unlock_updates(EXT3_SB(sb)->s_journal); |
2437 | } | 2490 | } |
2491 | return 0; | ||
2438 | } | 2492 | } |
2439 | 2493 | ||
2440 | static int ext3_remount (struct super_block * sb, int * flags, char * data) | 2494 | static int ext3_remount (struct super_block * sb, int * flags, char * data) |
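
The super.c changes above rename write_super_lockfs/unlockfs to freeze_fs/unfreeze_fs and give both an int return, so a failed journal_flush() or superblock commit now propagates out of the freeze path instead of leaving the caller with a silently unflushed filesystem. The resulting calling shape, sketched with placeholder types (the real ops take struct super_block):

    struct sb;

    struct super_ops_sketch {
        int (*freeze_fs)(struct sb *);    /* flush journal, block writers */
        int (*unfreeze_fs)(struct sb *);  /* allow writers again */
    };

    /* e.g. what an LVM-style snapshot sequence can now observe */
    static int snapshot(struct super_ops_sketch *ops, struct sb *sb,
                        int (*take_snapshot)(struct sb *))
    {
        int err = ops->freeze_fs(sb);

        if (err)
            return err;          /* previously this failure was invisible */
        err = take_snapshot(sb);
        ops->unfreeze_fs(sb);
        return err;
    }
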
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 38b3acf5683b..6bba06b09dd1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include "ext4.h" | 20 | #include "ext4.h" |
21 | #include "ext4_jbd2.h" | 21 | #include "ext4_jbd2.h" |
22 | #include "group.h" | 22 | #include "group.h" |
23 | #include "mballoc.h" | ||
23 | 24 | ||
24 | /* | 25 | /* |
25 | * balloc.c contains the blocks allocation and deallocation routines | 26 | * balloc.c contains the blocks allocation and deallocation routines |
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
100 | * essentially implementing a per-group read-only flag. */ | 101 | * essentially implementing a per-group read-only flag. */ |
101 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { | 102 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { |
102 | ext4_error(sb, __func__, | 103 | ext4_error(sb, __func__, |
103 | "Checksum bad for group %lu\n", block_group); | 104 | "Checksum bad for group %u", block_group); |
104 | gdp->bg_free_blocks_count = 0; | 105 | ext4_free_blks_set(sb, gdp, 0); |
105 | gdp->bg_free_inodes_count = 0; | 106 | ext4_free_inodes_set(sb, gdp, 0); |
106 | gdp->bg_itable_unused = 0; | 107 | ext4_itable_unused_set(sb, gdp, 0); |
107 | memset(bh->b_data, 0xff, sb->s_blocksize); | 108 | memset(bh->b_data, 0xff, sb->s_blocksize); |
108 | return 0; | 109 | return 0; |
109 | } | 110 | } |
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, | |||
205 | ext4_group_t block_group, | 206 | ext4_group_t block_group, |
206 | struct buffer_head **bh) | 207 | struct buffer_head **bh) |
207 | { | 208 | { |
208 | unsigned long group_desc; | 209 | unsigned int group_desc; |
209 | unsigned long offset; | 210 | unsigned int offset; |
210 | struct ext4_group_desc *desc; | 211 | struct ext4_group_desc *desc; |
211 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 212 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
212 | 213 | ||
213 | if (block_group >= sbi->s_groups_count) { | 214 | if (block_group >= sbi->s_groups_count) { |
214 | ext4_error(sb, "ext4_get_group_desc", | 215 | ext4_error(sb, "ext4_get_group_desc", |
215 | "block_group >= groups_count - " | 216 | "block_group >= groups_count - " |
216 | "block_group = %lu, groups_count = %lu", | 217 | "block_group = %u, groups_count = %u", |
217 | block_group, sbi->s_groups_count); | 218 | block_group, sbi->s_groups_count); |
218 | 219 | ||
219 | return NULL; | 220 | return NULL; |
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, | |||
225 | if (!sbi->s_group_desc[group_desc]) { | 226 | if (!sbi->s_group_desc[group_desc]) { |
226 | ext4_error(sb, "ext4_get_group_desc", | 227 | ext4_error(sb, "ext4_get_group_desc", |
227 | "Group descriptor not loaded - " | 228 | "Group descriptor not loaded - " |
228 | "block_group = %lu, group_desc = %lu, desc = %lu", | 229 | "block_group = %u, group_desc = %u, desc = %u", |
229 | block_group, group_desc, offset); | 230 | block_group, group_desc, offset); |
230 | return NULL; | 231 | return NULL; |
231 | } | 232 | } |
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
315 | if (unlikely(!bh)) { | 316 | if (unlikely(!bh)) { |
316 | ext4_error(sb, __func__, | 317 | ext4_error(sb, __func__, |
317 | "Cannot read block bitmap - " | 318 | "Cannot read block bitmap - " |
318 | "block_group = %lu, block_bitmap = %llu", | 319 | "block_group = %u, block_bitmap = %llu", |
319 | block_group, bitmap_blk); | 320 | block_group, bitmap_blk); |
320 | return NULL; | 321 | return NULL; |
321 | } | 322 | } |
322 | if (buffer_uptodate(bh) && | 323 | |
323 | !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) | 324 | if (bitmap_uptodate(bh)) |
324 | return bh; | 325 | return bh; |
325 | 326 | ||
326 | lock_buffer(bh); | 327 | lock_buffer(bh); |
328 | if (bitmap_uptodate(bh)) { | ||
329 | unlock_buffer(bh); | ||
330 | return bh; | ||
331 | } | ||
327 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 332 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); |
328 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 333 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
329 | ext4_init_block_bitmap(sb, bh, block_group, desc); | 334 | ext4_init_block_bitmap(sb, bh, block_group, desc); |
335 | set_bitmap_uptodate(bh); | ||
330 | set_buffer_uptodate(bh); | 336 | set_buffer_uptodate(bh); |
331 | unlock_buffer(bh); | ||
332 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 337 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); |
338 | unlock_buffer(bh); | ||
333 | return bh; | 339 | return bh; |
334 | } | 340 | } |
335 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 341 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); |
342 | if (buffer_uptodate(bh)) { | ||
343 | /* | ||
 344 | * if the group is not uninit and bh is uptodate, | ||
 345 | * the bitmap is also uptodate | ||
346 | */ | ||
347 | set_bitmap_uptodate(bh); | ||
348 | unlock_buffer(bh); | ||
349 | return bh; | ||
350 | } | ||
351 | /* | ||
352 | * submit the buffer_head for read. We can | ||
353 | * safely mark the bitmap as uptodate now. | ||
354 | * We do it here so the bitmap uptodate bit | ||
 355 | * gets set with the buffer lock held. | ||
 356 | */ | ||
 357 | set_bitmap_uptodate(bh); | ||
357 | set_bitmap_uptodate(bh); | ||
336 | if (bh_submit_read(bh) < 0) { | 358 | if (bh_submit_read(bh) < 0) { |
337 | put_bh(bh); | 359 | put_bh(bh); |
338 | ext4_error(sb, __func__, | 360 | ext4_error(sb, __func__, |
339 | "Cannot read block bitmap - " | 361 | "Cannot read block bitmap - " |
340 | "block_group = %lu, block_bitmap = %llu", | 362 | "block_group = %u, block_bitmap = %llu", |
341 | block_group, bitmap_blk); | 363 | block_group, bitmap_blk); |
342 | return NULL; | 364 | return NULL; |
343 | } | 365 | } |
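
The ext4_read_block_bitmap() hunk above needs a second notion of "uptodate": with uninitialized block groups the buffer contents may still have to be synthesized even though the block layer has marked the buffer uptodate, so the series tracks a separate bitmap-uptodate bit that is only set under the buffer lock. A sketch of such a paired state test (the helper names follow the patch; the bit layout here is illustrative, not the kernel's buffer_head encoding):

    #define BH_UPTODATE        (1UL << 0)   /* block layer read the buffer */
    #define BH_BITMAP_UPTODATE (1UL << 1)   /* bitmap contents valid too   */

    struct buf { unsigned long state; };

    static int bitmap_uptodate(const struct buf *bh)
    {
        /* block-layer uptodate alone is not enough for a bitmap */
        return (bh->state & BH_UPTODATE) && (bh->state & BH_BITMAP_UPTODATE);
    }

    static void set_bitmap_uptodate(struct buf *bh)
    {
        bh->state |= BH_BITMAP_UPTODATE;    /* caller holds the buffer lock */
    }
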
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
350 | } | 372 | } |
351 | 373 | ||
352 | /** | 374 | /** |
353 | * ext4_free_blocks_sb() -- Free given blocks and update quota | 375 | * ext4_add_groupblocks() -- Add given blocks to an existing group |
354 | * @handle: handle to this transaction | 376 | * @handle: handle to this transaction |
355 | * @sb: super block | 377 | * @sb: super block |
356 | * @block: start physical block to free | 378 | * @block: start physical block to add to the block group |
357 | * @count: number of blocks to free | 379 | * @count: number of blocks to free |
358 | * @pdquot_freed_blocks: pointer to quota | ||
359 | * | 380 | * |
360 | * XXX This function is only used by the on-line resizing code, which | 381 | * This marks the blocks as free in the bitmap. We ask |
361 | * should probably be fixed up to call the mballoc variant. There | 382 | * mballoc to reload the buddy after this by setting the group's |
362 | * this needs to be cleaned up later; in fact, I'm not convinced this | 383 | * EXT4_GROUP_INFO_NEED_INIT_BIT flag |
363 | * is 100% correct in the face of the mballoc code. The online resizing | ||
364 | * code needs to be fixed up to more tightly (and correctly) interlock | ||
365 | * with the mballoc code. | ||
366 | */ | 384 | */ |
367 | void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, | 385 | void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, |
368 | ext4_fsblk_t block, unsigned long count, | 386 | ext4_fsblk_t block, unsigned long count) |
369 | unsigned long *pdquot_freed_blocks) | ||
370 | { | 387 | { |
371 | struct buffer_head *bitmap_bh = NULL; | 388 | struct buffer_head *bitmap_bh = NULL; |
372 | struct buffer_head *gd_bh; | 389 | struct buffer_head *gd_bh; |
373 | ext4_group_t block_group; | 390 | ext4_group_t block_group; |
374 | ext4_grpblk_t bit; | 391 | ext4_grpblk_t bit; |
375 | unsigned long i; | 392 | unsigned int i; |
376 | unsigned long overflow; | ||
377 | struct ext4_group_desc *desc; | 393 | struct ext4_group_desc *desc; |
378 | struct ext4_super_block *es; | 394 | struct ext4_super_block *es; |
379 | struct ext4_sb_info *sbi; | 395 | struct ext4_sb_info *sbi; |
380 | int err = 0, ret; | 396 | int err = 0, ret, blk_free_count; |
381 | ext4_grpblk_t group_freed; | 397 | ext4_grpblk_t blocks_freed; |
398 | struct ext4_group_info *grp; | ||
382 | 399 | ||
383 | *pdquot_freed_blocks = 0; | ||
384 | sbi = EXT4_SB(sb); | 400 | sbi = EXT4_SB(sb); |
385 | es = sbi->s_es; | 401 | es = sbi->s_es; |
386 | if (block < le32_to_cpu(es->s_first_data_block) || | 402 | ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); |
387 | block + count < block || | ||
388 | block + count > ext4_blocks_count(es)) { | ||
389 | ext4_error(sb, "ext4_free_blocks", | ||
390 | "Freeing blocks not in datazone - " | ||
391 | "block = %llu, count = %lu", block, count); | ||
392 | goto error_return; | ||
393 | } | ||
394 | |||
395 | ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1); | ||
396 | 403 | ||
397 | do_more: | ||
398 | overflow = 0; | ||
399 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); | 404 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); |
405 | grp = ext4_get_group_info(sb, block_group); | ||
400 | /* | 406 | /* |
401 | * Check to see if we are freeing blocks across a group | 407 | * Check to see if we are adding blocks across a group |
402 | * boundary. | 408 | * boundary. |
403 | */ | 409 | */ |
404 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { | 410 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { |
405 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); | 411 | goto error_return; |
406 | count -= overflow; | ||
407 | } | 412 | } |
408 | brelse(bitmap_bh); | ||
409 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); | 413 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
410 | if (!bitmap_bh) | 414 | if (!bitmap_bh) |
411 | goto error_return; | 415 | goto error_return; |
@@ -418,18 +422,17 @@ do_more: | |||
418 | in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || | 422 | in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || |
419 | in_range(block + count - 1, ext4_inode_table(sb, desc), | 423 | in_range(block + count - 1, ext4_inode_table(sb, desc), |
420 | sbi->s_itb_per_group)) { | 424 | sbi->s_itb_per_group)) { |
421 | ext4_error(sb, "ext4_free_blocks", | 425 | ext4_error(sb, __func__, |
422 | "Freeing blocks in system zones - " | 426 | "Adding blocks in system zones - " |
423 | "Block = %llu, count = %lu", | 427 | "Block = %llu, count = %lu", |
424 | block, count); | 428 | block, count); |
425 | goto error_return; | 429 | goto error_return; |
426 | } | 430 | } |
427 | 431 | ||
428 | /* | 432 | /* |
429 | * We are about to start releasing blocks in the bitmap, | 433 | * We are about to add blocks to the bitmap, |
430 | * so we need undo access. | 434 | * so we need undo access. |
431 | */ | 435 | */ |
432 | /* @@@ check errors */ | ||
433 | BUFFER_TRACE(bitmap_bh, "getting undo access"); | 436 | BUFFER_TRACE(bitmap_bh, "getting undo access"); |
434 | err = ext4_journal_get_undo_access(handle, bitmap_bh); | 437 | err = ext4_journal_get_undo_access(handle, bitmap_bh); |
435 | if (err) | 438 | if (err) |
@@ -444,107 +447,55 @@ do_more: | |||
444 | err = ext4_journal_get_write_access(handle, gd_bh); | 447 | err = ext4_journal_get_write_access(handle, gd_bh); |
445 | if (err) | 448 | if (err) |
446 | goto error_return; | 449 | goto error_return; |
447 | 450 | /* | |
448 | jbd_lock_bh_state(bitmap_bh); | 451 | * make sure we don't allow a parallel init on other groups in the |
449 | 452 | * same buddy cache | |
450 | for (i = 0, group_freed = 0; i < count; i++) { | 453 | */ |
451 | /* | 454 | down_write(&grp->alloc_sem); |
452 | * An HJ special. This is expensive... | 455 | for (i = 0, blocks_freed = 0; i < count; i++) { |
453 | */ | ||
454 | #ifdef CONFIG_JBD2_DEBUG | ||
455 | jbd_unlock_bh_state(bitmap_bh); | ||
456 | { | ||
457 | struct buffer_head *debug_bh; | ||
458 | debug_bh = sb_find_get_block(sb, block + i); | ||
459 | if (debug_bh) { | ||
460 | BUFFER_TRACE(debug_bh, "Deleted!"); | ||
461 | if (!bh2jh(bitmap_bh)->b_committed_data) | ||
462 | BUFFER_TRACE(debug_bh, | ||
463 | "No commited data in bitmap"); | ||
464 | BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); | ||
465 | __brelse(debug_bh); | ||
466 | } | ||
467 | } | ||
468 | jbd_lock_bh_state(bitmap_bh); | ||
469 | #endif | ||
470 | if (need_resched()) { | ||
471 | jbd_unlock_bh_state(bitmap_bh); | ||
472 | cond_resched(); | ||
473 | jbd_lock_bh_state(bitmap_bh); | ||
474 | } | ||
475 | /* @@@ This prevents newly-allocated data from being | ||
476 | * freed and then reallocated within the same | ||
477 | * transaction. | ||
478 | * | ||
479 | * Ideally we would want to allow that to happen, but to | ||
480 | * do so requires making jbd2_journal_forget() capable of | ||
481 | * revoking the queued write of a data block, which | ||
482 | * implies blocking on the journal lock. *forget() | ||
483 | * cannot block due to truncate races. | ||
484 | * | ||
485 | * Eventually we can fix this by making jbd2_journal_forget() | ||
486 | * return a status indicating whether or not it was able | ||
487 | * to revoke the buffer. On successful revoke, it is | ||
488 | * safe not to set the allocation bit in the committed | ||
489 | * bitmap, because we know that there is no outstanding | ||
490 | * activity on the buffer any more and so it is safe to | ||
491 | * reallocate it. | ||
492 | */ | ||
493 | BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); | ||
494 | J_ASSERT_BH(bitmap_bh, | ||
495 | bh2jh(bitmap_bh)->b_committed_data != NULL); | ||
496 | ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, | ||
497 | bh2jh(bitmap_bh)->b_committed_data); | ||
498 | |||
499 | /* | ||
500 | * We clear the bit in the bitmap after setting the committed | ||
501 | * data bit, because this is the reverse order to that which | ||
502 | * the allocator uses. | ||
503 | */ | ||
504 | BUFFER_TRACE(bitmap_bh, "clear bit"); | 456 | BUFFER_TRACE(bitmap_bh, "clear bit"); |
505 | if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), | 457 | if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), |
506 | bit + i, bitmap_bh->b_data)) { | 458 | bit + i, bitmap_bh->b_data)) { |
507 | jbd_unlock_bh_state(bitmap_bh); | ||
508 | ext4_error(sb, __func__, | 459 | ext4_error(sb, __func__, |
509 | "bit already cleared for block %llu", | 460 | "bit already cleared for block %llu", |
510 | (ext4_fsblk_t)(block + i)); | 461 | (ext4_fsblk_t)(block + i)); |
511 | jbd_lock_bh_state(bitmap_bh); | ||
512 | BUFFER_TRACE(bitmap_bh, "bit already cleared"); | 462 | BUFFER_TRACE(bitmap_bh, "bit already cleared"); |
513 | } else { | 463 | } else { |
514 | group_freed++; | 464 | blocks_freed++; |
515 | } | 465 | } |
516 | } | 466 | } |
517 | jbd_unlock_bh_state(bitmap_bh); | ||
518 | |||
519 | spin_lock(sb_bgl_lock(sbi, block_group)); | 467 | spin_lock(sb_bgl_lock(sbi, block_group)); |
520 | le16_add_cpu(&desc->bg_free_blocks_count, group_freed); | 468 | blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); |
469 | ext4_free_blks_set(sb, desc, blk_free_count); | ||
521 | desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); | 470 | desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); |
522 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 471 | spin_unlock(sb_bgl_lock(sbi, block_group)); |
523 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | 472 | percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); |
524 | 473 | ||
525 | if (sbi->s_log_groups_per_flex) { | 474 | if (sbi->s_log_groups_per_flex) { |
526 | ext4_group_t flex_group = ext4_flex_group(sbi, block_group); | 475 | ext4_group_t flex_group = ext4_flex_group(sbi, block_group); |
527 | spin_lock(sb_bgl_lock(sbi, flex_group)); | 476 | spin_lock(sb_bgl_lock(sbi, flex_group)); |
528 | sbi->s_flex_groups[flex_group].free_blocks += count; | 477 | sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; |
529 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | 478 | spin_unlock(sb_bgl_lock(sbi, flex_group)); |
530 | } | 479 | } |
480 | /* | ||
481 | * request to reload the buddy with the | ||
482 | * new bitmap information | ||
483 | */ | ||
484 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); | ||
485 | ext4_mb_update_group_info(grp, blocks_freed); | ||
486 | up_write(&grp->alloc_sem); | ||
531 | 487 | ||
532 | /* We dirtied the bitmap block */ | 488 | /* We dirtied the bitmap block */ |
533 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | 489 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); |
534 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | 490 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
535 | 491 | ||
536 | /* And the group descriptor block */ | 492 | /* And the group descriptor block */ |
537 | BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); | 493 | BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); |
538 | ret = ext4_journal_dirty_metadata(handle, gd_bh); | 494 | ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); |
539 | if (!err) err = ret; | 495 | if (!err) |
540 | *pdquot_freed_blocks += group_freed; | 496 | err = ret; |
541 | |||
542 | if (overflow && !err) { | ||
543 | block += count; | ||
544 | count = overflow; | ||
545 | goto do_more; | ||
546 | } | ||
547 | sb->s_dirt = 1; | 497 | sb->s_dirt = 1; |
498 | |||
548 | error_return: | 499 | error_return: |
549 | brelse(bitmap_bh); | 500 | brelse(bitmap_bh); |
550 | ext4_std_error(sb, err); | 501 | ext4_std_error(sb, err); |
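Note that ext4_add_groupblocks() deliberately refuses ranges that cross a group boundary (the old do_more overflow loop is gone), so a resize-side caller must split its range at EXT4_BLOCKS_PER_GROUP(sb). A hedged sketch of such a caller — hand_back_new_blocks() is hypothetical; the helpers it calls are the ones in this file:

	static void hand_back_new_blocks(handle_t *handle,
					 struct super_block *sb,
					 ext4_fsblk_t start,
					 unsigned long count)
	{
		ext4_group_t group;
		ext4_grpblk_t offset;
		unsigned long chunk;

		while (count) {
			ext4_get_group_no_and_offset(sb, start, &group,
						     &offset);
			/* never let a single call span a group boundary */
			chunk = min(count, (unsigned long)
				    (EXT4_BLOCKS_PER_GROUP(sb) - offset));
			ext4_add_groupblocks(handle, sb, start, chunk);
			start += chunk;
			count -= chunk;
		}
	}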
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) | |||
614 | if (dirty_blocks < 0) { | 565 | if (dirty_blocks < 0) { |
615 | printk(KERN_CRIT "Dirty block accounting " | 566 | printk(KERN_CRIT "Dirty block accounting " |
616 | "went wrong %lld\n", | 567 | "went wrong %lld\n", |
617 | dirty_blocks); | 568 | (long long)dirty_blocks); |
618 | } | 569 | } |
619 | } | 570 | } |
620 | /* Check whether we have space after | 571 | /* Check whether we have space after |
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) | |||
666 | return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); | 617 | return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); |
667 | } | 618 | } |
668 | 619 | ||
669 | #define EXT4_META_BLOCK 0x1 | ||
670 | |||
671 | static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, | ||
672 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
673 | unsigned long *count, int *errp, int flags) | ||
674 | { | ||
675 | struct ext4_allocation_request ar; | ||
676 | ext4_fsblk_t ret; | ||
677 | |||
678 | memset(&ar, 0, sizeof(ar)); | ||
679 | /* Fill with neighbour allocated blocks */ | ||
680 | |||
681 | ar.inode = inode; | ||
682 | ar.goal = goal; | ||
683 | ar.len = *count; | ||
684 | ar.logical = iblock; | ||
685 | |||
686 | if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) | ||
687 | /* enable in-core preallocation for data block allocation */ | ||
688 | ar.flags = EXT4_MB_HINT_DATA; | ||
689 | else | ||
690 | /* disable in-core preallocation for non-regular files */ | ||
691 | ar.flags = 0; | ||
692 | |||
693 | ret = ext4_mb_new_blocks(handle, &ar, errp); | ||
694 | *count = ar.len; | ||
695 | return ret; | ||
696 | } | ||
697 | |||
698 | /* | 620 | /* |
699 | * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks | 621 | * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks |
700 | * | 622 | * |
701 | * @handle: handle to this transaction | 623 | * @handle: handle to this transaction |
702 | * @inode: file inode | 624 | * @inode: file inode |
703 | * @goal: given target block(filesystem wide) | 625 | * @goal: given target block(filesystem wide) |
704 | * @count: total number of blocks need | 626 | * @count: pointer to total number of blocks needed |
705 | * @errp: error code | 627 | * @errp: error code |
706 | * | 628 | * |
707 | * Return 1st allocated block numberon success, *count stores total account | 629 | * Return 1st allocated block number on success; *count stores the total |
708 | * error stores in errp pointer | 630 | * number of blocks allocated; errors are returned via the errp pointer |
709 | */ | 631 | */ |
710 | ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, | 632 | ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, |
711 | ext4_fsblk_t goal, unsigned long *count, int *errp) | 633 | ext4_fsblk_t goal, unsigned long *count, int *errp) |
712 | { | 634 | { |
635 | struct ext4_allocation_request ar; | ||
713 | ext4_fsblk_t ret; | 636 | ext4_fsblk_t ret; |
714 | ret = do_blk_alloc(handle, inode, 0, goal, | 637 | |
715 | count, errp, EXT4_META_BLOCK); | 638 | memset(&ar, 0, sizeof(ar)); |
639 | /* Fill with neighbour allocated blocks */ | ||
640 | ar.inode = inode; | ||
641 | ar.goal = goal; | ||
642 | ar.len = count ? *count : 1; | ||
643 | |||
644 | ret = ext4_mb_new_blocks(handle, &ar, errp); | ||
645 | if (count) | ||
646 | *count = ar.len; | ||
647 | |||
716 | /* | 648 | /* |
717 | * Account for the allocated meta blocks | 649 | * Account for the allocated meta blocks |
718 | */ | 650 | */ |
719 | if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { | 651 | if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { |
720 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 652 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); |
721 | EXT4_I(inode)->i_allocated_meta_blocks += *count; | 653 | EXT4_I(inode)->i_allocated_meta_blocks += ar.len; |
722 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 654 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
723 | } | 655 | } |
724 | return ret; | 656 | return ret; |
725 | } | 657 | } |
726 | 658 | ||
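Folding do_blk_alloc() into ext4_new_meta_blocks() also makes the count argument optional (ar.len = count ? *count : 1), which is why the single-block wrapper removed below is no longer needed. A sketch of the equivalent idiom — alloc_one_meta_block() is a hypothetical caller, not part of the patch:

	/*
	 * Hypothetical single-block idiom replacing the removed wrapper:
	 * count == NULL asks for exactly one block and skips the write-back.
	 */
	static ext4_fsblk_t alloc_one_meta_block(handle_t *handle,
						 struct inode *inode,
						 ext4_fsblk_t goal, int *errp)
	{
		return ext4_new_meta_blocks(handle, inode, goal, NULL, errp);
	}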
727 | /* | ||
728 | * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks | ||
729 | * | ||
730 | * @handle: handle to this transaction | ||
731 | * @inode: file inode | ||
732 | * @goal: given target block(filesystem wide) | ||
733 | * @errp: error code | ||
734 | * | ||
735 | * Return allocated block number on success | ||
736 | */ | ||
737 | ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, | ||
738 | ext4_fsblk_t goal, int *errp) | ||
739 | { | ||
740 | unsigned long count = 1; | ||
741 | return ext4_new_meta_blocks(handle, inode, goal, &count, errp); | ||
742 | } | ||
743 | |||
744 | /* | ||
745 | * ext4_new_blocks() -- allocate data blocks | ||
746 | * | ||
747 | * @handle: handle to this transaction | ||
748 | * @inode: file inode | ||
749 | * @goal: given target block(filesystem wide) | ||
750 | * @count: total number of blocks need | ||
751 | * @errp: error code | ||
752 | * | ||
753 | * Return 1st allocated block numberon success, *count stores total account | ||
754 | * error stores in errp pointer | ||
755 | */ | ||
756 | |||
757 | ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | ||
758 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
759 | unsigned long *count, int *errp) | ||
760 | { | ||
761 | return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); | ||
762 | } | ||
763 | |||
764 | /** | 659 | /** |
765 | * ext4_count_free_blocks() -- count filesystem free blocks | 660 | * ext4_count_free_blocks() -- count filesystem free blocks |
766 | * @sb: superblock | 661 | * @sb: superblock |
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
776 | #ifdef EXT4FS_DEBUG | 671 | #ifdef EXT4FS_DEBUG |
777 | struct ext4_super_block *es; | 672 | struct ext4_super_block *es; |
778 | ext4_fsblk_t bitmap_count; | 673 | ext4_fsblk_t bitmap_count; |
779 | unsigned long x; | 674 | unsigned int x; |
780 | struct buffer_head *bitmap_bh = NULL; | 675 | struct buffer_head *bitmap_bh = NULL; |
781 | 676 | ||
782 | es = EXT4_SB(sb)->s_es; | 677 | es = EXT4_SB(sb)->s_es; |
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
796 | continue; | 691 | continue; |
797 | 692 | ||
798 | x = ext4_count_free(bitmap_bh, sb->s_blocksize); | 693 | x = ext4_count_free(bitmap_bh, sb->s_blocksize); |
799 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", | 694 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", |
800 | i, le16_to_cpu(gdp->bg_free_blocks_count), x); | 695 | i, le16_to_cpu(gdp->bg_free_blocks_count), x); |
801 | bitmap_count += x; | 696 | bitmap_count += x; |
802 | } | 697 | } |
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
812 | gdp = ext4_get_group_desc(sb, i, NULL); | 707 | gdp = ext4_get_group_desc(sb, i, NULL); |
813 | if (!gdp) | 708 | if (!gdp) |
814 | continue; | 709 | continue; |
815 | desc_count += le16_to_cpu(gdp->bg_free_blocks_count); | 710 | desc_count += ext4_free_blks_count(sb, gdp); |
816 | } | 711 | } |
817 | 712 | ||
818 | return desc_count; | 713 | return desc_count; |
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 0a7a6663c190..fa3af81ac565 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c | |||
@@ -15,10 +15,9 @@ | |||
15 | 15 | ||
16 | static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; | 16 | static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; |
17 | 17 | ||
18 | unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) | 18 | unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) |
19 | { | 19 | { |
20 | unsigned int i; | 20 | unsigned int i, sum = 0; |
21 | unsigned long sum = 0; | ||
22 | 21 | ||
23 | if (!map) | 22 | if (!map) |
24 | return 0; | 23 | return 0; |
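The hunk only shows the declaration change, but the nibblemap already tells the story: each table entry is the number of zero bits in a 4-bit value. A self-contained sketch of the counting loop the rest of the function plausibly performs (an assumption for illustration, not the verbatim kernel body):

	static unsigned int count_free_bits(const unsigned char *data,
					    unsigned int numchars)
	{
		/* nibblemap[n] == number of zero bits in the nibble n */
		static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1,
						3, 2, 2, 1, 2, 1, 1, 0};
		unsigned int i, sum = 0;

		for (i = 0; i < numchars; i++)
			sum += nibblemap[data[i] & 0xf] +
				nibblemap[(data[i] >> 4) & 0xf];
		return sum;
	}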
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index fed5b610df5a..2df2e40b01af 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) | |||
64 | int ext4_check_dir_entry(const char *function, struct inode *dir, | 64 | int ext4_check_dir_entry(const char *function, struct inode *dir, |
65 | struct ext4_dir_entry_2 *de, | 65 | struct ext4_dir_entry_2 *de, |
66 | struct buffer_head *bh, | 66 | struct buffer_head *bh, |
67 | unsigned long offset) | 67 | unsigned int offset) |
68 | { | 68 | { |
69 | const char *error_msg = NULL; | 69 | const char *error_msg = NULL; |
70 | const int rlen = ext4_rec_len_from_disk(de->rec_len); | 70 | const int rlen = ext4_rec_len_from_disk(de->rec_len); |
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, | |||
84 | if (error_msg != NULL) | 84 | if (error_msg != NULL) |
85 | ext4_error(dir->i_sb, function, | 85 | ext4_error(dir->i_sb, function, |
86 | "bad entry in directory #%lu: %s - " | 86 | "bad entry in directory #%lu: %s - " |
87 | "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", | 87 | "offset=%u, inode=%u, rec_len=%d, name_len=%d", |
88 | dir->i_ino, error_msg, offset, | 88 | dir->i_ino, error_msg, offset, |
89 | (unsigned long) le32_to_cpu(de->inode), | 89 | le32_to_cpu(de->inode), |
90 | rlen, de->name_len); | 90 | rlen, de->name_len); |
91 | return error_msg == NULL ? 1 : 0; | 91 | return error_msg == NULL ? 1 : 0; |
92 | } | 92 | } |
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp, | |||
95 | void *dirent, filldir_t filldir) | 95 | void *dirent, filldir_t filldir) |
96 | { | 96 | { |
97 | int error = 0; | 97 | int error = 0; |
98 | unsigned long offset; | 98 | unsigned int offset; |
99 | int i, stored; | 99 | int i, stored; |
100 | struct ext4_dir_entry_2 *de; | 100 | struct ext4_dir_entry_2 *de; |
101 | struct super_block *sb; | 101 | struct super_block *sb; |
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent, | |||
405 | sb = inode->i_sb; | 405 | sb = inode->i_sb; |
406 | 406 | ||
407 | if (!fname) { | 407 | if (!fname) { |
408 | printk(KERN_ERR "ext4: call_filldir: called with " | 408 | printk(KERN_ERR "EXT4-fs: call_filldir: called with " |
409 | "null fname?!?\n"); | 409 | "null fname?!?\n"); |
410 | return 0; | 410 | return 0; |
411 | } | 411 | } |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0537c827024..c668e4377d76 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/types.h> | 19 | #include <linux/types.h> |
20 | #include <linux/blkdev.h> | 20 | #include <linux/blkdev.h> |
21 | #include <linux/magic.h> | 21 | #include <linux/magic.h> |
22 | #include <linux/jbd2.h> | ||
22 | #include "ext4_i.h" | 23 | #include "ext4_i.h" |
23 | 24 | ||
24 | /* | 25 | /* |
@@ -94,9 +95,9 @@ struct ext4_allocation_request { | |||
94 | /* phys. block for ^^^ */ | 95 | /* phys. block for ^^^ */ |
95 | ext4_fsblk_t pright; | 96 | ext4_fsblk_t pright; |
96 | /* how many blocks we want to allocate */ | 97 | /* how many blocks we want to allocate */ |
97 | unsigned long len; | 98 | unsigned int len; |
98 | /* flags. see above EXT4_MB_HINT_* */ | 99 | /* flags. see above EXT4_MB_HINT_* */ |
99 | unsigned long flags; | 100 | unsigned int flags; |
100 | }; | 101 | }; |
101 | 102 | ||
102 | /* | 103 | /* |
@@ -156,12 +157,12 @@ struct ext4_group_desc | |||
156 | __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ | 157 | __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ |
157 | __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ | 158 | __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ |
158 | __le32 bg_inode_table_lo; /* Inodes table block */ | 159 | __le32 bg_inode_table_lo; /* Inodes table block */ |
159 | __le16 bg_free_blocks_count; /* Free blocks count */ | 160 | __le16 bg_free_blocks_count_lo;/* Free blocks count */ |
160 | __le16 bg_free_inodes_count; /* Free inodes count */ | 161 | __le16 bg_free_inodes_count_lo;/* Free inodes count */ |
161 | __le16 bg_used_dirs_count; /* Directories count */ | 162 | __le16 bg_used_dirs_count_lo; /* Directories count */ |
162 | __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ | 163 | __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ |
163 | __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ | 164 | __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ |
164 | __le16 bg_itable_unused; /* Unused inodes count */ | 165 | __le16 bg_itable_unused_lo; /* Unused inodes count */ |
165 | __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ | 166 | __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ |
166 | __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ | 167 | __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ |
167 | __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ | 168 | __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ |
@@ -169,7 +170,7 @@ struct ext4_group_desc | |||
169 | __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ | 170 | __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ |
170 | __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ | 171 | __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ |
171 | __le16 bg_used_dirs_count_hi; /* Directories count MSB */ | 172 | __le16 bg_used_dirs_count_hi; /* Directories count MSB */ |
172 | __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ | 173 | __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ |
173 | __u32 bg_reserved2[3]; | 174 | __u32 bg_reserved2[3]; |
174 | }; | 175 | }; |
175 | 176 | ||
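The _lo renames are the visible half of a capacity bump: each 16-bit count gains a _hi companion (already present further down the struct), and readers must go through accessors that splice the halves together. A plausible shape for one of the accessors declared later in this patch — a sketch assuming the existing EXT4_DESC_SIZE()/EXT4_MIN_DESC_SIZE_64BIT definitions, not the patch's exact body:

	__u32 ext4_free_blks_count(struct super_block *sb,
				   struct ext4_group_desc *bg)
	{
		/* the _hi half only exists with 64-bit group descriptors */
		return le16_to_cpu(bg->bg_free_blocks_count_lo) |
			(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
			 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16
			 : 0);
	}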
@@ -328,6 +329,7 @@ struct ext4_mount_options { | |||
328 | uid_t s_resuid; | 329 | uid_t s_resuid; |
329 | gid_t s_resgid; | 330 | gid_t s_resgid; |
330 | unsigned long s_commit_interval; | 331 | unsigned long s_commit_interval; |
332 | u32 s_min_batch_time, s_max_batch_time; | ||
331 | #ifdef CONFIG_QUOTA | 333 | #ifdef CONFIG_QUOTA |
332 | int s_jquota_fmt; | 334 | int s_jquota_fmt; |
333 | char *s_qf_names[MAXQUOTAS]; | 335 | char *s_qf_names[MAXQUOTAS]; |
@@ -534,7 +536,6 @@ do { \ | |||
534 | #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ | 536 | #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ |
535 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ | 537 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ |
536 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ | 538 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ |
537 | #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ | ||
538 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ | 539 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ |
539 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | 540 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ |
540 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | 541 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ |
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | |||
726 | */ | 727 | */ |
727 | 728 | ||
728 | #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ | 729 | #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ |
729 | (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) | 730 | ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) |
730 | #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ | 731 | #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ |
731 | (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) | 732 | ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) |
732 | #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ | 733 | #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ |
733 | (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) | 734 | ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) |
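The added "!= 0" is not cosmetic: the old macros returned the raw masked __le32, so assigning the result to a narrower type could silently truncate a set bit in the upper bytes to zero. With the comparison, each macro yields a clean 0 or 1. A tiny sketch (hypothetical helper; EXT4_FEATURE_INCOMPAT_EXTENTS is an existing flag):

	static int fs_has_extents(struct super_block *sb)
	{
		/* now safe to store in any integer type: result is 0 or 1 */
		return EXT4_HAS_INCOMPAT_FEATURE(sb,
						 EXT4_FEATURE_INCOMPAT_EXTENTS);
	}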
734 | #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ | 735 | #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ |
735 | EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) | 736 | EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) |
736 | #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ | 737 | #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ |
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | |||
806 | #define EXT4_DEFM_JMODE_WBACK 0x0060 | 807 | #define EXT4_DEFM_JMODE_WBACK 0x0060 |
807 | 808 | ||
808 | /* | 809 | /* |
810 | * Default journal batch times | ||
811 | */ | ||
812 | #define EXT4_DEF_MIN_BATCH_TIME 0 | ||
813 | #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ | ||
814 | |||
815 | /* | ||
809 | * Structure of a directory entry | 816 | * Structure of a directory entry |
810 | */ | 817 | */ |
811 | #define EXT4_NAME_LEN 255 | 818 | #define EXT4_NAME_LEN 255 |
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len) | |||
891 | #define DX_HASH_LEGACY 0 | 898 | #define DX_HASH_LEGACY 0 |
892 | #define DX_HASH_HALF_MD4 1 | 899 | #define DX_HASH_HALF_MD4 1 |
893 | #define DX_HASH_TEA 2 | 900 | #define DX_HASH_TEA 2 |
901 | #define DX_HASH_LEGACY_UNSIGNED 3 | ||
902 | #define DX_HASH_HALF_MD4_UNSIGNED 4 | ||
903 | #define DX_HASH_TEA_UNSIGNED 5 | ||
894 | 904 | ||
895 | #ifdef __KERNEL__ | 905 | #ifdef __KERNEL__ |
896 | 906 | ||
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) | |||
955 | #define ERR_BAD_DX_DIR -75000 | 965 | #define ERR_BAD_DX_DIR -75000 |
956 | 966 | ||
957 | void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, | 967 | void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, |
958 | unsigned long *blockgrpp, ext4_grpblk_t *offsetp); | 968 | ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); |
959 | 969 | ||
960 | extern struct proc_dir_entry *ext4_proc_root; | 970 | extern struct proc_dir_entry *ext4_proc_root; |
961 | 971 | ||
@@ -987,6 +997,9 @@ do { \ | |||
987 | # define ATTRIB_NORET __attribute__((noreturn)) | 997 | # define ATTRIB_NORET __attribute__((noreturn)) |
988 | # define NORET_AND noreturn, | 998 | # define NORET_AND noreturn, |
989 | 999 | ||
1000 | /* bitmap.c */ | ||
1001 | extern unsigned int ext4_count_free(struct buffer_head *, unsigned); | ||
1002 | |||
990 | /* balloc.c */ | 1003 | /* balloc.c */ |
991 | extern unsigned int ext4_block_group(struct super_block *sb, | 1004 | extern unsigned int ext4_block_group(struct super_block *sb, |
992 | ext4_fsblk_t blocknr); | 1005 | ext4_fsblk_t blocknr); |
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, | |||
995 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); | 1008 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); |
996 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, | 1009 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, |
997 | ext4_group_t group); | 1010 | ext4_group_t group); |
998 | extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, | ||
999 | ext4_fsblk_t goal, int *errp); | ||
1000 | extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, | 1011 | extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, |
1001 | ext4_fsblk_t goal, unsigned long *count, int *errp); | 1012 | ext4_fsblk_t goal, unsigned long *count, int *errp); |
1002 | extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | ||
1003 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
1004 | unsigned long *count, int *errp); | ||
1005 | extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); | 1013 | extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); |
1006 | extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); | 1014 | extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); |
1007 | extern void ext4_free_blocks(handle_t *handle, struct inode *inode, | 1015 | extern void ext4_free_blocks(handle_t *handle, struct inode *inode, |
1008 | ext4_fsblk_t block, unsigned long count, int metadata); | 1016 | ext4_fsblk_t block, unsigned long count, int metadata); |
1009 | extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, | 1017 | extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, |
1010 | ext4_fsblk_t block, unsigned long count, | 1018 | ext4_fsblk_t block, unsigned long count); |
1011 | unsigned long *pdquot_freed_blocks); | ||
1012 | extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); | 1019 | extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); |
1013 | extern void ext4_check_blocks_bitmap(struct super_block *); | 1020 | extern void ext4_check_blocks_bitmap(struct super_block *); |
1014 | extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | 1021 | extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, |
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); | |||
1019 | /* dir.c */ | 1026 | /* dir.c */ |
1020 | extern int ext4_check_dir_entry(const char *, struct inode *, | 1027 | extern int ext4_check_dir_entry(const char *, struct inode *, |
1021 | struct ext4_dir_entry_2 *, | 1028 | struct ext4_dir_entry_2 *, |
1022 | struct buffer_head *, unsigned long); | 1029 | struct buffer_head *, unsigned int); |
1023 | extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, | 1030 | extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, |
1024 | __u32 minor_hash, | 1031 | __u32 minor_hash, |
1025 | struct ext4_dir_entry_2 *dirent); | 1032 | struct ext4_dir_entry_2 *dirent); |
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); | |||
1039 | extern unsigned long ext4_count_free_inodes(struct super_block *); | 1046 | extern unsigned long ext4_count_free_inodes(struct super_block *); |
1040 | extern unsigned long ext4_count_dirs(struct super_block *); | 1047 | extern unsigned long ext4_count_dirs(struct super_block *); |
1041 | extern void ext4_check_inodes_bitmap(struct super_block *); | 1048 | extern void ext4_check_inodes_bitmap(struct super_block *); |
1042 | extern unsigned long ext4_count_free(struct buffer_head *, unsigned); | ||
1043 | 1049 | ||
1044 | /* mballoc.c */ | 1050 | /* mballoc.c */ |
1045 | extern long ext4_mb_stats; | 1051 | extern long ext4_mb_stats; |
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void); | |||
1054 | extern void exit_ext4_mballoc(void); | 1060 | extern void exit_ext4_mballoc(void); |
1055 | extern void ext4_mb_free_blocks(handle_t *, struct inode *, | 1061 | extern void ext4_mb_free_blocks(handle_t *, struct inode *, |
1056 | unsigned long, unsigned long, int, unsigned long *); | 1062 | unsigned long, unsigned long, int, unsigned long *); |
1057 | extern int ext4_mb_add_more_groupinfo(struct super_block *sb, | 1063 | extern int ext4_mb_add_groupinfo(struct super_block *sb, |
1058 | ext4_group_t i, struct ext4_group_desc *desc); | 1064 | ext4_group_t i, struct ext4_group_desc *desc); |
1059 | extern void ext4_mb_update_group_info(struct ext4_group_info *grp, | 1065 | extern void ext4_mb_update_group_info(struct ext4_group_info *grp, |
1060 | ext4_grpblk_t add); | 1066 | ext4_grpblk_t add); |
1061 | 1067 | extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); | |
1062 | 1068 | extern void ext4_mb_put_buddy_cache_lock(struct super_block *, | |
1069 | ext4_group_t, int); | ||
1063 | /* inode.c */ | 1070 | /* inode.c */ |
1064 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | 1071 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, |
1065 | struct buffer_head *bh, ext4_fsblk_t blocknr); | 1072 | struct buffer_head *bh, ext4_fsblk_t blocknr); |
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *, | |||
1069 | ext4_lblk_t, int, int *); | 1076 | ext4_lblk_t, int, int *); |
1070 | int ext4_get_block(struct inode *inode, sector_t iblock, | 1077 | int ext4_get_block(struct inode *inode, sector_t iblock, |
1071 | struct buffer_head *bh_result, int create); | 1078 | struct buffer_head *bh_result, int create); |
1072 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | ||
1073 | ext4_lblk_t iblock, unsigned long maxblocks, | ||
1074 | struct buffer_head *bh_result, | ||
1075 | int create, int extend_disksize); | ||
1076 | 1079 | ||
1077 | extern struct inode *ext4_iget(struct super_block *, unsigned long); | 1080 | extern struct inode *ext4_iget(struct super_block *, unsigned long); |
1078 | extern int ext4_write_inode(struct inode *, int); | 1081 | extern int ext4_write_inode(struct inode *, int); |
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...) | |||
1123 | __attribute__ ((format (printf, 3, 4))); | 1126 | __attribute__ ((format (printf, 3, 4))); |
1124 | extern void ext4_warning(struct super_block *, const char *, const char *, ...) | 1127 | extern void ext4_warning(struct super_block *, const char *, const char *, ...) |
1125 | __attribute__ ((format (printf, 3, 4))); | 1128 | __attribute__ ((format (printf, 3, 4))); |
1129 | extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, | ||
1130 | const char *, const char *, ...) | ||
1131 | __attribute__ ((format (printf, 4, 5))); | ||
1126 | extern void ext4_update_dynamic_rev(struct super_block *sb); | 1132 | extern void ext4_update_dynamic_rev(struct super_block *sb); |
1127 | extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, | 1133 | extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, |
1128 | __u32 compat); | 1134 | __u32 compat); |
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, | |||
1136 | struct ext4_group_desc *bg); | 1142 | struct ext4_group_desc *bg); |
1137 | extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, | 1143 | extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, |
1138 | struct ext4_group_desc *bg); | 1144 | struct ext4_group_desc *bg); |
1145 | extern __u32 ext4_free_blks_count(struct super_block *sb, | ||
1146 | struct ext4_group_desc *bg); | ||
1147 | extern __u32 ext4_free_inodes_count(struct super_block *sb, | ||
1148 | struct ext4_group_desc *bg); | ||
1149 | extern __u32 ext4_used_dirs_count(struct super_block *sb, | ||
1150 | struct ext4_group_desc *bg); | ||
1151 | extern __u32 ext4_itable_unused_count(struct super_block *sb, | ||
1152 | struct ext4_group_desc *bg); | ||
1139 | extern void ext4_block_bitmap_set(struct super_block *sb, | 1153 | extern void ext4_block_bitmap_set(struct super_block *sb, |
1140 | struct ext4_group_desc *bg, ext4_fsblk_t blk); | 1154 | struct ext4_group_desc *bg, ext4_fsblk_t blk); |
1141 | extern void ext4_inode_bitmap_set(struct super_block *sb, | 1155 | extern void ext4_inode_bitmap_set(struct super_block *sb, |
1142 | struct ext4_group_desc *bg, ext4_fsblk_t blk); | 1156 | struct ext4_group_desc *bg, ext4_fsblk_t blk); |
1143 | extern void ext4_inode_table_set(struct super_block *sb, | 1157 | extern void ext4_inode_table_set(struct super_block *sb, |
1144 | struct ext4_group_desc *bg, ext4_fsblk_t blk); | 1158 | struct ext4_group_desc *bg, ext4_fsblk_t blk); |
1159 | extern void ext4_free_blks_set(struct super_block *sb, | ||
1160 | struct ext4_group_desc *bg, __u32 count); | ||
1161 | extern void ext4_free_inodes_set(struct super_block *sb, | ||
1162 | struct ext4_group_desc *bg, __u32 count); | ||
1163 | extern void ext4_used_dirs_set(struct super_block *sb, | ||
1164 | struct ext4_group_desc *bg, __u32 count); | ||
1165 | extern void ext4_itable_unused_set(struct super_block *sb, | ||
1166 | struct ext4_group_desc *bg, __u32 count); | ||
1145 | 1167 | ||
1146 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) | 1168 | static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) |
1147 | { | 1169 | { |
@@ -1225,11 +1247,11 @@ do { \ | |||
1225 | } while (0) | 1247 | } while (0) |
1226 | 1248 | ||
1227 | #ifdef CONFIG_SMP | 1249 | #ifdef CONFIG_SMP |
1228 | /* Each CPU can accumulate FBC_BATCH blocks in their local | 1250 | /* Each CPU can accumulate percpu_counter_batch blocks in their local |
1229 | * counters. So we need to make sure we have free blocks more | 1251 | * counters. So we need to make sure we have free blocks more |
1230 | * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. | 1252 | * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. |
1231 | */ | 1253 | */ |
1232 | #define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) | 1254 | #define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) |
1233 | #else | 1255 | #else |
1234 | #define EXT4_FREEBLOCKS_WATERMARK 0 | 1256 | #define EXT4_FREEBLOCKS_WATERMARK 0 |
1235 | #endif | 1257 | #endif |
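percpu_counter_read() may be stale by up to percpu_counter_batch per CPU, so the watermark above is the worst-case aggregate error with a 4x safety margin. The intended consumer pattern, as a sketch assuming the standard percpu_counter API:

	static s64 free_blocks_estimate(struct ext4_sb_info *sbi)
	{
		s64 free = percpu_counter_read(&sbi->s_freeblocks_counter);

		/*
		 * near the watermark the fast read cannot be trusted:
		 * fall back to the exact (but expensive) cross-CPU sum
		 */
		if (free < EXT4_FREEBLOCKS_WATERMARK)
			free = percpu_counter_sum(&sbi->s_freeblocks_counter);
		return free;
	}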
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) | |||
1246 | return ; | 1268 | return ; |
1247 | } | 1269 | } |
1248 | 1270 | ||
1271 | struct ext4_group_info { | ||
1272 | unsigned long bb_state; | ||
1273 | struct rb_root bb_free_root; | ||
1274 | unsigned short bb_first_free; | ||
1275 | unsigned short bb_free; | ||
1276 | unsigned short bb_fragments; | ||
1277 | struct list_head bb_prealloc_list; | ||
1278 | #ifdef DOUBLE_CHECK | ||
1279 | void *bb_bitmap; | ||
1280 | #endif | ||
1281 | struct rw_semaphore alloc_sem; | ||
1282 | unsigned short bb_counters[]; | ||
1283 | }; | ||
1284 | |||
1285 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | ||
1286 | #define EXT4_GROUP_INFO_LOCKED_BIT 1 | ||
1287 | |||
1288 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | ||
1289 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | ||
1290 | |||
1291 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | ||
1292 | { | ||
1293 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
1294 | |||
1295 | bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
1296 | } | ||
1297 | |||
1298 | static inline void ext4_unlock_group(struct super_block *sb, | ||
1299 | ext4_group_t group) | ||
1300 | { | ||
1301 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
1302 | |||
1303 | bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
1304 | } | ||
1305 | |||
1306 | static inline int ext4_is_group_locked(struct super_block *sb, | ||
1307 | ext4_group_t group) | ||
1308 | { | ||
1309 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
1310 | |||
1311 | return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, | ||
1312 | &(grinfo->bb_state)); | ||
1313 | } | ||
1314 | |||
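ext4_lock_group() is a bit spinlock on bb_state rather than a spinlock_t member, so even a filesystem with many thousands of groups pays no extra per-group storage for the lock. A usage sketch (hypothetical caller; the fields are the ones declared in ext4_group_info above):

	static void record_first_free(struct super_block *sb,
				      ext4_group_t group,
				      unsigned short first)
	{
		struct ext4_group_info *grp = ext4_get_group_info(sb, group);

		/* spins on EXT4_GROUP_INFO_LOCKED_BIT in grp->bb_state */
		ext4_lock_group(sb, group);
		grp->bb_first_free = first;
		ext4_unlock_group(sb, group);
	}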
1249 | /* | 1315 | /* |
1250 | * Inodes and files operations | 1316 | * Inodes and files operations |
1251 | */ | 1317 | */ |
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); | |||
1271 | extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, | 1337 | extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, |
1272 | int chunk); | 1338 | int chunk); |
1273 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 1339 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
1274 | ext4_lblk_t iblock, | 1340 | ext4_lblk_t iblock, unsigned int max_blocks, |
1275 | unsigned long max_blocks, struct buffer_head *bh_result, | 1341 | struct buffer_head *bh_result, |
1276 | int create, int extend_disksize); | 1342 | int create, int extend_disksize); |
1277 | extern void ext4_ext_truncate(struct inode *); | 1343 | extern void ext4_ext_truncate(struct inode *); |
1278 | extern void ext4_ext_init(struct super_block *); | 1344 | extern void ext4_ext_init(struct super_block *); |
1279 | extern void ext4_ext_release(struct super_block *); | 1345 | extern void ext4_ext_release(struct super_block *); |
1280 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | 1346 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, |
1281 | loff_t len); | 1347 | loff_t len); |
1282 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, | 1348 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, |
1283 | sector_t block, unsigned long max_blocks, | 1349 | sector_t block, unsigned int max_blocks, |
1284 | struct buffer_head *bh, int create, | 1350 | struct buffer_head *bh, int create, |
1285 | int extend_disksize, int flag); | 1351 | int extend_disksize, int flag); |
1352 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | ||
1353 | __u64 start, __u64 len); | ||
1354 | |||
1355 | /* | ||
1356 | * Add new method to test whether block and inode bitmaps are properly | ||
1357 | * initialized. With uninit_bg, reading the block from disk is not enough | ||
1358 | * to mark the bitmap uptodate. We need to also zero-out the bitmap | ||
1359 | */ | ||
1360 | #define BH_BITMAP_UPTODATE BH_JBDPrivateStart | ||
1361 | |||
1362 | static inline int bitmap_uptodate(struct buffer_head *bh) | ||
1363 | { | ||
1364 | return (buffer_uptodate(bh) && | ||
1365 | test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); | ||
1366 | } | ||
1367 | static inline void set_bitmap_uptodate(struct buffer_head *bh) | ||
1368 | { | ||
1369 | set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); | ||
1370 | } | ||
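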
1371 | |||
1286 | #endif /* __KERNEL__ */ | 1372 | #endif /* __KERNEL__ */ |
1287 | 1373 | ||
1288 | #endif /* _EXT4_H */ | 1374 | #endif /* _EXT4_H */ |
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index bec7ce59fc0d..18cb67b2cbbc 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode) | |||
194 | return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); | 194 | return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); |
195 | } | 195 | } |
196 | 196 | ||
197 | static inline void ext4_ext_tree_changed(struct inode *inode) | ||
198 | { | ||
199 | EXT4_I(inode)->i_ext_generation++; | ||
200 | } | ||
201 | |||
202 | static inline void | 197 | static inline void |
203 | ext4_ext_invalidate_cache(struct inode *inode) | 198 | ext4_ext_invalidate_cache(struct inode *inode) |
204 | { | 199 | { |
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 5c124c0ac6d3..e69acc16f5c4 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h | |||
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t; | |||
31 | typedef __u32 ext4_lblk_t; | 31 | typedef __u32 ext4_lblk_t; |
32 | 32 | ||
33 | /* data type for block group number */ | 33 | /* data type for block group number */ |
34 | typedef unsigned long ext4_group_t; | 34 | typedef unsigned int ext4_group_t; |
35 | 35 | ||
36 | #define rsv_start rsv_window._rsv_start | 36 | #define rsv_start rsv_window._rsv_start |
37 | #define rsv_end rsv_window._rsv_end | 37 | #define rsv_end rsv_window._rsv_end |
@@ -100,9 +100,6 @@ struct ext4_inode_info { | |||
100 | */ | 100 | */ |
101 | loff_t i_disksize; | 101 | loff_t i_disksize; |
102 | 102 | ||
103 | /* on-disk additional length */ | ||
104 | __u16 i_extra_isize; | ||
105 | |||
106 | /* | 103 | /* |
107 | * i_data_sem is for serialising ext4_truncate() against | 104 | * i_data_sem is for serialising ext4_truncate() against |
108 | * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's | 105 | * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's |
@@ -117,7 +114,6 @@ struct ext4_inode_info { | |||
117 | struct inode vfs_inode; | 114 | struct inode vfs_inode; |
118 | struct jbd2_inode jinode; | 115 | struct jbd2_inode jinode; |
119 | 116 | ||
120 | unsigned long i_ext_generation; | ||
121 | struct ext4_ext_cache i_cached_extent; | 117 | struct ext4_ext_cache i_cached_extent; |
122 | /* | 118 | /* |
123 | * File creation time. Its function is same as that of | 119 | * File creation time. Its function is same as that of |
@@ -130,10 +126,14 @@ struct ext4_inode_info { | |||
130 | spinlock_t i_prealloc_lock; | 126 | spinlock_t i_prealloc_lock; |
131 | 127 | ||
132 | /* allocation reservation info for delalloc */ | 128 | /* allocation reservation info for delalloc */ |
133 | unsigned long i_reserved_data_blocks; | 129 | unsigned int i_reserved_data_blocks; |
134 | unsigned long i_reserved_meta_blocks; | 130 | unsigned int i_reserved_meta_blocks; |
135 | unsigned long i_allocated_meta_blocks; | 131 | unsigned int i_allocated_meta_blocks; |
136 | unsigned short i_delalloc_reserved_flag; | 132 | unsigned short i_delalloc_reserved_flag; |
133 | |||
134 | /* on-disk additional length */ | ||
135 | __u16 i_extra_isize; | ||
136 | |||
137 | spinlock_t i_block_reservation_lock; | 137 | spinlock_t i_block_reservation_lock; |
138 | }; | 138 | }; |
139 | 139 | ||
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index c75384b34f2c..ad13a84644e1 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c | |||
@@ -7,53 +7,96 @@ | |||
7 | int __ext4_journal_get_undo_access(const char *where, handle_t *handle, | 7 | int __ext4_journal_get_undo_access(const char *where, handle_t *handle, |
8 | struct buffer_head *bh) | 8 | struct buffer_head *bh) |
9 | { | 9 | { |
10 | int err = jbd2_journal_get_undo_access(handle, bh); | 10 | int err = 0; |
11 | if (err) | 11 | |
12 | ext4_journal_abort_handle(where, __func__, bh, handle, err); | 12 | if (ext4_handle_valid(handle)) { |
13 | err = jbd2_journal_get_undo_access(handle, bh); | ||
14 | if (err) | ||
15 | ext4_journal_abort_handle(where, __func__, bh, | ||
16 | handle, err); | ||
17 | } | ||
13 | return err; | 18 | return err; |
14 | } | 19 | } |
15 | 20 | ||
16 | int __ext4_journal_get_write_access(const char *where, handle_t *handle, | 21 | int __ext4_journal_get_write_access(const char *where, handle_t *handle, |
17 | struct buffer_head *bh) | 22 | struct buffer_head *bh) |
18 | { | 23 | { |
19 | int err = jbd2_journal_get_write_access(handle, bh); | 24 | int err = 0; |
20 | if (err) | 25 | |
21 | ext4_journal_abort_handle(where, __func__, bh, handle, err); | 26 | if (ext4_handle_valid(handle)) { |
27 | err = jbd2_journal_get_write_access(handle, bh); | ||
28 | if (err) | ||
29 | ext4_journal_abort_handle(where, __func__, bh, | ||
30 | handle, err); | ||
31 | } | ||
22 | return err; | 32 | return err; |
23 | } | 33 | } |
24 | 34 | ||
25 | int __ext4_journal_forget(const char *where, handle_t *handle, | 35 | int __ext4_journal_forget(const char *where, handle_t *handle, |
26 | struct buffer_head *bh) | 36 | struct buffer_head *bh) |
27 | { | 37 | { |
28 | int err = jbd2_journal_forget(handle, bh); | 38 | int err = 0; |
29 | if (err) | 39 | |
30 | ext4_journal_abort_handle(where, __func__, bh, handle, err); | 40 | if (ext4_handle_valid(handle)) { |
41 | err = jbd2_journal_forget(handle, bh); | ||
42 | if (err) | ||
43 | ext4_journal_abort_handle(where, __func__, bh, | ||
44 | handle, err); | ||
45 | } | ||
31 | return err; | 46 | return err; |
32 | } | 47 | } |
33 | 48 | ||
34 | int __ext4_journal_revoke(const char *where, handle_t *handle, | 49 | int __ext4_journal_revoke(const char *where, handle_t *handle, |
35 | ext4_fsblk_t blocknr, struct buffer_head *bh) | 50 | ext4_fsblk_t blocknr, struct buffer_head *bh) |
36 | { | 51 | { |
37 | int err = jbd2_journal_revoke(handle, blocknr, bh); | 52 | int err = 0; |
38 | if (err) | 53 | |
39 | ext4_journal_abort_handle(where, __func__, bh, handle, err); | 54 | if (ext4_handle_valid(handle)) { |
55 | err = jbd2_journal_revoke(handle, blocknr, bh); | ||
56 | if (err) | ||
57 | ext4_journal_abort_handle(where, __func__, bh, | ||
58 | handle, err); | ||
59 | } | ||
40 | return err; | 60 | return err; |
41 | } | 61 | } |
42 | 62 | ||
43 | int __ext4_journal_get_create_access(const char *where, | 63 | int __ext4_journal_get_create_access(const char *where, |
44 | handle_t *handle, struct buffer_head *bh) | 64 | handle_t *handle, struct buffer_head *bh) |
45 | { | 65 | { |
46 | int err = jbd2_journal_get_create_access(handle, bh); | 66 | int err = 0; |
47 | if (err) | 67 | |
48 | ext4_journal_abort_handle(where, __func__, bh, handle, err); | 68 | if (ext4_handle_valid(handle)) { |
69 | err = jbd2_journal_get_create_access(handle, bh); | ||
70 | if (err) | ||
71 | ext4_journal_abort_handle(where, __func__, bh, | ||
72 | handle, err); | ||
73 | } | ||
49 | return err; | 74 | return err; |
50 | } | 75 | } |
51 | 76 | ||
52 | int __ext4_journal_dirty_metadata(const char *where, | 77 | int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, |
53 | handle_t *handle, struct buffer_head *bh) | 78 | struct inode *inode, struct buffer_head *bh) |
54 | { | 79 | { |
55 | int err = jbd2_journal_dirty_metadata(handle, bh); | 80 | int err = 0; |
56 | if (err) | 81 | |
57 | ext4_journal_abort_handle(where, __func__, bh, handle, err); | 82 | if (ext4_handle_valid(handle)) { |
83 | err = jbd2_journal_dirty_metadata(handle, bh); | ||
84 | if (err) | ||
85 | ext4_journal_abort_handle(where, __func__, bh, | ||
86 | handle, err); | ||
87 | } else { | ||
88 | mark_buffer_dirty(bh); | ||
89 | if (inode && inode_needs_sync(inode)) { | ||
90 | sync_dirty_buffer(bh); | ||
91 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | ||
92 | ext4_error(inode->i_sb, __func__, | ||
93 | "IO error syncing inode, " | ||
94 | "inode=%lu, block=%llu", | ||
95 | inode->i_ino, | ||
96 | (unsigned long long) bh->b_blocknr); | ||
97 | err = -EIO; | ||
98 | } | ||
99 | } | ||
100 | } | ||
58 | return err; | 101 | return err; |
59 | } | 102 | } |
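__ext4_handle_dirty_metadata() is the pivot of the no-journal support: with a real handle it goes through jbd2, with the sentinel it degrades to mark_buffer_dirty() plus a synchronous flush for inodes that demand it. The calling convention stays the familiar get-access/modify/dirty triple — a sketch with a hypothetical caller:

	static int set_byte_in_block(handle_t *handle, struct inode *inode,
				     struct buffer_head *bh,
				     unsigned int off, u8 val)
	{
		/* no-op under the sentinel, jbd2 write access otherwise */
		int err = ext4_journal_get_write_access(handle, bh);

		if (err)
			return err;
		bh->b_data[off] = val;
		/* journalled: file as metadata; else mark_buffer_dirty() */
		return ext4_handle_dirty_metadata(handle, inode, bh);
	}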
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b455c685a98b..be2f426f6805 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
@@ -32,8 +32,8 @@ | |||
32 | * 5 levels of tree + root which are stored in the inode. */ | 32 | * 5 levels of tree + root which are stored in the inode. */ |
33 | 33 | ||
34 | #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ | 34 | #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ |
35 | (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ | 35 | (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ |
36 | || test_opt(sb, EXTENTS) ? 27U : 8U) | 36 | ? 27U : 8U) |
37 | 37 | ||
38 | /* Extended attribute operations touch at most two data buffers, | 38 | /* Extended attribute operations touch at most two data buffers, |
39 | * two bitmap buffers, and two group summaries, in addition to the inode | 39 | * two bitmap buffers, and two group summaries, in addition to the inode |
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); | |||
122 | * been done yet. | 122 | * been done yet. |
123 | */ | 123 | */ |
124 | 124 | ||
125 | static inline void ext4_journal_release_buffer(handle_t *handle, | ||
126 | struct buffer_head *bh) | ||
127 | { | ||
128 | jbd2_journal_release_buffer(handle, bh); | ||
129 | } | ||
130 | |||
131 | void ext4_journal_abort_handle(const char *caller, const char *err_fn, | 125 | void ext4_journal_abort_handle(const char *caller, const char *err_fn, |
132 | struct buffer_head *bh, handle_t *handle, int err); | 126 | struct buffer_head *bh, handle_t *handle, int err); |
133 | 127 | ||
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, | |||
146 | int __ext4_journal_get_create_access(const char *where, | 140 | int __ext4_journal_get_create_access(const char *where, |
147 | handle_t *handle, struct buffer_head *bh); | 141 | handle_t *handle, struct buffer_head *bh); |
148 | 142 | ||
149 | int __ext4_journal_dirty_metadata(const char *where, | 143 | int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, |
150 | handle_t *handle, struct buffer_head *bh); | 144 | struct inode *inode, struct buffer_head *bh); |
151 | 145 | ||
152 | #define ext4_journal_get_undo_access(handle, bh) \ | 146 | #define ext4_journal_get_undo_access(handle, bh) \ |
153 | __ext4_journal_get_undo_access(__func__, (handle), (bh)) | 147 | __ext4_journal_get_undo_access(__func__, (handle), (bh)) |
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where, | |||
157 | __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) | 151 | __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) |
158 | #define ext4_journal_get_create_access(handle, bh) \ | 152 | #define ext4_journal_get_create_access(handle, bh) \ |
159 | __ext4_journal_get_create_access(__func__, (handle), (bh)) | 153 | __ext4_journal_get_create_access(__func__, (handle), (bh)) |
160 | #define ext4_journal_dirty_metadata(handle, bh) \ | ||
161 | __ext4_journal_dirty_metadata(__func__, (handle), (bh)) | ||
162 | #define ext4_journal_forget(handle, bh) \ | 154 | #define ext4_journal_forget(handle, bh) \ |
163 | __ext4_journal_forget(__func__, (handle), (bh)) | 155 | __ext4_journal_forget(__func__, (handle), (bh)) |
156 | #define ext4_handle_dirty_metadata(handle, inode, bh) \ | ||
157 | __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) | ||
164 | 158 | ||
165 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); | 159 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); |
166 | int __ext4_journal_stop(const char *where, handle_t *handle); | 160 | int __ext4_journal_stop(const char *where, handle_t *handle); |
167 | 161 | ||
162 | #define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) | ||
163 | |||
164 | static inline int ext4_handle_valid(handle_t *handle) | ||
165 | { | ||
166 | if (handle == EXT4_NOJOURNAL_HANDLE) | ||
167 | return 0; | ||
168 | return 1; | ||
169 | } | ||
170 | |||
171 | static inline void ext4_handle_sync(handle_t *handle) | ||
172 | { | ||
173 | if (ext4_handle_valid(handle)) | ||
174 | handle->h_sync = 1; | ||
175 | } | ||
176 | |||
177 | static inline void ext4_handle_release_buffer(handle_t *handle, | ||
178 | struct buffer_head *bh) | ||
179 | { | ||
180 | if (ext4_handle_valid(handle)) | ||
181 | jbd2_journal_release_buffer(handle, bh); | ||
182 | } | ||
183 | |||
184 | static inline int ext4_handle_is_aborted(handle_t *handle) | ||
185 | { | ||
186 | if (ext4_handle_valid(handle)) | ||
187 | return is_handle_aborted(handle); | ||
188 | return 0; | ||
189 | } | ||
190 | |||
191 | static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) | ||
192 | { | ||
193 | if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) | ||
194 | return 0; | ||
195 | return 1; | ||
196 | } | ||
197 | |||
198 | static inline void ext4_journal_release_buffer(handle_t *handle, | ||
199 | struct buffer_head *bh) | ||
200 | { | ||
201 | if (ext4_handle_valid(handle)) | ||
202 | jbd2_journal_release_buffer(handle, bh); | ||
203 | } | ||
204 | |||
168 | static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) | 205 | static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) |
169 | { | 206 | { |
170 | return ext4_journal_start_sb(inode->i_sb, nblocks); | 207 | return ext4_journal_start_sb(inode->i_sb, nblocks); |
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void) | |||
180 | 217 | ||
181 | static inline int ext4_journal_extend(handle_t *handle, int nblocks) | 218 | static inline int ext4_journal_extend(handle_t *handle, int nblocks) |
182 | { | 219 | { |
183 | return jbd2_journal_extend(handle, nblocks); | 220 | if (ext4_handle_valid(handle)) |
221 | return jbd2_journal_extend(handle, nblocks); | ||
222 | return 0; | ||
184 | } | 223 | } |
185 | 224 | ||
186 | static inline int ext4_journal_restart(handle_t *handle, int nblocks) | 225 | static inline int ext4_journal_restart(handle_t *handle, int nblocks) |
187 | { | 226 | { |
188 | return jbd2_journal_restart(handle, nblocks); | 227 | if (ext4_handle_valid(handle)) |
228 | return jbd2_journal_restart(handle, nblocks); | ||
229 | return 0; | ||
189 | } | 230 | } |
190 | 231 | ||
191 | static inline int ext4_journal_blocks_per_page(struct inode *inode) | 232 | static inline int ext4_journal_blocks_per_page(struct inode *inode) |
192 | { | 233 | { |
193 | return jbd2_journal_blocks_per_page(inode); | 234 | if (EXT4_JOURNAL(inode) != NULL) |
235 | return jbd2_journal_blocks_per_page(inode); | ||
236 | return 0; | ||
194 | } | 237 | } |
195 | 238 | ||
196 | static inline int ext4_journal_force_commit(journal_t *journal) | 239 | static inline int ext4_journal_force_commit(journal_t *journal) |
197 | { | 240 | { |
198 | return jbd2_journal_force_commit(journal); | 241 | if (journal) |
242 | return jbd2_journal_force_commit(journal); | ||
243 | return 0; | ||
199 | } | 244 | } |
200 | 245 | ||
201 | static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) | 246 | static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) |
202 | { | 247 | { |
203 | return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); | 248 | if (ext4_handle_valid(handle)) |
249 | return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); | ||
250 | return 0; | ||
204 | } | 251 | } |
205 | 252 | ||
206 | /* super.c */ | 253 | /* super.c */ |
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb); | |||
208 | 255 | ||
209 | static inline int ext4_should_journal_data(struct inode *inode) | 256 | static inline int ext4_should_journal_data(struct inode *inode) |
210 | { | 257 | { |
258 | if (EXT4_JOURNAL(inode) == NULL) | ||
259 | return 0; | ||
211 | if (!S_ISREG(inode->i_mode)) | 260 | if (!S_ISREG(inode->i_mode)) |
212 | return 1; | 261 | return 1; |
213 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) | 262 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) |
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode) | |||
219 | 268 | ||
220 | static inline int ext4_should_order_data(struct inode *inode) | 269 | static inline int ext4_should_order_data(struct inode *inode) |
221 | { | 270 | { |
271 | if (EXT4_JOURNAL(inode) == NULL) | ||
272 | return 0; | ||
222 | if (!S_ISREG(inode->i_mode)) | 273 | if (!S_ISREG(inode->i_mode)) |
223 | return 0; | 274 | return 0; |
224 | if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) | 275 | if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) |
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode) | |||
230 | 281 | ||
231 | static inline int ext4_should_writeback_data(struct inode *inode) | 282 | static inline int ext4_should_writeback_data(struct inode *inode) |
232 | { | 283 | { |
284 | if (EXT4_JOURNAL(inode) == NULL) | ||
285 | return 0; | ||
233 | if (!S_ISREG(inode->i_mode)) | 286 | if (!S_ISREG(inode->i_mode)) |
234 | return 0; | 287 | return 0; |
235 | if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) | 288 | if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) |
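The helpers added to ext4_jbd2.h above are the heart of this series' no-journal mode: EXT4_NOJOURNAL_HANDLE is a sentinel pointer that stands in for a real jbd2 handle_t, and every wrapper asks ext4_handle_valid() before calling into jbd2, so journaling operations become no-ops on an unjournaled filesystem. A minimal user-space sketch of the same sentinel-handle pattern (the type and function names here are illustrative, not kernel API):

    #include <stdio.h>

    typedef struct handle { int h_sync; int h_buffer_credits; } handle_t;

    /* Sentinel pointer meaning "no journal"; never dereferenced. */
    #define NOJOURNAL_HANDLE ((handle_t *) 0x1)

    static int handle_valid(handle_t *handle)
    {
            return handle != NOJOURNAL_HANDLE;
    }

    static void handle_sync(handle_t *handle)
    {
            if (handle_valid(handle))
                    handle->h_sync = 1;     /* only a real handle is touched */
    }

    int main(void)
    {
            handle_t real = { 0, 25 };

            handle_sync(&real);             /* marks the transaction synchronous */
            handle_sync(NOJOURNAL_HANDLE);  /* quietly does nothing */
            printf("real.h_sync = %d\n", real.h_sync);
            return 0;
    }

With no journal there is nothing to extend or restart, which is why the wrappers above return 0 (success) for the sentinel: every caller can proceed unchanged.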
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index b21f16713db0..039b6ea1a042 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h | |||
@@ -57,6 +57,7 @@ struct ext4_sb_info { | |||
57 | u32 s_next_generation; | 57 | u32 s_next_generation; |
58 | u32 s_hash_seed[4]; | 58 | u32 s_hash_seed[4]; |
59 | int s_def_hash_version; | 59 | int s_def_hash_version; |
60 | int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ | ||
60 | struct percpu_counter s_freeblocks_counter; | 61 | struct percpu_counter s_freeblocks_counter; |
61 | struct percpu_counter s_freeinodes_counter; | 62 | struct percpu_counter s_freeinodes_counter; |
62 | struct percpu_counter s_dirs_counter; | 63 | struct percpu_counter s_dirs_counter; |
@@ -73,6 +74,8 @@ struct ext4_sb_info { | |||
73 | struct journal_s *s_journal; | 74 | struct journal_s *s_journal; |
74 | struct list_head s_orphan; | 75 | struct list_head s_orphan; |
75 | unsigned long s_commit_interval; | 76 | unsigned long s_commit_interval; |
77 | u32 s_max_batch_time; | ||
78 | u32 s_min_batch_time; | ||
76 | struct block_device *journal_bdev; | 79 | struct block_device *journal_bdev; |
77 | #ifdef CONFIG_JBD2_DEBUG | 80 | #ifdef CONFIG_JBD2_DEBUG |
78 | struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ | 81 | struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ |
@@ -101,7 +104,8 @@ struct ext4_sb_info { | |||
101 | spinlock_t s_reserve_lock; | 104 | spinlock_t s_reserve_lock; |
102 | spinlock_t s_md_lock; | 105 | spinlock_t s_md_lock; |
103 | tid_t s_last_transaction; | 106 | tid_t s_last_transaction; |
104 | unsigned short *s_mb_offsets, *s_mb_maxs; | 107 | unsigned short *s_mb_offsets; |
108 | unsigned int *s_mb_maxs; | ||
105 | 109 | ||
106 | /* tunables */ | 110 | /* tunables */ |
107 | unsigned long s_stripe; | 111 | unsigned long s_stripe; |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea2ce3c0ae66..54bf0623a9ae 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) | |||
97 | { | 97 | { |
98 | int err; | 98 | int err; |
99 | 99 | ||
100 | if (!ext4_handle_valid(handle)) | ||
101 | return 0; | ||
100 | if (handle->h_buffer_credits > needed) | 102 | if (handle->h_buffer_credits > needed) |
101 | return 0; | 103 | return 0; |
102 | err = ext4_journal_extend(handle, needed); | 104 | err = ext4_journal_extend(handle, needed); |
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode, | |||
134 | int err; | 136 | int err; |
135 | if (path->p_bh) { | 137 | if (path->p_bh) { |
136 | /* path points to block */ | 138 | /* path points to block */ |
137 | err = ext4_journal_dirty_metadata(handle, path->p_bh); | 139 | err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); |
138 | } else { | 140 | } else { |
139 | /* path points to leaf/index in inode body */ | 141 | /* path points to leaf/index in inode body */ |
140 | err = ext4_mark_inode_dirty(handle, inode); | 142 | err = ext4_mark_inode_dirty(handle, inode); |
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, | |||
191 | ext4_fsblk_t goal, newblock; | 193 | ext4_fsblk_t goal, newblock; |
192 | 194 | ||
193 | goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); | 195 | goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); |
194 | newblock = ext4_new_meta_block(handle, inode, goal, err); | 196 | newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); |
195 | return newblock; | 197 | return newblock; |
196 | } | 198 | } |
197 | 199 | ||
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
780 | set_buffer_uptodate(bh); | 782 | set_buffer_uptodate(bh); |
781 | unlock_buffer(bh); | 783 | unlock_buffer(bh); |
782 | 784 | ||
783 | err = ext4_journal_dirty_metadata(handle, bh); | 785 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
784 | if (err) | 786 | if (err) |
785 | goto cleanup; | 787 | goto cleanup; |
786 | brelse(bh); | 788 | brelse(bh); |
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
859 | set_buffer_uptodate(bh); | 861 | set_buffer_uptodate(bh); |
860 | unlock_buffer(bh); | 862 | unlock_buffer(bh); |
861 | 863 | ||
862 | err = ext4_journal_dirty_metadata(handle, bh); | 864 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
863 | if (err) | 865 | if (err) |
864 | goto cleanup; | 866 | goto cleanup; |
865 | brelse(bh); | 867 | brelse(bh); |
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |||
955 | set_buffer_uptodate(bh); | 957 | set_buffer_uptodate(bh); |
956 | unlock_buffer(bh); | 958 | unlock_buffer(bh); |
957 | 959 | ||
958 | err = ext4_journal_dirty_metadata(handle, bh); | 960 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
959 | if (err) | 961 | if (err) |
960 | goto out; | 962 | goto out; |
961 | 963 | ||
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, | |||
1160 | while (--depth >= 0) { | 1162 | while (--depth >= 0) { |
1161 | ix = path[depth].p_idx; | 1163 | ix = path[depth].p_idx; |
1162 | if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) | 1164 | if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) |
1163 | break; | 1165 | goto got_index; |
1164 | } | 1166 | } |
1165 | 1167 | ||
1166 | if (depth < 0) { | 1168 | /* we've gone up to the root and found no index to the right */ |
1167 | /* we've gone up to the root and | 1169 | return 0; |
1168 | * found no index to the right */ | ||
1169 | return 0; | ||
1170 | } | ||
1171 | 1170 | ||
1171 | got_index: | ||
1172 | /* we've found index to the right, let's | 1172 | /* we've found index to the right, let's |
1173 | * follow it and find the closest allocated | 1173 | * follow it and find the closest allocated |
1174 | * block to the right */ | 1174 | * block to the right */ |
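The hunk above replaces a post-loop "did we fall off the end?" test (depth < 0) with a direct goto got_index taken as soon as a right-hand index is found, dropping a nesting level and an extra comparison. A small stand-alone illustration of the same restructuring, with has_right_index() as a made-up stand-in for the header check:

    #include <stdio.h>

    #define DEPTHS 4

    /* Hypothetical stand-in for "this level has an index to the right". */
    static int has_right_index(const int *levels, int depth)
    {
            return levels[depth] != 0;
    }

    static int search_right(const int *levels)
    {
            int depth;

            for (depth = DEPTHS - 1; depth >= 0; depth--)
                    if (has_right_index(levels, depth))
                            goto got_index;
            /* we've gone up to the root and found no index to the right */
            return -1;

    got_index:
            /* follow the index found at 'depth' */
            return depth;
    }

    int main(void)
    {
            int none[DEPTHS] = { 0, 0, 0, 0 };
            int some[DEPTHS] = { 0, 0, 1, 0 };

            printf("%d %d\n", search_right(none), search_right(some));
            return 0;
    }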
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, | |||
1201 | *phys = ext_pblock(ex); | 1201 | *phys = ext_pblock(ex); |
1202 | put_bh(bh); | 1202 | put_bh(bh); |
1203 | return 0; | 1203 | return 0; |
1204 | |||
1205 | } | 1204 | } |
1206 | 1205 | ||
1207 | /* | 1206 | /* |
@@ -1622,7 +1621,6 @@ cleanup: | |||
1622 | ext4_ext_drop_refs(npath); | 1621 | ext4_ext_drop_refs(npath); |
1623 | kfree(npath); | 1622 | kfree(npath); |
1624 | } | 1623 | } |
1625 | ext4_ext_tree_changed(inode); | ||
1626 | ext4_ext_invalidate_cache(inode); | 1624 | ext4_ext_invalidate_cache(inode); |
1627 | return err; | 1625 | return err; |
1628 | } | 1626 | } |
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) | |||
2233 | } | 2231 | } |
2234 | } | 2232 | } |
2235 | out: | 2233 | out: |
2236 | ext4_ext_tree_changed(inode); | ||
2237 | ext4_ext_drop_refs(path); | 2234 | ext4_ext_drop_refs(path); |
2238 | kfree(path); | 2235 | kfree(path); |
2239 | ext4_journal_stop(handle); | 2236 | ext4_journal_stop(handle); |
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb) | |||
2250 | * possible initialization would be here | 2247 | * possible initialization would be here |
2251 | */ | 2248 | */ |
2252 | 2249 | ||
2253 | if (test_opt(sb, EXTENTS)) { | 2250 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { |
2254 | printk(KERN_INFO "EXT4-fs: file extents enabled"); | 2251 | printk(KERN_INFO "EXT4-fs: file extents enabled"); |
2255 | #ifdef AGGRESSIVE_TEST | 2252 | #ifdef AGGRESSIVE_TEST |
2256 | printk(", aggressive tests"); | 2253 | printk(", aggressive tests"); |
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb) | |||
2275 | */ | 2272 | */ |
2276 | void ext4_ext_release(struct super_block *sb) | 2273 | void ext4_ext_release(struct super_block *sb) |
2277 | { | 2274 | { |
2278 | if (!test_opt(sb, EXTENTS)) | 2275 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) |
2279 | return; | 2276 | return; |
2280 | 2277 | ||
2281 | #ifdef EXTENTS_STATS | 2278 | #ifdef EXTENTS_STATS |
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
2380 | struct inode *inode, | 2377 | struct inode *inode, |
2381 | struct ext4_ext_path *path, | 2378 | struct ext4_ext_path *path, |
2382 | ext4_lblk_t iblock, | 2379 | ext4_lblk_t iblock, |
2383 | unsigned long max_blocks) | 2380 | unsigned int max_blocks) |
2384 | { | 2381 | { |
2385 | struct ext4_extent *ex, newex, orig_ex; | 2382 | struct ext4_extent *ex, newex, orig_ex; |
2386 | struct ext4_extent *ex1 = NULL; | 2383 | struct ext4_extent *ex1 = NULL; |
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
2536 | */ | 2533 | */ |
2537 | newdepth = ext_depth(inode); | 2534 | newdepth = ext_depth(inode); |
2538 | /* | 2535 | /* |
2539 | * update the extent length after successfull insert of the | 2536 | * update the extent length after successful insert of the |
2540 | * split extent | 2537 | * split extent |
2541 | */ | 2538 | */ |
2542 | orig_ex.ee_len = cpu_to_le16(ee_len - | 2539 | orig_ex.ee_len = cpu_to_le16(ee_len - |
@@ -2678,26 +2675,26 @@ fix_extent_len: | |||
2678 | */ | 2675 | */ |
2679 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 2676 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
2680 | ext4_lblk_t iblock, | 2677 | ext4_lblk_t iblock, |
2681 | unsigned long max_blocks, struct buffer_head *bh_result, | 2678 | unsigned int max_blocks, struct buffer_head *bh_result, |
2682 | int create, int extend_disksize) | 2679 | int create, int extend_disksize) |
2683 | { | 2680 | { |
2684 | struct ext4_ext_path *path = NULL; | 2681 | struct ext4_ext_path *path = NULL; |
2685 | struct ext4_extent_header *eh; | 2682 | struct ext4_extent_header *eh; |
2686 | struct ext4_extent newex, *ex; | 2683 | struct ext4_extent newex, *ex; |
2687 | ext4_fsblk_t goal, newblock; | 2684 | ext4_fsblk_t newblock; |
2688 | int err = 0, depth, ret; | 2685 | int err = 0, depth, ret, cache_type; |
2689 | unsigned long allocated = 0; | 2686 | unsigned int allocated = 0; |
2690 | struct ext4_allocation_request ar; | 2687 | struct ext4_allocation_request ar; |
2691 | loff_t disksize; | 2688 | loff_t disksize; |
2692 | 2689 | ||
2693 | __clear_bit(BH_New, &bh_result->b_state); | 2690 | __clear_bit(BH_New, &bh_result->b_state); |
2694 | ext_debug("blocks %u/%lu requested for inode %u\n", | 2691 | ext_debug("blocks %u/%u requested for inode %u\n", |
2695 | iblock, max_blocks, inode->i_ino); | 2692 | iblock, max_blocks, inode->i_ino); |
2696 | 2693 | ||
2697 | /* check in cache */ | 2694 | /* check in cache */ |
2698 | goal = ext4_ext_in_cache(inode, iblock, &newex); | 2695 | cache_type = ext4_ext_in_cache(inode, iblock, &newex); |
2699 | if (goal) { | 2696 | if (cache_type) { |
2700 | if (goal == EXT4_EXT_CACHE_GAP) { | 2697 | if (cache_type == EXT4_EXT_CACHE_GAP) { |
2701 | if (!create) { | 2698 | if (!create) { |
2702 | /* | 2699 | /* |
2703 | * block isn't allocated yet and | 2700 | * block isn't allocated yet and |
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2706 | goto out2; | 2703 | goto out2; |
2707 | } | 2704 | } |
2708 | /* we should allocate requested block */ | 2705 | /* we should allocate requested block */ |
2709 | } else if (goal == EXT4_EXT_CACHE_EXTENT) { | 2706 | } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { |
2710 | /* block is already allocated */ | 2707 | /* block is already allocated */ |
2711 | newblock = iblock | 2708 | newblock = iblock |
2712 | - le32_to_cpu(newex.ee_block) | 2709 | - le32_to_cpu(newex.ee_block) |
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2854 | if (!newblock) | 2851 | if (!newblock) |
2855 | goto out2; | 2852 | goto out2; |
2856 | ext_debug("allocate new block: goal %llu, found %llu/%lu\n", | 2853 | ext_debug("allocate new block: goal %llu, found %llu/%lu\n", |
2857 | goal, newblock, allocated); | 2854 | ar.goal, newblock, allocated); |
2858 | 2855 | ||
2859 | /* try to insert new extent into found leaf and return */ | 2856 | /* try to insert new extent into found leaf and return */ |
2860 | ext4_ext_store_pblock(&newex, newblock); | 2857 | ext4_ext_store_pblock(&newex, newblock); |
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode) | |||
2950 | * transaction synchronous. | 2947 | * transaction synchronous. |
2951 | */ | 2948 | */ |
2952 | if (IS_SYNC(inode)) | 2949 | if (IS_SYNC(inode)) |
2953 | handle->h_sync = 1; | 2950 | ext4_handle_sync(handle); |
2954 | 2951 | ||
2955 | out_stop: | 2952 | out_stop: |
2956 | up_write(&EXT4_I(inode)->i_data_sem); | 2953 | up_write(&EXT4_I(inode)->i_data_sem); |
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) | |||
3004 | handle_t *handle; | 3001 | handle_t *handle; |
3005 | ext4_lblk_t block; | 3002 | ext4_lblk_t block; |
3006 | loff_t new_size; | 3003 | loff_t new_size; |
3007 | unsigned long max_blocks; | 3004 | unsigned int max_blocks; |
3008 | int ret = 0; | 3005 | int ret = 0; |
3009 | int ret2 = 0; | 3006 | int ret2 = 0; |
3010 | int retries = 0; | 3007 | int retries = 0; |
@@ -3083,7 +3080,7 @@ retry: | |||
3083 | /* | 3080 | /* |
3084 | * Callback function called for each extent to gather FIEMAP information. | 3081 | * Callback function called for each extent to gather FIEMAP information. |
3085 | */ | 3082 | */ |
3086 | int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, | 3083 | static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, |
3087 | struct ext4_ext_cache *newex, struct ext4_extent *ex, | 3084 | struct ext4_ext_cache *newex, struct ext4_extent *ex, |
3088 | void *data) | 3085 | void *data) |
3089 | { | 3086 | { |
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, | |||
3152 | /* fiemap flags we can handle specified here */ | 3149 | /* fiemap flags we can handle specified here */ |
3153 | #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) | 3150 | #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) |
3154 | 3151 | ||
3155 | int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) | 3152 | static int ext4_xattr_fiemap(struct inode *inode, |
3153 | struct fiemap_extent_info *fieinfo) | ||
3156 | { | 3154 | { |
3157 | __u64 physical = 0; | 3155 | __u64 physical = 0; |
3158 | __u64 length; | 3156 | __u64 length; |
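Earlier in this file, the ext4_ext_get_blocks() hunk renames goal to cache_type, matching what ext4_ext_in_cache() actually returns: a classification of the cached range (a GAP that is a known hole, or an EXTENT that is already mapped), not an allocation goal. A simplified sketch of that dispatch, with invented types in place of the kernel's extent structs:

    #include <stdio.h>

    enum cache_type { CACHE_NO = 0, CACHE_GAP, CACHE_EXTENT };

    struct cached_extent {
            unsigned int  lblk;   /* first logical block */
            unsigned int  len;    /* number of blocks */
            unsigned long pblk;   /* physical start, valid for CACHE_EXTENT */
    };

    /* Returns how the cached region classifies 'iblock'. */
    static enum cache_type in_cache(const struct cached_extent *ce,
                                    unsigned int iblock, int is_hole)
    {
            if (iblock < ce->lblk || iblock >= ce->lblk + ce->len)
                    return CACHE_NO;
            return is_hole ? CACHE_GAP : CACHE_EXTENT;
    }

    int main(void)
    {
            struct cached_extent ce = { .lblk = 100, .len = 8, .pblk = 5000 };

            switch (in_cache(&ce, 103, 0)) {
            case CACHE_EXTENT:
                    /* already mapped: physical = pblk + offset inside extent */
                    printf("mapped at %lu\n", ce.pblk + (103 - ce.lblk));
                    break;
            case CACHE_GAP:
                    printf("known hole; allocate only if create\n");
                    break;
            default:
                    printf("cache miss; walk the extent tree\n");
            }
            return 0;
    }

When create is clear, a GAP can be answered immediately as unmapped; only a cache miss forces a walk of the on-disk extent tree.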
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6bd11fba71f7..f731cb545a03 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
140 | return 0; | 140 | return 0; |
141 | } | 141 | } |
142 | 142 | ||
143 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | ||
144 | __u64 start, __u64 len); | ||
145 | |||
146 | const struct file_operations ext4_file_operations = { | 143 | const struct file_operations ext4_file_operations = { |
147 | .llseek = generic_file_llseek, | 144 | .llseek = generic_file_llseek, |
148 | .read = do_sync_read, | 145 | .read = do_sync_read, |
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 556ca8eba3db..ac8f168c8ab4 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c | |||
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) | |||
35 | 35 | ||
36 | 36 | ||
37 | /* The old legacy hash */ | 37 | /* The old legacy hash */ |
38 | static __u32 dx_hack_hash(const char *name, int len) | 38 | static __u32 dx_hack_hash_unsigned(const char *name, int len) |
39 | { | 39 | { |
40 | __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; | 40 | __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; |
41 | const unsigned char *ucp = (const unsigned char *) name; | ||
42 | |||
43 | while (len--) { | ||
44 | hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); | ||
45 | |||
46 | if (hash & 0x80000000) | ||
47 | hash -= 0x7fffffff; | ||
48 | hash1 = hash0; | ||
49 | hash0 = hash; | ||
50 | } | ||
51 | return hash0 << 1; | ||
52 | } | ||
53 | |||
54 | static __u32 dx_hack_hash_signed(const char *name, int len) | ||
55 | { | ||
56 | __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; | ||
57 | const signed char *scp = (const signed char *) name; | ||
58 | |||
41 | while (len--) { | 59 | while (len--) { |
42 | __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); | 60 | hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); |
43 | 61 | ||
44 | if (hash & 0x80000000) hash -= 0x7fffffff; | 62 | if (hash & 0x80000000) |
63 | hash -= 0x7fffffff; | ||
45 | hash1 = hash0; | 64 | hash1 = hash0; |
46 | hash0 = hash; | 65 | hash0 = hash; |
47 | } | 66 | } |
48 | return (hash0 << 1); | 67 | return hash0 << 1; |
68 | } | ||
69 | |||
70 | static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) | ||
71 | { | ||
72 | __u32 pad, val; | ||
73 | int i; | ||
74 | const signed char *scp = (const signed char *) msg; | ||
75 | |||
76 | pad = (__u32)len | ((__u32)len << 8); | ||
77 | pad |= pad << 16; | ||
78 | |||
79 | val = pad; | ||
80 | if (len > num*4) | ||
81 | len = num * 4; | ||
82 | for (i = 0; i < len; i++) { | ||
83 | if ((i % 4) == 0) | ||
84 | val = pad; | ||
85 | val = ((int) scp[i]) + (val << 8); | ||
86 | if ((i % 4) == 3) { | ||
87 | *buf++ = val; | ||
88 | val = pad; | ||
89 | num--; | ||
90 | } | ||
91 | } | ||
92 | if (--num >= 0) | ||
93 | *buf++ = val; | ||
94 | while (--num >= 0) | ||
95 | *buf++ = pad; | ||
49 | } | 96 | } |
50 | 97 | ||
51 | static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) | 98 | static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) |
52 | { | 99 | { |
53 | __u32 pad, val; | 100 | __u32 pad, val; |
54 | int i; | 101 | int i; |
102 | const unsigned char *ucp = (const unsigned char *) msg; | ||
55 | 103 | ||
56 | pad = (__u32)len | ((__u32)len << 8); | 104 | pad = (__u32)len | ((__u32)len << 8); |
57 | pad |= pad << 16; | 105 | pad |= pad << 16; |
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) | |||
62 | for (i = 0; i < len; i++) { | 110 | for (i = 0; i < len; i++) { |
63 | if ((i % 4) == 0) | 111 | if ((i % 4) == 0) |
64 | val = pad; | 112 | val = pad; |
65 | val = msg[i] + (val << 8); | 113 | val = ((int) ucp[i]) + (val << 8); |
66 | if ((i % 4) == 3) { | 114 | if ((i % 4) == 3) { |
67 | *buf++ = val; | 115 | *buf++ = val; |
68 | val = pad; | 116 | val = pad; |
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | |||
95 | const char *p; | 143 | const char *p; |
96 | int i; | 144 | int i; |
97 | __u32 in[8], buf[4]; | 145 | __u32 in[8], buf[4]; |
146 | void (*str2hashbuf)(const char *, int, __u32 *, int) = | ||
147 | str2hashbuf_signed; | ||
98 | 148 | ||
99 | /* Initialize the default seed for the hash checksum functions */ | 149 | /* Initialize the default seed for the hash checksum functions */ |
100 | buf[0] = 0x67452301; | 150 | buf[0] = 0x67452301; |
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | |||
113 | } | 163 | } |
114 | 164 | ||
115 | switch (hinfo->hash_version) { | 165 | switch (hinfo->hash_version) { |
166 | case DX_HASH_LEGACY_UNSIGNED: | ||
167 | hash = dx_hack_hash_unsigned(name, len); | ||
168 | break; | ||
116 | case DX_HASH_LEGACY: | 169 | case DX_HASH_LEGACY: |
117 | hash = dx_hack_hash(name, len); | 170 | hash = dx_hack_hash_signed(name, len); |
118 | break; | 171 | break; |
172 | case DX_HASH_HALF_MD4_UNSIGNED: | ||
173 | str2hashbuf = str2hashbuf_unsigned; | ||
119 | case DX_HASH_HALF_MD4: | 174 | case DX_HASH_HALF_MD4: |
120 | p = name; | 175 | p = name; |
121 | while (len > 0) { | 176 | while (len > 0) { |
122 | str2hashbuf(p, len, in, 8); | 177 | (*str2hashbuf)(p, len, in, 8); |
123 | half_md4_transform(buf, in); | 178 | half_md4_transform(buf, in); |
124 | len -= 32; | 179 | len -= 32; |
125 | p += 32; | 180 | p += 32; |
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) | |||
127 | minor_hash = buf[2]; | 182 | minor_hash = buf[2]; |
128 | hash = buf[1]; | 183 | hash = buf[1]; |
129 | break; | 184 | break; |
185 | case DX_HASH_TEA_UNSIGNED: | ||
186 | str2hashbuf = str2hashbuf_unsigned; | ||
130 | case DX_HASH_TEA: | 187 | case DX_HASH_TEA: |
131 | p = name; | 188 | p = name; |
132 | while (len > 0) { | 189 | while (len > 0) { |
133 | str2hashbuf(p, len, in, 4); | 190 | (*str2hashbuf)(p, len, in, 4); |
134 | TEA_transform(buf, in); | 191 | TEA_transform(buf, in); |
135 | len -= 16; | 192 | len -= 16; |
136 | p += 16; | 193 | p += 16; |
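The split into _signed and _unsigned variants exists because the old code fed plain char into arithmetic: on x86 char is signed, on ARM and PowerPC it is unsigned, so any filename byte >= 0x80 sign-extended differently and produced a different hash, breaking htree directories moved between architectures. The new s_hash_unsigned field records which flavor a filesystem uses, and the *_UNSIGNED switch cases above deliberately fall through after swapping in str2hashbuf_unsigned. The two legacy-hash variants can be lifted out verbatim and compared in user space:

    #include <stdio.h>

    typedef unsigned int __u32;

    static __u32 dx_hack_hash_unsigned(const char *name, int len)
    {
            __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
            const unsigned char *ucp = (const unsigned char *) name;

            while (len--) {
                    hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
                    if (hash & 0x80000000)
                            hash -= 0x7fffffff;
                    hash1 = hash0;
                    hash0 = hash;
            }
            return hash0 << 1;
    }

    static __u32 dx_hack_hash_signed(const char *name, int len)
    {
            __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
            const signed char *scp = (const signed char *) name;

            while (len--) {
                    hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
                    if (hash & 0x80000000)
                            hash -= 0x7fffffff;
                    hash1 = hash0;
                    hash0 = hash;
            }
            return hash0 << 1;
    }

    int main(void)
    {
            const char name[] = "caf\xe9";   /* 0xe9 has the high bit set */

            printf("ascii:    %08x %08x\n",
                   dx_hack_hash_signed("abc", 3),
                   dx_hack_hash_unsigned("abc", 3));
            printf("high bit: %08x %08x\n",
                   dx_hack_hash_signed(name, 4),
                   dx_hack_hash_unsigned(name, 4));
            return 0;
    }

The ASCII name hashes identically in both variants; the name containing a high byte does not, which is exactly the cross-architecture divergence the patch guards against.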
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6e6052879aa2..4fb86a0061d0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
74 | /* If checksum is bad mark all blocks and inodes in use to prevent | 74 | /* If checksum is bad mark all blocks and inodes in use to prevent |
75 | * allocation, essentially implementing a per-group read-only flag. */ | 75 | * allocation, essentially implementing a per-group read-only flag. */ |
76 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { | 76 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { |
77 | ext4_error(sb, __func__, "Checksum bad for group %lu\n", | 77 | ext4_error(sb, __func__, "Checksum bad for group %u", |
78 | block_group); | 78 | block_group); |
79 | gdp->bg_free_blocks_count = 0; | 79 | ext4_free_blks_set(sb, gdp, 0); |
80 | gdp->bg_free_inodes_count = 0; | 80 | ext4_free_inodes_set(sb, gdp, 0); |
81 | gdp->bg_itable_unused = 0; | 81 | ext4_itable_unused_set(sb, gdp, 0); |
82 | memset(bh->b_data, 0xff, sb->s_blocksize); | 82 | memset(bh->b_data, 0xff, sb->s_blocksize); |
83 | return 0; | 83 | return 0; |
84 | } | 84 | } |
85 | 85 | ||
86 | memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); | 86 | memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); |
87 | mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), | 87 | mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, |
88 | bh->b_data); | 88 | bh->b_data); |
89 | 89 | ||
90 | return EXT4_INODES_PER_GROUP(sb); | 90 | return EXT4_INODES_PER_GROUP(sb); |
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
111 | if (unlikely(!bh)) { | 111 | if (unlikely(!bh)) { |
112 | ext4_error(sb, __func__, | 112 | ext4_error(sb, __func__, |
113 | "Cannot read inode bitmap - " | 113 | "Cannot read inode bitmap - " |
114 | "block_group = %lu, inode_bitmap = %llu", | 114 | "block_group = %u, inode_bitmap = %llu", |
115 | block_group, bitmap_blk); | 115 | block_group, bitmap_blk); |
116 | return NULL; | 116 | return NULL; |
117 | } | 117 | } |
118 | if (buffer_uptodate(bh) && | 118 | if (bitmap_uptodate(bh)) |
119 | !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) | ||
120 | return bh; | 119 | return bh; |
121 | 120 | ||
122 | lock_buffer(bh); | 121 | lock_buffer(bh); |
122 | if (bitmap_uptodate(bh)) { | ||
123 | unlock_buffer(bh); | ||
124 | return bh; | ||
125 | } | ||
123 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 126 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); |
124 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | 127 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { |
125 | ext4_init_inode_bitmap(sb, bh, block_group, desc); | 128 | ext4_init_inode_bitmap(sb, bh, block_group, desc); |
129 | set_bitmap_uptodate(bh); | ||
126 | set_buffer_uptodate(bh); | 130 | set_buffer_uptodate(bh); |
127 | unlock_buffer(bh); | ||
128 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 131 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); |
132 | unlock_buffer(bh); | ||
129 | return bh; | 133 | return bh; |
130 | } | 134 | } |
131 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | 135 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); |
136 | if (buffer_uptodate(bh)) { | ||
137 | /* | ||
138 | * if not uninit if bh is uptodate, | ||
139 | * bitmap is also uptodate | ||
140 | */ | ||
141 | set_bitmap_uptodate(bh); | ||
142 | unlock_buffer(bh); | ||
143 | return bh; | ||
144 | } | ||
145 | /* | ||
146 | * submit the buffer_head for read. We can | ||
147 | * safely mark the bitmap as uptodate now. | ||
148 | * We do it here so the bitmap uptodate bit | ||
149 | * get set with buffer lock held. | ||
150 | */ | ||
151 | set_bitmap_uptodate(bh); | ||
132 | if (bh_submit_read(bh) < 0) { | 152 | if (bh_submit_read(bh) < 0) { |
133 | put_bh(bh); | 153 | put_bh(bh); |
134 | ext4_error(sb, __func__, | 154 | ext4_error(sb, __func__, |
135 | "Cannot read inode bitmap - " | 155 | "Cannot read inode bitmap - " |
136 | "block_group = %lu, inode_bitmap = %llu", | 156 | "block_group = %u, inode_bitmap = %llu", |
137 | block_group, bitmap_blk); | 157 | block_group, bitmap_blk); |
138 | return NULL; | 158 | return NULL; |
139 | } | 159 | } |
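The rewritten ext4_read_inode_bitmap() above is a double-checked pattern: test bitmap_uptodate() locklessly, take the buffer lock, re-test, and only then either initialize an uninit group's bitmap or submit the read, setting the uptodate bit while the lock is still held so a concurrent reader can never see a half-filled bitmap flagged as valid. A user-space sketch of the shape of that logic (a pthread mutex standing in for the buffer lock; a real lockless fast path also needs memory-ordering care that is elided here):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    struct bitmap_buf {
            pthread_mutex_t lock;      /* stands in for lock_buffer() */
            int uptodate;              /* stands in for bitmap_uptodate() */
            unsigned char data[64];
    };

    /* Assumed stand-in for reading the bitmap block from disk. */
    static void read_from_disk(struct bitmap_buf *b)
    {
            memset(b->data, 0, sizeof(b->data));
    }

    static struct bitmap_buf *get_bitmap(struct bitmap_buf *b)
    {
            if (b->uptodate)                /* fast path: no lock taken */
                    return b;

            pthread_mutex_lock(&b->lock);
            if (b->uptodate) {              /* recheck under the lock */
                    pthread_mutex_unlock(&b->lock);
                    return b;
            }
            read_from_disk(b);
            b->uptodate = 1;                /* set while still holding the lock */
            pthread_mutex_unlock(&b->lock);
            return b;
    }

    int main(void)
    {
            struct bitmap_buf b = { PTHREAD_MUTEX_INITIALIZER, 0, { 0 } };

            get_bitmap(&b);                 /* slow path: reads and flags */
            get_bitmap(&b);                 /* fast path: already uptodate */
            printf("uptodate = %d\n", b.uptodate);
            return 0;
    }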
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
168 | struct ext4_group_desc *gdp; | 188 | struct ext4_group_desc *gdp; |
169 | struct ext4_super_block *es; | 189 | struct ext4_super_block *es; |
170 | struct ext4_sb_info *sbi; | 190 | struct ext4_sb_info *sbi; |
171 | int fatal = 0, err; | 191 | int fatal = 0, err, count; |
172 | ext4_group_t flex_group; | 192 | ext4_group_t flex_group; |
173 | 193 | ||
174 | if (atomic_read(&inode->i_count) > 1) { | 194 | if (atomic_read(&inode->i_count) > 1) { |
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
190 | 210 | ||
191 | ino = inode->i_ino; | 211 | ino = inode->i_ino; |
192 | ext4_debug("freeing inode %lu\n", ino); | 212 | ext4_debug("freeing inode %lu\n", ino); |
213 | trace_mark(ext4_free_inode, | ||
214 | "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu", | ||
215 | sb->s_id, inode->i_ino, inode->i_mode, | ||
216 | (unsigned long) inode->i_uid, (unsigned long) inode->i_gid, | ||
217 | (unsigned long long) inode->i_blocks); | ||
193 | 218 | ||
194 | /* | 219 | /* |
195 | * Note: we must free any quota before locking the superblock, | 220 | * Note: we must free any quota before locking the superblock, |
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
236 | 261 | ||
237 | if (gdp) { | 262 | if (gdp) { |
238 | spin_lock(sb_bgl_lock(sbi, block_group)); | 263 | spin_lock(sb_bgl_lock(sbi, block_group)); |
239 | le16_add_cpu(&gdp->bg_free_inodes_count, 1); | 264 | count = ext4_free_inodes_count(sb, gdp) + 1; |
240 | if (is_directory) | 265 | ext4_free_inodes_set(sb, gdp, count); |
241 | le16_add_cpu(&gdp->bg_used_dirs_count, -1); | 266 | if (is_directory) { |
267 | count = ext4_used_dirs_count(sb, gdp) - 1; | ||
268 | ext4_used_dirs_set(sb, gdp, count); | ||
269 | } | ||
242 | gdp->bg_checksum = ext4_group_desc_csum(sbi, | 270 | gdp->bg_checksum = ext4_group_desc_csum(sbi, |
243 | block_group, gdp); | 271 | block_group, gdp); |
244 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 272 | spin_unlock(sb_bgl_lock(sbi, block_group)); |
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) | |||
253 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | 281 | spin_unlock(sb_bgl_lock(sbi, flex_group)); |
254 | } | 282 | } |
255 | } | 283 | } |
256 | BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); | 284 | BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); |
257 | err = ext4_journal_dirty_metadata(handle, bh2); | 285 | err = ext4_handle_dirty_metadata(handle, NULL, bh2); |
258 | if (!fatal) fatal = err; | 286 | if (!fatal) fatal = err; |
259 | } | 287 | } |
260 | BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); | 288 | BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); |
261 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | 289 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
262 | if (!fatal) | 290 | if (!fatal) |
263 | fatal = err; | 291 | fatal = err; |
264 | sb->s_dirt = 1; | 292 | sb->s_dirt = 1; |
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, | |||
291 | 319 | ||
292 | for (group = 0; group < ngroups; group++) { | 320 | for (group = 0; group < ngroups; group++) { |
293 | desc = ext4_get_group_desc(sb, group, NULL); | 321 | desc = ext4_get_group_desc(sb, group, NULL); |
294 | if (!desc || !desc->bg_free_inodes_count) | 322 | if (!desc || !ext4_free_inodes_count(sb, desc)) |
295 | continue; | 323 | continue; |
296 | if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) | 324 | if (ext4_free_inodes_count(sb, desc) < avefreei) |
297 | continue; | 325 | continue; |
298 | if (!best_desc || | 326 | if (!best_desc || |
299 | (le16_to_cpu(desc->bg_free_blocks_count) > | 327 | (ext4_free_blks_count(sb, desc) > |
300 | le16_to_cpu(best_desc->bg_free_blocks_count))) { | 328 | ext4_free_blks_count(sb, best_desc))) { |
301 | *best_group = group; | 329 | *best_group = group; |
302 | best_desc = desc; | 330 | best_desc = desc; |
303 | ret = 0; | 331 | ret = 0; |
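Throughout this file, le16_to_cpu(desc->bg_free_inodes_count) reads give way to ext4_free_inodes_count()/..._set() accessors. The reason is 64-bit group descriptors: with a large descriptor size these counts are split across a low and a high 16-bit field, and only an accessor can stitch them back together. A hedged sketch of the lo/hi pattern (host-endian here for brevity; the on-disk kernel fields are little-endian):

    #include <stdint.h>
    #include <stdio.h>

    struct group_desc {
            uint16_t free_inodes_lo;
            uint16_t free_inodes_hi;   /* only meaningful for big descriptors */
    };

    static uint32_t free_inodes_count(const struct group_desc *gd, int big_desc)
    {
            uint32_t count = gd->free_inodes_lo;

            if (big_desc)
                    count |= (uint32_t) gd->free_inodes_hi << 16;
            return count;
    }

    static void free_inodes_set(struct group_desc *gd, int big_desc,
                                uint32_t count)
    {
            gd->free_inodes_lo = (uint16_t) count;
            if (big_desc)
                    gd->free_inodes_hi = (uint16_t) (count >> 16);
    }

    int main(void)
    {
            struct group_desc gd;

            free_inodes_set(&gd, 1, 70000);
            printf("%u\n", free_inodes_count(&gd, 1));   /* prints 70000 */
            return 0;
    }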
@@ -369,7 +397,7 @@ found_flexbg: | |||
369 | for (i = best_flex * flex_size; i < ngroups && | 397 | for (i = best_flex * flex_size; i < ngroups && |
370 | i < (best_flex + 1) * flex_size; i++) { | 398 | i < (best_flex + 1) * flex_size; i++) { |
371 | desc = ext4_get_group_desc(sb, i, &bh); | 399 | desc = ext4_get_group_desc(sb, i, &bh); |
372 | if (le16_to_cpu(desc->bg_free_inodes_count)) { | 400 | if (ext4_free_inodes_count(sb, desc)) { |
373 | *best_group = i; | 401 | *best_group = i; |
374 | goto out; | 402 | goto out; |
375 | } | 403 | } |
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
443 | for (i = 0; i < ngroups; i++) { | 471 | for (i = 0; i < ngroups; i++) { |
444 | grp = (parent_group + i) % ngroups; | 472 | grp = (parent_group + i) % ngroups; |
445 | desc = ext4_get_group_desc(sb, grp, NULL); | 473 | desc = ext4_get_group_desc(sb, grp, NULL); |
446 | if (!desc || !desc->bg_free_inodes_count) | 474 | if (!desc || !ext4_free_inodes_count(sb, desc)) |
447 | continue; | 475 | continue; |
448 | if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) | 476 | if (ext4_used_dirs_count(sb, desc) >= best_ndir) |
449 | continue; | 477 | continue; |
450 | if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) | 478 | if (ext4_free_inodes_count(sb, desc) < avefreei) |
451 | continue; | 479 | continue; |
452 | if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) | 480 | if (ext4_free_blks_count(sb, desc) < avefreeb) |
453 | continue; | 481 | continue; |
454 | *group = grp; | 482 | *group = grp; |
455 | ret = 0; | 483 | ret = 0; |
456 | best_ndir = le16_to_cpu(desc->bg_used_dirs_count); | 484 | best_ndir = ext4_used_dirs_count(sb, desc); |
457 | } | 485 | } |
458 | if (ret == 0) | 486 | if (ret == 0) |
459 | return ret; | 487 | return ret; |
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
479 | for (i = 0; i < ngroups; i++) { | 507 | for (i = 0; i < ngroups; i++) { |
480 | *group = (parent_group + i) % ngroups; | 508 | *group = (parent_group + i) % ngroups; |
481 | desc = ext4_get_group_desc(sb, *group, NULL); | 509 | desc = ext4_get_group_desc(sb, *group, NULL); |
482 | if (!desc || !desc->bg_free_inodes_count) | 510 | if (!desc || !ext4_free_inodes_count(sb, desc)) |
483 | continue; | 511 | continue; |
484 | if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) | 512 | if (ext4_used_dirs_count(sb, desc) >= max_dirs) |
485 | continue; | 513 | continue; |
486 | if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) | 514 | if (ext4_free_inodes_count(sb, desc) < min_inodes) |
487 | continue; | 515 | continue; |
488 | if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) | 516 | if (ext4_free_blks_count(sb, desc) < min_blocks) |
489 | continue; | 517 | continue; |
490 | return 0; | 518 | return 0; |
491 | } | 519 | } |
@@ -494,8 +522,8 @@ fallback: | |||
494 | for (i = 0; i < ngroups; i++) { | 522 | for (i = 0; i < ngroups; i++) { |
495 | *group = (parent_group + i) % ngroups; | 523 | *group = (parent_group + i) % ngroups; |
496 | desc = ext4_get_group_desc(sb, *group, NULL); | 524 | desc = ext4_get_group_desc(sb, *group, NULL); |
497 | if (desc && desc->bg_free_inodes_count && | 525 | if (desc && ext4_free_inodes_count(sb, desc) && |
498 | le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) | 526 | ext4_free_inodes_count(sb, desc) >= avefreei) |
499 | return 0; | 527 | return 0; |
500 | } | 528 | } |
501 | 529 | ||
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
524 | */ | 552 | */ |
525 | *group = parent_group; | 553 | *group = parent_group; |
526 | desc = ext4_get_group_desc(sb, *group, NULL); | 554 | desc = ext4_get_group_desc(sb, *group, NULL); |
527 | if (desc && le16_to_cpu(desc->bg_free_inodes_count) && | 555 | if (desc && ext4_free_inodes_count(sb, desc) && |
528 | le16_to_cpu(desc->bg_free_blocks_count)) | 556 | ext4_free_blks_count(sb, desc)) |
529 | return 0; | 557 | return 0; |
530 | 558 | ||
531 | /* | 559 | /* |
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
548 | if (*group >= ngroups) | 576 | if (*group >= ngroups) |
549 | *group -= ngroups; | 577 | *group -= ngroups; |
550 | desc = ext4_get_group_desc(sb, *group, NULL); | 578 | desc = ext4_get_group_desc(sb, *group, NULL); |
551 | if (desc && le16_to_cpu(desc->bg_free_inodes_count) && | 579 | if (desc && ext4_free_inodes_count(sb, desc) && |
552 | le16_to_cpu(desc->bg_free_blocks_count)) | 580 | ext4_free_blks_count(sb, desc)) |
553 | return 0; | 581 | return 0; |
554 | } | 582 | } |
555 | 583 | ||
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
562 | if (++*group >= ngroups) | 590 | if (++*group >= ngroups) |
563 | *group = 0; | 591 | *group = 0; |
564 | desc = ext4_get_group_desc(sb, *group, NULL); | 592 | desc = ext4_get_group_desc(sb, *group, NULL); |
565 | if (desc && le16_to_cpu(desc->bg_free_inodes_count)) | 593 | if (desc && ext4_free_inodes_count(sb, desc)) |
566 | return 0; | 594 | return 0; |
567 | } | 595 | } |
568 | 596 | ||
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
570 | } | 598 | } |
571 | 599 | ||
572 | /* | 600 | /* |
601 | * Claim the inode from the inode bitmap. If the group | ||
602 | * is uninit we need to take the group's sb_bgl_lock | ||
603 | * and clear the uninit flag. The inode bitmap update | ||
604 | * and group desc uninit flag clear should be done | ||
605 | * while holding sb_bgl_lock so that ext4_read_inode_bitmap | ||
606 | * doesn't race with ext4_claim_inode. | ||
607 | */ | ||
608 | static int ext4_claim_inode(struct super_block *sb, | ||
609 | struct buffer_head *inode_bitmap_bh, | ||
610 | unsigned long ino, ext4_group_t group, int mode) | ||
611 | { | ||
612 | int free = 0, retval = 0, count; | ||
613 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
614 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); | ||
615 | |||
616 | spin_lock(sb_bgl_lock(sbi, group)); | ||
617 | if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { | ||
618 | /* not a free inode */ | ||
619 | retval = 1; | ||
620 | goto err_ret; | ||
621 | } | ||
622 | ino++; | ||
623 | if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | ||
624 | ino > EXT4_INODES_PER_GROUP(sb)) { | ||
625 | spin_unlock(sb_bgl_lock(sbi, group)); | ||
626 | ext4_error(sb, __func__, | ||
627 | "reserved inode or inode > inodes count - " | ||
628 | "block_group = %u, inode=%lu", group, | ||
629 | ino + group * EXT4_INODES_PER_GROUP(sb)); | ||
630 | return 1; | ||
631 | } | ||
632 | /* If we didn't allocate from within the initialized part of the inode | ||
633 | * table then we need to initialize up to this inode. */ | ||
634 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
635 | |||
636 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
637 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
638 | /* When marking the block group with | ||
639 | * ~EXT4_BG_INODE_UNINIT we don't want to depend | ||
640 | * on the value of bg_itable_unused even though | ||
641 | * mke2fs could have initialized the same for us. | ||
642 | * Instead we calculate the value below. | ||
643 | */ | ||
644 | |||
645 | free = 0; | ||
646 | } else { | ||
647 | free = EXT4_INODES_PER_GROUP(sb) - | ||
648 | ext4_itable_unused_count(sb, gdp); | ||
649 | } | ||
650 | |||
651 | /* | ||
652 | * Check the relative inode number against the last used | ||
653 | * relative inode number in this group. If it is greater | ||
654 | * we need to update the bg_itable_unused count. | ||
655 | * | ||
656 | */ | ||
657 | if (ino > free) | ||
658 | ext4_itable_unused_set(sb, gdp, | ||
659 | (EXT4_INODES_PER_GROUP(sb) - ino)); | ||
660 | } | ||
661 | count = ext4_free_inodes_count(sb, gdp) - 1; | ||
662 | ext4_free_inodes_set(sb, gdp, count); | ||
663 | if (S_ISDIR(mode)) { | ||
664 | count = ext4_used_dirs_count(sb, gdp) + 1; | ||
665 | ext4_used_dirs_set(sb, gdp, count); | ||
666 | } | ||
667 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
668 | err_ret: | ||
669 | spin_unlock(sb_bgl_lock(sbi, group)); | ||
670 | return retval; | ||
671 | } | ||
672 | |||
673 | /* | ||
573 | * There are two policies for allocating an inode. If the new inode is | 674 | * There are two policies for allocating an inode. If the new inode is |
574 | * a directory, then a forward search is made for a block group with both | 675 | * a directory, then a forward search is made for a block group with both |
575 | * free space and a low directory-to-inode ratio; if that fails, then of | 676 | * free space and a low directory-to-inode ratio; if that fails, then of |
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
582 | struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | 683 | struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) |
583 | { | 684 | { |
584 | struct super_block *sb; | 685 | struct super_block *sb; |
585 | struct buffer_head *bitmap_bh = NULL; | 686 | struct buffer_head *inode_bitmap_bh = NULL; |
586 | struct buffer_head *bh2; | 687 | struct buffer_head *group_desc_bh; |
587 | ext4_group_t group = 0; | 688 | ext4_group_t group = 0; |
588 | unsigned long ino = 0; | 689 | unsigned long ino = 0; |
589 | struct inode *inode; | 690 | struct inode *inode; |
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
602 | return ERR_PTR(-EPERM); | 703 | return ERR_PTR(-EPERM); |
603 | 704 | ||
604 | sb = dir->i_sb; | 705 | sb = dir->i_sb; |
706 | trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, | ||
707 | dir->i_ino, mode); | ||
605 | inode = new_inode(sb); | 708 | inode = new_inode(sb); |
606 | if (!inode) | 709 | if (!inode) |
607 | return ERR_PTR(-ENOMEM); | 710 | return ERR_PTR(-ENOMEM); |
@@ -631,40 +734,52 @@ got_group: | |||
631 | for (i = 0; i < sbi->s_groups_count; i++) { | 734 | for (i = 0; i < sbi->s_groups_count; i++) { |
632 | err = -EIO; | 735 | err = -EIO; |
633 | 736 | ||
634 | gdp = ext4_get_group_desc(sb, group, &bh2); | 737 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); |
635 | if (!gdp) | 738 | if (!gdp) |
636 | goto fail; | 739 | goto fail; |
637 | 740 | ||
638 | brelse(bitmap_bh); | 741 | brelse(inode_bitmap_bh); |
639 | bitmap_bh = ext4_read_inode_bitmap(sb, group); | 742 | inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); |
640 | if (!bitmap_bh) | 743 | if (!inode_bitmap_bh) |
641 | goto fail; | 744 | goto fail; |
642 | 745 | ||
643 | ino = 0; | 746 | ino = 0; |
644 | 747 | ||
645 | repeat_in_this_group: | 748 | repeat_in_this_group: |
646 | ino = ext4_find_next_zero_bit((unsigned long *) | 749 | ino = ext4_find_next_zero_bit((unsigned long *) |
647 | bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); | 750 | inode_bitmap_bh->b_data, |
751 | EXT4_INODES_PER_GROUP(sb), ino); | ||
752 | |||
648 | if (ino < EXT4_INODES_PER_GROUP(sb)) { | 753 | if (ino < EXT4_INODES_PER_GROUP(sb)) { |
649 | 754 | ||
650 | BUFFER_TRACE(bitmap_bh, "get_write_access"); | 755 | BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); |
651 | err = ext4_journal_get_write_access(handle, bitmap_bh); | 756 | err = ext4_journal_get_write_access(handle, |
757 | inode_bitmap_bh); | ||
652 | if (err) | 758 | if (err) |
653 | goto fail; | 759 | goto fail; |
654 | 760 | ||
655 | if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), | 761 | BUFFER_TRACE(group_desc_bh, "get_write_access"); |
656 | ino, bitmap_bh->b_data)) { | 762 | err = ext4_journal_get_write_access(handle, |
763 | group_desc_bh); | ||
764 | if (err) | ||
765 | goto fail; | ||
766 | if (!ext4_claim_inode(sb, inode_bitmap_bh, | ||
767 | ino, group, mode)) { | ||
657 | /* we won it */ | 768 | /* we won it */ |
658 | BUFFER_TRACE(bitmap_bh, | 769 | BUFFER_TRACE(inode_bitmap_bh, |
659 | "call ext4_journal_dirty_metadata"); | 770 | "call ext4_handle_dirty_metadata"); |
660 | err = ext4_journal_dirty_metadata(handle, | 771 | err = ext4_handle_dirty_metadata(handle, |
661 | bitmap_bh); | 772 | inode, |
773 | inode_bitmap_bh); | ||
662 | if (err) | 774 | if (err) |
663 | goto fail; | 775 | goto fail; |
776 | /* zero bit is inode number 1 */ | ||
777 | ino++; | ||
664 | goto got; | 778 | goto got; |
665 | } | 779 | } |
666 | /* we lost it */ | 780 | /* we lost it */ |
667 | jbd2_journal_release_buffer(handle, bitmap_bh); | 781 | ext4_handle_release_buffer(handle, inode_bitmap_bh); |
782 | ext4_handle_release_buffer(handle, group_desc_bh); | ||
668 | 783 | ||
669 | if (++ino < EXT4_INODES_PER_GROUP(sb)) | 784 | if (++ino < EXT4_INODES_PER_GROUP(sb)) |
670 | goto repeat_in_this_group; | 785 | goto repeat_in_this_group; |
@@ -684,30 +799,16 @@ repeat_in_this_group: | |||
684 | goto out; | 799 | goto out; |
685 | 800 | ||
686 | got: | 801 | got: |
687 | ino++; | ||
688 | if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || | ||
689 | ino > EXT4_INODES_PER_GROUP(sb)) { | ||
690 | ext4_error(sb, __func__, | ||
691 | "reserved inode or inode > inodes count - " | ||
692 | "block_group = %lu, inode=%lu", group, | ||
693 | ino + group * EXT4_INODES_PER_GROUP(sb)); | ||
694 | err = -EIO; | ||
695 | goto fail; | ||
696 | } | ||
697 | |||
698 | BUFFER_TRACE(bh2, "get_write_access"); | ||
699 | err = ext4_journal_get_write_access(handle, bh2); | ||
700 | if (err) goto fail; | ||
701 | |||
702 | /* We may have to initialize the block bitmap if it isn't already */ | 802 | /* We may have to initialize the block bitmap if it isn't already */ |
703 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && | 803 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && |
704 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 804 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
705 | struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); | 805 | struct buffer_head *block_bitmap_bh; |
706 | 806 | ||
707 | BUFFER_TRACE(block_bh, "get block bitmap access"); | 807 | block_bitmap_bh = ext4_read_block_bitmap(sb, group); |
708 | err = ext4_journal_get_write_access(handle, block_bh); | 808 | BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); |
809 | err = ext4_journal_get_write_access(handle, block_bitmap_bh); | ||
709 | if (err) { | 810 | if (err) { |
710 | brelse(block_bh); | 811 | brelse(block_bitmap_bh); |
711 | goto fail; | 812 | goto fail; |
712 | } | 813 | } |
713 | 814 | ||
@@ -715,9 +816,9 @@ got: | |||
715 | spin_lock(sb_bgl_lock(sbi, group)); | 816 | spin_lock(sb_bgl_lock(sbi, group)); |
716 | /* recheck and clear flag under lock if we still need to */ | 817 | /* recheck and clear flag under lock if we still need to */ |
717 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 818 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
718 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | ||
719 | free = ext4_free_blocks_after_init(sb, group, gdp); | 819 | free = ext4_free_blocks_after_init(sb, group, gdp); |
720 | gdp->bg_free_blocks_count = cpu_to_le16(free); | 820 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
821 | ext4_free_blks_set(sb, gdp, free); | ||
721 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, | 822 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, |
722 | gdp); | 823 | gdp); |
723 | } | 824 | } |
@@ -725,55 +826,19 @@ got: | |||
725 | 826 | ||
726 | /* Don't need to dirty bitmap block if we didn't change it */ | 827 | /* Don't need to dirty bitmap block if we didn't change it */ |
727 | if (free) { | 828 | if (free) { |
728 | BUFFER_TRACE(block_bh, "dirty block bitmap"); | 829 | BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); |
729 | err = ext4_journal_dirty_metadata(handle, block_bh); | 830 | err = ext4_handle_dirty_metadata(handle, |
831 | NULL, block_bitmap_bh); | ||
730 | } | 832 | } |
731 | 833 | ||
732 | brelse(block_bh); | 834 | brelse(block_bitmap_bh); |
733 | if (err) | 835 | if (err) |
734 | goto fail; | 836 | goto fail; |
735 | } | 837 | } |
736 | 838 | BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); | |
737 | spin_lock(sb_bgl_lock(sbi, group)); | 839 | err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); |
738 | /* If we didn't allocate from within the initialized part of the inode | 840 | if (err) |
739 | * table then we need to initialize up to this inode. */ | 841 | goto fail; |
740 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { | ||
741 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
742 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); | ||
743 | |||
744 | /* When marking the block group with | ||
745 | * ~EXT4_BG_INODE_UNINIT we don't want to depend | ||
746 | * on the value of bg_itable_unused even though | ||
747 | * mke2fs could have initialized the same for us. | ||
748 | * Instead we calculated the value below | ||
749 | */ | ||
750 | |||
751 | free = 0; | ||
752 | } else { | ||
753 | free = EXT4_INODES_PER_GROUP(sb) - | ||
754 | le16_to_cpu(gdp->bg_itable_unused); | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Check the relative inode number against the last used | ||
759 | * relative inode number in this group. if it is greater | ||
760 | * we need to update the bg_itable_unused count | ||
761 | * | ||
762 | */ | ||
763 | if (ino > free) | ||
764 | gdp->bg_itable_unused = | ||
765 | cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); | ||
766 | } | ||
767 | |||
768 | le16_add_cpu(&gdp->bg_free_inodes_count, -1); | ||
769 | if (S_ISDIR(mode)) { | ||
770 | le16_add_cpu(&gdp->bg_used_dirs_count, 1); | ||
771 | } | ||
772 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); | ||
773 | spin_unlock(sb_bgl_lock(sbi, group)); | ||
774 | BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); | ||
775 | err = ext4_journal_dirty_metadata(handle, bh2); | ||
776 | if (err) goto fail; | ||
777 | 842 | ||
778 | percpu_counter_dec(&sbi->s_freeinodes_counter); | 843 | percpu_counter_dec(&sbi->s_freeinodes_counter); |
779 | if (S_ISDIR(mode)) | 844 | if (S_ISDIR(mode)) |
@@ -825,7 +890,7 @@ got: | |||
825 | 890 | ||
826 | ext4_set_inode_flags(inode); | 891 | ext4_set_inode_flags(inode); |
827 | if (IS_DIRSYNC(inode)) | 892 | if (IS_DIRSYNC(inode)) |
828 | handle->h_sync = 1; | 893 | ext4_handle_sync(handle); |
829 | if (insert_inode_locked(inode) < 0) { | 894 | if (insert_inode_locked(inode) < 0) { |
830 | err = -EINVAL; | 895 | err = -EINVAL; |
831 | goto fail_drop; | 896 | goto fail_drop; |
@@ -852,7 +917,7 @@ got: | |||
852 | if (err) | 917 | if (err) |
853 | goto fail_free_drop; | 918 | goto fail_free_drop; |
854 | 919 | ||
855 | if (test_opt(sb, EXTENTS)) { | 920 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { |
856 | /* set extent flag only for directory, file and normal symlink*/ | 921 | /* set extent flag only for directory, file and normal symlink*/ |
857 | if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { | 922 | if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { |
858 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; | 923 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; |
@@ -867,6 +932,8 @@ got: | |||
867 | } | 932 | } |
868 | 933 | ||
869 | ext4_debug("allocating inode %lu\n", inode->i_ino); | 934 | ext4_debug("allocating inode %lu\n", inode->i_ino); |
935 | trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", | ||
936 | sb->s_id, inode->i_ino, dir->i_ino, mode); | ||
870 | goto really_out; | 937 | goto really_out; |
871 | fail: | 938 | fail: |
872 | ext4_std_error(sb, err); | 939 | ext4_std_error(sb, err); |
@@ -874,7 +941,7 @@ out: | |||
874 | iput(inode); | 941 | iput(inode); |
875 | ret = ERR_PTR(err); | 942 | ret = ERR_PTR(err); |
876 | really_out: | 943 | really_out: |
877 | brelse(bitmap_bh); | 944 | brelse(inode_bitmap_bh); |
878 | return ret; | 945 | return ret; |
879 | 946 | ||
880 | fail_free_drop: | 947 | fail_free_drop: |
@@ -886,7 +953,7 @@ fail_drop: | |||
886 | inode->i_nlink = 0; | 953 | inode->i_nlink = 0; |
887 | unlock_new_inode(inode); | 954 | unlock_new_inode(inode); |
888 | iput(inode); | 955 | iput(inode); |
889 | brelse(bitmap_bh); | 956 | brelse(inode_bitmap_bh); |
890 | return ERR_PTR(err); | 957 | return ERR_PTR(err); |
891 | } | 958 | } |
892 | 959 | ||
@@ -985,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
985 | gdp = ext4_get_group_desc(sb, i, NULL); | 1052 | gdp = ext4_get_group_desc(sb, i, NULL); |
986 | if (!gdp) | 1053 | if (!gdp) |
987 | continue; | 1054 | continue; |
988 | desc_count += le16_to_cpu(gdp->bg_free_inodes_count); | 1055 | desc_count += ext4_free_inodes_count(sb, gdp); |
989 | brelse(bitmap_bh); | 1056 | brelse(bitmap_bh); |
990 | bitmap_bh = ext4_read_inode_bitmap(sb, i); | 1057 | bitmap_bh = ext4_read_inode_bitmap(sb, i); |
991 | if (!bitmap_bh) | 1058 | if (!bitmap_bh) |
@@ -993,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
993 | 1060 | ||
994 | x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); | 1061 | x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); |
995 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", | 1062 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", |
996 | i, le16_to_cpu(gdp->bg_free_inodes_count), x); | 1063 | i, ext4_free_inodes_count(sb, gdp), x); |
997 | bitmap_count += x; | 1064 | bitmap_count += x; |
998 | } | 1065 | } |
999 | brelse(bitmap_bh); | 1066 | brelse(bitmap_bh); |
@@ -1007,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
1007 | gdp = ext4_get_group_desc(sb, i, NULL); | 1074 | gdp = ext4_get_group_desc(sb, i, NULL); |
1008 | if (!gdp) | 1075 | if (!gdp) |
1009 | continue; | 1076 | continue; |
1010 | desc_count += le16_to_cpu(gdp->bg_free_inodes_count); | 1077 | desc_count += ext4_free_inodes_count(sb, gdp); |
1011 | cond_resched(); | 1078 | cond_resched(); |
1012 | } | 1079 | } |
1013 | return desc_count; | 1080 | return desc_count; |
@@ -1024,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) | |||
1024 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); | 1091 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); |
1025 | if (!gdp) | 1092 | if (!gdp) |
1026 | continue; | 1093 | continue; |
1027 | count += le16_to_cpu(gdp->bg_used_dirs_count); | 1094 | count += ext4_used_dirs_count(sb, gdp); |
1028 | } | 1095 | } |
1029 | return count; | 1096 | return count; |
1030 | } | 1097 | } |
1031 | |||
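The new ext4_claim_inode() above fixes a subtle ordering problem: the bitmap bit, the free-inode count, the used-dirs count, and the group checksum now all change inside one sb_bgl_lock critical section, so ext4_read_inode_bitmap() can never observe a claimed bit paired with stale counters. A user-space reduction of that claim-under-lock idiom (the non-atomic test_and_set_bit is safe here because the spinlock serializes it, as in the patch):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    #define INODES_PER_GROUP 64

    struct group {
            pthread_spinlock_t lock;
            unsigned char bitmap[INODES_PER_GROUP / 8];
            int free_inodes;
    };

    static int test_and_set_bit(unsigned long nr, unsigned char *map)
    {
            unsigned char mask = 1u << (nr & 7);
            unsigned char old = map[nr >> 3];

            map[nr >> 3] = old | mask;
            return (old & mask) != 0;
    }

    /* Returns 0 if we won the inode, 1 if someone else holds it. */
    static int claim_inode(struct group *g, unsigned long ino)
    {
            int retval = 0;

            pthread_spin_lock(&g->lock);
            if (test_and_set_bit(ino, g->bitmap))
                    retval = 1;             /* not a free inode */
            else
                    g->free_inodes--;       /* counters move with the bit */
            pthread_spin_unlock(&g->lock);
            return retval;
    }

    int main(void)
    {
            struct group g = { .free_inodes = INODES_PER_GROUP };

            pthread_spin_init(&g.lock, PTHREAD_PROCESS_PRIVATE);
            memset(g.bitmap, 0, sizeof(g.bitmap));
            printf("first claim:  %d\n", claim_inode(&g, 12));  /* 0: won */
            printf("second claim: %d\n", claim_inode(&g, 12));  /* 1: lost */
            printf("free left:    %d\n", g.free_inodes);
            return 0;
    }

Built with -lpthread, the second claim of the same bit reports a loss and the free count drops exactly once.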
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6702a49992a6..a6444cee0c7e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) | |||
72 | * "bh" may be NULL: a metadata block may have been freed from memory | 72 | * "bh" may be NULL: a metadata block may have been freed from memory |
73 | * but there may still be a record of it in the journal, and that record | 73 | * but there may still be a record of it in the journal, and that record |
74 | * still needs to be revoked. | 74 | * still needs to be revoked. |
75 | * | ||
76 | * If the handle isn't valid we're not journaling so there's nothing to do. | ||
75 | */ | 77 | */ |
76 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | 78 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, |
77 | struct buffer_head *bh, ext4_fsblk_t blocknr) | 79 | struct buffer_head *bh, ext4_fsblk_t blocknr) |
78 | { | 80 | { |
79 | int err; | 81 | int err; |
80 | 82 | ||
83 | if (!ext4_handle_valid(handle)) | ||
84 | return 0; | ||
85 | |||
81 | might_sleep(); | 86 | might_sleep(); |
82 | 87 | ||
83 | BUFFER_TRACE(bh, "enter"); | 88 | BUFFER_TRACE(bh, "enter"); |
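The early return relies on the convention, introduced by the no-journal series, that ext4_journal_start() hands back a small fake handle value when the filesystem has no journal. ext4_handle_valid() merely tests for that sentinel; roughly (the exact threshold constant is an implementation detail of this series):

    static inline int ext4_handle_valid(handle_t *handle)
    {
            /* "handles" below this value are no-journal reference
             * counts, not real jbd2 handles */
            if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
                    return 0;
            return 1;
    }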
@@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode) | |||
170 | */ | 175 | */ |
171 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | 176 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) |
172 | { | 177 | { |
173 | if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) | 178 | if (!ext4_handle_valid(handle)) |
179 | return 0; | ||
180 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | ||
174 | return 0; | 181 | return 0; |
175 | if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) | 182 | if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) |
176 | return 0; | 183 | return 0; |
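ext4_handle_has_enough_credits() wraps the old direct h_buffer_credits comparison so that a fake no-journal handle always reports enough credits and never forces a restart; a sketch consistent with how it is used in this hunk:

    static inline int ext4_handle_has_enough_credits(handle_t *handle,
                                                     int needed)
    {
            /* a no-journal handle never runs out of credits */
            if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
                    return 0;
            return 1;
    }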
@@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | |||
184 | */ | 191 | */ |
185 | static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) | 192 | static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) |
186 | { | 193 | { |
194 | BUG_ON(EXT4_JOURNAL(inode) == NULL); | ||
187 | jbd_debug(2, "restarting handle %p\n", handle); | 195 | jbd_debug(2, "restarting handle %p\n", handle); |
188 | return ext4_journal_restart(handle, blocks_for_truncate(inode)); | 196 | return ext4_journal_restart(handle, blocks_for_truncate(inode)); |
189 | } | 197 | } |
@@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode) | |||
216 | } | 224 | } |
217 | 225 | ||
218 | if (IS_SYNC(inode)) | 226 | if (IS_SYNC(inode)) |
219 | handle->h_sync = 1; | 227 | ext4_handle_sync(handle); |
220 | inode->i_size = 0; | 228 | inode->i_size = 0; |
221 | err = ext4_mark_inode_dirty(handle, inode); | 229 | err = ext4_mark_inode_dirty(handle, inode); |
222 | if (err) { | 230 | if (err) { |
@@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode) | |||
233 | * enough credits left in the handle to remove the inode from | 241 | * enough credits left in the handle to remove the inode from |
234 | * the orphan list and set the dtime field. | 242 | * the orphan list and set the dtime field. |
235 | */ | 243 | */ |
236 | if (handle->h_buffer_credits < 3) { | 244 | if (!ext4_handle_has_enough_credits(handle, 3)) { |
237 | err = ext4_journal_extend(handle, 3); | 245 | err = ext4_journal_extend(handle, 3); |
238 | if (err > 0) | 246 | if (err > 0) |
239 | err = ext4_journal_restart(handle, 3); | 247 | err = ext4_journal_restart(handle, 3); |
@@ -506,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | |||
506 | * return the total number of blocks to be allocate, including the | 514 | * return the total number of blocks to be allocate, including the |
507 | * direct and indirect blocks. | 515 | * direct and indirect blocks. |
508 | */ | 516 | */ |
509 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, | 517 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, |
510 | int blocks_to_boundary) | 518 | int blocks_to_boundary) |
511 | { | 519 | { |
512 | unsigned long count = 0; | 520 | unsigned int count = 0; |
513 | 521 | ||
514 | /* | 522 | /* |
515 | * Simple case, [t,d]Indirect block(s) has not allocated yet | 523 | * Simple case, [t,d]Indirect block(s) has not allocated yet |
@@ -547,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
547 | int indirect_blks, int blks, | 555 | int indirect_blks, int blks, |
548 | ext4_fsblk_t new_blocks[4], int *err) | 556 | ext4_fsblk_t new_blocks[4], int *err) |
549 | { | 557 | { |
558 | struct ext4_allocation_request ar; | ||
550 | int target, i; | 559 | int target, i; |
551 | unsigned long count = 0, blk_allocated = 0; | 560 | unsigned long count = 0, blk_allocated = 0; |
552 | int index = 0; | 561 | int index = 0; |
@@ -595,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
595 | if (!target) | 604 | if (!target) |
596 | goto allocated; | 605 | goto allocated; |
597 | /* Now allocate data blocks */ | 606 | /* Now allocate data blocks */ |
598 | count = target; | 607 | memset(&ar, 0, sizeof(ar)); |
599 | /* allocating blocks for data blocks */ | 608 | ar.inode = inode; |
600 | current_block = ext4_new_blocks(handle, inode, iblock, | 609 | ar.goal = goal; |
601 | goal, &count, err); | 610 | ar.len = target; |
611 | ar.logical = iblock; | ||
612 | if (S_ISREG(inode->i_mode)) | ||
613 | /* enable in-core preallocation only for regular files */ | ||
614 | ar.flags = EXT4_MB_HINT_DATA; | ||
615 | |||
616 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
617 | |||
602 | if (*err && (target == blks)) { | 618 | if (*err && (target == blks)) { |
603 | /* | 619 | /* |
604 | * if the allocation failed and we didn't allocate | 620 | * if the allocation failed and we didn't allocate |
@@ -614,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
614 | */ | 630 | */ |
615 | new_blocks[index] = current_block; | 631 | new_blocks[index] = current_block; |
616 | } | 632 | } |
617 | blk_allocated += count; | 633 | blk_allocated += ar.len; |
618 | } | 634 | } |
619 | allocated: | 635 | allocated: |
620 | /* total number of blocks allocated for direct blocks */ | 636 | /* total number of blocks allocated for direct blocks */ |
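ext4_new_blocks() is replaced by ext4_mb_new_blocks(), which takes a struct ext4_allocation_request instead of loose arguments, so the multiblock allocator sees the logical offset, goal, and hints in one place and can report the granted length back through ar.len (note the blk_allocated += ar.len above). The fields this hunk relies on look roughly like the following; the real structure carries additional left/right neighbour hints not shown here:

    struct ext4_allocation_request {
            struct inode *inode;    /* file owning the new blocks */
            ext4_lblk_t logical;    /* logical block within the file */
            ext4_fsblk_t goal;      /* preferred physical block */
            unsigned int len;       /* blocks wanted; updated to blocks granted */
            unsigned int flags;     /* hints, e.g. EXT4_MB_HINT_DATA */
    };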
@@ -709,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
709 | set_buffer_uptodate(bh); | 725 | set_buffer_uptodate(bh); |
710 | unlock_buffer(bh); | 726 | unlock_buffer(bh); |
711 | 727 | ||
712 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); | 728 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
713 | err = ext4_journal_dirty_metadata(handle, bh); | 729 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
714 | if (err) | 730 | if (err) |
715 | goto failed; | 731 | goto failed; |
716 | } | 732 | } |
@@ -792,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, | |||
792 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | 808 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. |
793 | */ | 809 | */ |
794 | jbd_debug(5, "splicing indirect only\n"); | 810 | jbd_debug(5, "splicing indirect only\n"); |
795 | BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); | 811 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); |
796 | err = ext4_journal_dirty_metadata(handle, where->bh); | 812 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); |
797 | if (err) | 813 | if (err) |
798 | goto err_out; | 814 | goto err_out; |
799 | } else { | 815 | } else { |
@@ -840,10 +856,10 @@ err_out: | |||
840 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block | 856 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block |
841 | * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) | 857 | * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) |
842 | */ | 858 | */ |
843 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | 859 | static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, |
844 | ext4_lblk_t iblock, unsigned long maxblocks, | 860 | ext4_lblk_t iblock, unsigned int maxblocks, |
845 | struct buffer_head *bh_result, | 861 | struct buffer_head *bh_result, |
846 | int create, int extend_disksize) | 862 | int create, int extend_disksize) |
847 | { | 863 | { |
848 | int err = -EIO; | 864 | int err = -EIO; |
849 | ext4_lblk_t offsets[4]; | 865 | ext4_lblk_t offsets[4]; |
@@ -1045,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | |||
1045 | * It returns the error in case of allocation failure. | 1061 | * It returns the error in case of allocation failure. |
1046 | */ | 1062 | */ |
1047 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | 1063 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, |
1048 | unsigned long max_blocks, struct buffer_head *bh, | 1064 | unsigned int max_blocks, struct buffer_head *bh, |
1049 | int create, int extend_disksize, int flag) | 1065 | int create, int extend_disksize, int flag) |
1050 | { | 1066 | { |
1051 | int retval; | 1067 | int retval; |
@@ -1221,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | |||
1221 | set_buffer_uptodate(bh); | 1237 | set_buffer_uptodate(bh); |
1222 | } | 1238 | } |
1223 | unlock_buffer(bh); | 1239 | unlock_buffer(bh); |
1224 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); | 1240 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
1225 | err = ext4_journal_dirty_metadata(handle, bh); | 1241 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
1226 | if (!fatal) | 1242 | if (!fatal) |
1227 | fatal = err; | 1243 | fatal = err; |
1228 | } else { | 1244 | } else { |
@@ -1335,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, | |||
1335 | pgoff_t index; | 1351 | pgoff_t index; |
1336 | unsigned from, to; | 1352 | unsigned from, to; |
1337 | 1353 | ||
1354 | trace_mark(ext4_write_begin, | ||
1355 | "dev %s ino %lu pos %llu len %u flags %u", | ||
1356 | inode->i_sb->s_id, inode->i_ino, | ||
1357 | (unsigned long long) pos, len, flags); | ||
1338 | index = pos >> PAGE_CACHE_SHIFT; | 1358 | index = pos >> PAGE_CACHE_SHIFT; |
1339 | from = pos & (PAGE_CACHE_SIZE - 1); | 1359 | from = pos & (PAGE_CACHE_SIZE - 1); |
1340 | to = from + len; | 1360 | to = from + len; |
@@ -1387,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) | |||
1387 | if (!buffer_mapped(bh) || buffer_freed(bh)) | 1407 | if (!buffer_mapped(bh) || buffer_freed(bh)) |
1388 | return 0; | 1408 | return 0; |
1389 | set_buffer_uptodate(bh); | 1409 | set_buffer_uptodate(bh); |
1390 | return ext4_journal_dirty_metadata(handle, bh); | 1410 | return ext4_handle_dirty_metadata(handle, NULL, bh); |
1391 | } | 1411 | } |
1392 | 1412 | ||
1393 | /* | 1413 | /* |
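Passing the inode (or NULL, as write_end_fn does above) matters because ext4_handle_dirty_metadata() now has two paths: with a real handle it journals the buffer through jbd2, without one it falls back to plain buffer dirtying, consulting the inode only to decide whether a synchronous write is required. A sketch of the dispatch, expressed in terms of the __ext4_write_dirty_metadata() helper added later in this patch:

    int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
                                     struct inode *inode,
                                     struct buffer_head *bh)
    {
            int err = 0;

            if (ext4_handle_valid(handle)) {
                    /* journaled: hand the buffer to jbd2 as metadata */
                    err = jbd2_journal_dirty_metadata(handle, bh);
                    if (err)
                            ext4_journal_abort_handle(where, __func__,
                                                      bh, handle, err);
            } else {
                    /* no journal: just dirty it, syncing if the
                     * inode (when given) demands it */
                    err = __ext4_write_dirty_metadata(inode, bh);
            }
            return err;
    }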
@@ -1406,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file, | |||
1406 | struct inode *inode = mapping->host; | 1426 | struct inode *inode = mapping->host; |
1407 | int ret = 0, ret2; | 1427 | int ret = 0, ret2; |
1408 | 1428 | ||
1429 | trace_mark(ext4_ordered_write_end, | ||
1430 | "dev %s ino %lu pos %llu len %u copied %u", | ||
1431 | inode->i_sb->s_id, inode->i_ino, | ||
1432 | (unsigned long long) pos, len, copied); | ||
1409 | ret = ext4_jbd2_file_inode(handle, inode); | 1433 | ret = ext4_jbd2_file_inode(handle, inode); |
1410 | 1434 | ||
1411 | if (ret == 0) { | 1435 | if (ret == 0) { |
@@ -1444,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file, | |||
1444 | int ret = 0, ret2; | 1468 | int ret = 0, ret2; |
1445 | loff_t new_i_size; | 1469 | loff_t new_i_size; |
1446 | 1470 | ||
1471 | trace_mark(ext4_writeback_write_end, | ||
1472 | "dev %s ino %lu pos %llu len %u copied %u", | ||
1473 | inode->i_sb->s_id, inode->i_ino, | ||
1474 | (unsigned long long) pos, len, copied); | ||
1447 | new_i_size = pos + copied; | 1475 | new_i_size = pos + copied; |
1448 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 1476 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
1449 | ext4_update_i_disksize(inode, new_i_size); | 1477 | ext4_update_i_disksize(inode, new_i_size); |
@@ -1479,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file, | |||
1479 | unsigned from, to; | 1507 | unsigned from, to; |
1480 | loff_t new_i_size; | 1508 | loff_t new_i_size; |
1481 | 1509 | ||
1510 | trace_mark(ext4_journalled_write_end, | ||
1511 | "dev %s ino %lu pos %llu len %u copied %u", | ||
1512 | inode->i_sb->s_id, inode->i_ino, | ||
1513 | (unsigned long long) pos, len, copied); | ||
1482 | from = pos & (PAGE_CACHE_SIZE - 1); | 1514 | from = pos & (PAGE_CACHE_SIZE - 1); |
1483 | to = from + len; | 1515 | to = from + len; |
1484 | 1516 | ||
@@ -1625,7 +1657,7 @@ struct mpage_da_data { | |||
1625 | get_block_t *get_block; | 1657 | get_block_t *get_block; |
1626 | struct writeback_control *wbc; | 1658 | struct writeback_control *wbc; |
1627 | int io_done; | 1659 | int io_done; |
1628 | long pages_written; | 1660 | int pages_written; |
1629 | int retval; | 1661 | int retval; |
1630 | }; | 1662 | }; |
1631 | 1663 | ||
@@ -1645,35 +1677,39 @@ struct mpage_da_data { | |||
1645 | */ | 1677 | */ |
1646 | static int mpage_da_submit_io(struct mpage_da_data *mpd) | 1678 | static int mpage_da_submit_io(struct mpage_da_data *mpd) |
1647 | { | 1679 | { |
1648 | struct address_space *mapping = mpd->inode->i_mapping; | ||
1649 | int ret = 0, err, nr_pages, i; | ||
1650 | unsigned long index, end; | ||
1651 | struct pagevec pvec; | ||
1652 | long pages_skipped; | 1680 | long pages_skipped; |
1681 | struct pagevec pvec; | ||
1682 | unsigned long index, end; | ||
1683 | int ret = 0, err, nr_pages, i; | ||
1684 | struct inode *inode = mpd->inode; | ||
1685 | struct address_space *mapping = inode->i_mapping; | ||
1653 | 1686 | ||
1654 | BUG_ON(mpd->next_page <= mpd->first_page); | 1687 | BUG_ON(mpd->next_page <= mpd->first_page); |
1655 | pagevec_init(&pvec, 0); | 1688 | /* |
1689 | * We need to walk all pages from first_page to next_page - 1 | ||
1690 | * to make sure we also write the mapped dirty buffer_heads. | ||
1691 | * If we looked at mpd->lbh.b_blocknr we would only be looking | ||
1692 | * at the currently mapped buffer_heads. | ||
1693 | */ | ||
1656 | index = mpd->first_page; | 1694 | index = mpd->first_page; |
1657 | end = mpd->next_page - 1; | 1695 | end = mpd->next_page - 1; |
1658 | 1696 | ||
1697 | pagevec_init(&pvec, 0); | ||
1659 | while (index <= end) { | 1698 | while (index <= end) { |
1660 | /* | 1699 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); |
1661 | * We can use PAGECACHE_TAG_DIRTY lookup here because | ||
1662 | * even though we have cleared the dirty flag on the page | ||
1663 | * We still keep the page in the radix tree with tag | ||
1664 | * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. | ||
1665 | * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback | ||
1666 | * which is called via the below writepage callback. | ||
1667 | */ | ||
1668 | nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | ||
1669 | PAGECACHE_TAG_DIRTY, | ||
1670 | min(end - index, | ||
1671 | (pgoff_t)PAGEVEC_SIZE-1) + 1); | ||
1672 | if (nr_pages == 0) | 1700 | if (nr_pages == 0) |
1673 | break; | 1701 | break; |
1674 | for (i = 0; i < nr_pages; i++) { | 1702 | for (i = 0; i < nr_pages; i++) { |
1675 | struct page *page = pvec.pages[i]; | 1703 | struct page *page = pvec.pages[i]; |
1676 | 1704 | ||
1705 | index = page->index; | ||
1706 | if (index > end) | ||
1707 | break; | ||
1708 | index++; | ||
1709 | |||
1710 | BUG_ON(!PageLocked(page)); | ||
1711 | BUG_ON(PageWriteback(page)); | ||
1712 | |||
1677 | pages_skipped = mpd->wbc->pages_skipped; | 1713 | pages_skipped = mpd->wbc->pages_skipped; |
1678 | err = mapping->a_ops->writepage(page, mpd->wbc); | 1714 | err = mapping->a_ops->writepage(page, mpd->wbc); |
1679 | if (!err && (pages_skipped == mpd->wbc->pages_skipped)) | 1715 | if (!err && (pages_skipped == mpd->wbc->pages_skipped)) |
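The rewritten loop is the standard pagevec walk: grab up to PAGEVEC_SIZE pages at a time starting at index, and stop once a page past end shows up. Unlike pagevec_lookup_tag(), pagevec_lookup() does not advance the start index itself, which is why the hunk updates index from page->index by hand. A minimal standalone sketch of the same pattern:

    static void walk_range(struct address_space *mapping,
                           pgoff_t index, pgoff_t end)
    {
            struct pagevec pvec;
            int i, nr_pages;

            pagevec_init(&pvec, 0);
            while (index <= end) {
                    nr_pages = pagevec_lookup(&pvec, mapping, index,
                                              PAGEVEC_SIZE);
                    if (nr_pages == 0)
                            break;          /* mapping exhausted */
                    for (i = 0; i < nr_pages; i++) {
                            struct page *page = pvec.pages[i];

                            if (page->index > end)
                                    break;  /* ran past the range */
                            index = page->index + 1;
                            /* ... operate on page ... */
                    }
                    pagevec_release(&pvec); /* drop the page references */
            }
    }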
@@ -1831,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
1831 | ext4_count_free_blocks(inode->i_sb)); | 1867 | ext4_count_free_blocks(inode->i_sb)); |
1832 | printk(KERN_EMERG "Free/Dirty block details\n"); | 1868 | printk(KERN_EMERG "Free/Dirty block details\n"); |
1833 | printk(KERN_EMERG "free_blocks=%lld\n", | 1869 | printk(KERN_EMERG "free_blocks=%lld\n", |
1834 | percpu_counter_sum(&sbi->s_freeblocks_counter)); | 1870 | (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); |
1835 | printk(KERN_EMERG "dirty_blocks=%lld\n", | 1871 | printk(KERN_EMERG "dirty_blocks=%lld\n", |
1836 | percpu_counter_sum(&sbi->s_dirtyblocks_counter)); | 1872 | (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); |
1837 | printk(KERN_EMERG "Block reservation details\n"); | 1873 | printk(KERN_EMERG "Block reservation details\n"); |
1838 | printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", | 1874 | printk(KERN_EMERG "i_reserved_data_blocks=%u\n", |
1839 | EXT4_I(inode)->i_reserved_data_blocks); | 1875 | EXT4_I(inode)->i_reserved_data_blocks); |
1840 | printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", | 1876 | printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", |
1841 | EXT4_I(inode)->i_reserved_meta_blocks); | 1877 | EXT4_I(inode)->i_reserved_meta_blocks); |
1842 | return; | 1878 | return; |
1843 | } | 1879 | } |
@@ -2087,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page, | |||
2087 | bh = head; | 2123 | bh = head; |
2088 | do { | 2124 | do { |
2089 | BUG_ON(buffer_locked(bh)); | 2125 | BUG_ON(buffer_locked(bh)); |
2126 | /* | ||
2127 | * We need to try to allocate | ||
2128 | * unmapped blocks in the same page. | ||
2129 | * Otherwise we won't make progress | ||
2130 | * with the page in ext4_da_writepage | ||
2131 | */ | ||
2090 | if (buffer_dirty(bh) && | 2132 | if (buffer_dirty(bh) && |
2091 | (!buffer_mapped(bh) || buffer_delay(bh))) { | 2133 | (!buffer_mapped(bh) || buffer_delay(bh))) { |
2092 | mpage_add_bh_to_extent(mpd, logical, bh); | 2134 | mpage_add_bh_to_extent(mpd, logical, bh); |
2093 | if (mpd->io_done) | 2135 | if (mpd->io_done) |
2094 | return MPAGE_DA_EXTENT_TAIL; | 2136 | return MPAGE_DA_EXTENT_TAIL; |
2137 | } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { | ||
2138 | /* | ||
2139 | * mapped dirty buffer. We need to update | ||
2140 | * the b_state because we look at | ||
2141 | * b_state in mpage_da_map_blocks. We don't | ||
2142 | * update b_size because if we find an | ||
2143 | * unmapped buffer_head later we need to | ||
2144 | * use the b_state flag of that buffer_head. | ||
2145 | */ | ||
2146 | if (mpd->lbh.b_size == 0) | ||
2147 | mpd->lbh.b_state = | ||
2148 | bh->b_state & BH_FLAGS; | ||
2095 | } | 2149 | } |
2096 | logical++; | 2150 | logical++; |
2097 | } while ((bh = bh->b_this_page) != head); | 2151 | } while ((bh = bh->b_this_page) != head); |
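mpd->lbh is not a live buffer_head; it is a cursor that accumulates the state of the delayed-allocation extent being built, and only a few b_state bits are meaningful to mpage_da_map_blocks(). BH_FLAGS is a mask along these lines, as defined near the top of inode.c in this series (the exact bit set is an assumption here):

    /* b_state bits that mpage_da_map_blocks cares about */
    #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
                      (1 << BH_Delay))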
@@ -2269,10 +2323,13 @@ static int ext4_da_writepage(struct page *page, | |||
2269 | { | 2323 | { |
2270 | int ret = 0; | 2324 | int ret = 0; |
2271 | loff_t size; | 2325 | loff_t size; |
2272 | unsigned long len; | 2326 | unsigned int len; |
2273 | struct buffer_head *page_bufs; | 2327 | struct buffer_head *page_bufs; |
2274 | struct inode *inode = page->mapping->host; | 2328 | struct inode *inode = page->mapping->host; |
2275 | 2329 | ||
2330 | trace_mark(ext4_da_writepage, | ||
2331 | "dev %s ino %lu page_index %lu", | ||
2332 | inode->i_sb->s_id, inode->i_ino, page->index); | ||
2276 | size = i_size_read(inode); | 2333 | size = i_size_read(inode); |
2277 | if (page->index == size >> PAGE_CACHE_SHIFT) | 2334 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2278 | len = size & ~PAGE_CACHE_MASK; | 2335 | len = size & ~PAGE_CACHE_MASK; |
@@ -2378,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2378 | struct mpage_da_data mpd; | 2435 | struct mpage_da_data mpd; |
2379 | struct inode *inode = mapping->host; | 2436 | struct inode *inode = mapping->host; |
2380 | int no_nrwrite_index_update; | 2437 | int no_nrwrite_index_update; |
2381 | long pages_written = 0, pages_skipped; | 2438 | int pages_written = 0; |
2439 | long pages_skipped; | ||
2382 | int needed_blocks, ret = 0, nr_to_writebump = 0; | 2440 | int needed_blocks, ret = 0, nr_to_writebump = 0; |
2383 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2441 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2384 | 2442 | ||
2443 | trace_mark(ext4_da_writepages, | ||
2444 | "dev %s ino %lu nr_t_write %ld " | ||
2445 | "pages_skipped %ld range_start %llu " | ||
2446 | "range_end %llu nonblocking %d " | ||
2447 | "for_kupdate %d for_reclaim %d " | ||
2448 | "for_writepages %d range_cyclic %d", | ||
2449 | inode->i_sb->s_id, inode->i_ino, | ||
2450 | wbc->nr_to_write, wbc->pages_skipped, | ||
2451 | (unsigned long long) wbc->range_start, | ||
2452 | (unsigned long long) wbc->range_end, | ||
2453 | wbc->nonblocking, wbc->for_kupdate, | ||
2454 | wbc->for_reclaim, wbc->for_writepages, | ||
2455 | wbc->range_cyclic); | ||
2456 | |||
2385 | /* | 2457 | /* |
2386 | * No pages to write? This is mainly a kludge to avoid starting | 2458 | * No pages to write? This is mainly a kludge to avoid starting |
2387 | * a transaction for special inodes like journal inode on last iput() | 2459 | * a transaction for special inodes like journal inode on last iput() |
@@ -2389,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2389 | */ | 2461 | */ |
2390 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 2462 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
2391 | return 0; | 2463 | return 0; |
2464 | |||
2465 | /* | ||
2466 | * If the filesystem has aborted, it is read-only, so return | ||
2467 | * right away instead of dumping stack traces later on that | ||
2468 | * will obscure the real source of the problem. We test | ||
2469 | * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because | ||
2470 | * the latter could be true if the filesystem is mounted | ||
2471 | * read-only, and in that case, ext4_da_writepages should | ||
2472 | * *never* be called, so if that ever happens, we would want | ||
2473 | * the stack trace. | ||
2474 | */ | ||
2475 | if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) | ||
2476 | return -EROFS; | ||
2477 | |||
2392 | /* | 2478 | /* |
2393 | * Make sure nr_to_write is >= sbi->s_mb_stream_request | 2479 | * Make sure nr_to_write is >= sbi->s_mb_stream_request |
2394 | * This makes sure blocks of small files are allocated in | 2480 | * This makes sure blocks of small files are allocated in |
@@ -2433,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2433 | handle = ext4_journal_start(inode, needed_blocks); | 2519 | handle = ext4_journal_start(inode, needed_blocks); |
2434 | if (IS_ERR(handle)) { | 2520 | if (IS_ERR(handle)) { |
2435 | ret = PTR_ERR(handle); | 2521 | ret = PTR_ERR(handle); |
2436 | printk(KERN_EMERG "%s: jbd2_start: " | 2522 | printk(KERN_CRIT "%s: jbd2_start: " |
2437 | "%ld pages, ino %lu; err %d\n", __func__, | 2523 | "%ld pages, ino %lu; err %d\n", __func__, |
2438 | wbc->nr_to_write, inode->i_ino, ret); | 2524 | wbc->nr_to_write, inode->i_ino, ret); |
2439 | dump_stack(); | 2525 | dump_stack(); |
@@ -2486,6 +2572,14 @@ out_writepages: | |||
2486 | if (!no_nrwrite_index_update) | 2572 | if (!no_nrwrite_index_update) |
2487 | wbc->no_nrwrite_index_update = 0; | 2573 | wbc->no_nrwrite_index_update = 0; |
2488 | wbc->nr_to_write -= nr_to_writebump; | 2574 | wbc->nr_to_write -= nr_to_writebump; |
2575 | trace_mark(ext4_da_writepage_result, | ||
2576 | "dev %s ino %lu ret %d pages_written %d " | ||
2577 | "pages_skipped %ld congestion %d " | ||
2578 | "more_io %d no_nrwrite_index_update %d", | ||
2579 | inode->i_sb->s_id, inode->i_ino, ret, | ||
2580 | pages_written, wbc->pages_skipped, | ||
2581 | wbc->encountered_congestion, wbc->more_io, | ||
2582 | wbc->no_nrwrite_index_update); | ||
2489 | return ret; | 2583 | return ret; |
2490 | } | 2584 | } |
2491 | 2585 | ||
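The trace_mark() calls sprinkled through these hunks are kernel markers, the pre-tracepoints instrumentation API: each names a static site plus a printf-style format, and compiles to a near-no-op until a probe attaches at runtime. A hedged sketch of how such a marker would be consumed (marker API of this kernel generation; signatures quoted from memory, so verify against linux/marker.h):

    #include <linux/marker.h>

    static void probe_da_writepages(void *probe_private, void *call_private,
                                    const char *fmt, va_list *args)
    {
            /* decode the va_list according to fmt and record it */
    }

    static int __init my_probe_init(void)
    {
            return marker_probe_register("ext4_da_writepages",
                            "dev %s ino %lu nr_t_write %ld "
                            "pages_skipped %ld range_start %llu "
                            "range_end %llu nonblocking %d "
                            "for_kupdate %d for_reclaim %d "
                            "for_writepages %d range_cyclic %d",
                            probe_da_writepages, NULL);
    }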
@@ -2498,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2498 | /* | 2592 | /* |
2499 | * switch to non delalloc mode if we are running low | 2593 | * switch to non delalloc mode if we are running low |
2500 | * on free blocks. The free block accounting via percpu | 2594 | * on free blocks. The free block accounting via percpu |
2501 | * counters can get slightly wrong with FBC_BATCH getting | 2595 | * counters can get slightly wrong with percpu_counter_batch getting |
2502 | * accumulated on each CPU without updating global counters | 2596 | * accumulated on each CPU without updating global counters |
2503 | * Delalloc needs accurate free block accounting. So switch | 2597 | * Delalloc needs accurate free block accounting. So switch |
2504 | * to non delalloc when we are near the error range. | 2598 | * to non delalloc when we are near the error range. |
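Each CPU may hold up to percpu_counter_batch unflushed increments, so a lockless read of the global value can be off by roughly percpu_counter_batch * nr_cpu_ids blocks. The switch-over test in ext4_nonda_switch() is built around a watermark of that order; sketched below, with the constant factor an assumption:

    /* worst-case drift of the lockless percpu counter read */
    #define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))

    static int ext4_nonda_switch(struct super_block *sb)
    {
            struct ext4_sb_info *sbi = EXT4_SB(sb);
            s64 free_blocks, dirty_blocks;

            free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
            dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);

            /* fall back to non-delalloc when the apparent headroom
             * could be eaten entirely by counter drift */
            if (2 * free_blocks < 3 * dirty_blocks ||
                free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK))
                    return 1;
            return 0;
    }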
@@ -2537,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | |||
2537 | len, flags, pagep, fsdata); | 2631 | len, flags, pagep, fsdata); |
2538 | } | 2632 | } |
2539 | *fsdata = (void *)0; | 2633 | *fsdata = (void *)0; |
2634 | |||
2635 | trace_mark(ext4_da_write_begin, | ||
2636 | "dev %s ino %lu pos %llu len %u flags %u", | ||
2637 | inode->i_sb->s_id, inode->i_ino, | ||
2638 | (unsigned long long) pos, len, flags); | ||
2540 | retry: | 2639 | retry: |
2541 | /* | 2640 | /* |
2542 | * With delayed allocation, we don't log the i_disksize update | 2641 | * With delayed allocation, we don't log the i_disksize update |
@@ -2626,6 +2725,10 @@ static int ext4_da_write_end(struct file *file, | |||
2626 | } | 2725 | } |
2627 | } | 2726 | } |
2628 | 2727 | ||
2728 | trace_mark(ext4_da_write_end, | ||
2729 | "dev %s ino %lu pos %llu len %u copied %u", | ||
2730 | inode->i_sb->s_id, inode->i_ino, | ||
2731 | (unsigned long long) pos, len, copied); | ||
2629 | start = pos & (PAGE_CACHE_SIZE - 1); | 2732 | start = pos & (PAGE_CACHE_SIZE - 1); |
2630 | end = start + copied - 1; | 2733 | end = start + copied - 1; |
2631 | 2734 | ||
@@ -2718,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |||
2718 | filemap_write_and_wait(mapping); | 2821 | filemap_write_and_wait(mapping); |
2719 | } | 2822 | } |
2720 | 2823 | ||
2721 | if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { | 2824 | BUG_ON(!EXT4_JOURNAL(inode) && |
2825 | EXT4_I(inode)->i_state & EXT4_STATE_JDATA); | ||
2826 | |||
2827 | if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { | ||
2722 | /* | 2828 | /* |
2723 | * This is a REALLY heavyweight approach, but the use of | 2829 | * This is a REALLY heavyweight approach, but the use of |
2724 | * bmap on dirty files is expected to be extremely rare: | 2830 | * bmap on dirty files is expected to be extremely rare: |
@@ -2836,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page, | |||
2836 | loff_t size = i_size_read(inode); | 2942 | loff_t size = i_size_read(inode); |
2837 | loff_t len; | 2943 | loff_t len; |
2838 | 2944 | ||
2945 | trace_mark(ext4_normal_writepage, | ||
2946 | "dev %s ino %lu page_index %lu", | ||
2947 | inode->i_sb->s_id, inode->i_ino, page->index); | ||
2839 | J_ASSERT(PageLocked(page)); | 2948 | J_ASSERT(PageLocked(page)); |
2840 | if (page->index == size >> PAGE_CACHE_SHIFT) | 2949 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2841 | len = size & ~PAGE_CACHE_MASK; | 2950 | len = size & ~PAGE_CACHE_MASK; |
@@ -2921,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page, | |||
2921 | loff_t size = i_size_read(inode); | 3030 | loff_t size = i_size_read(inode); |
2922 | loff_t len; | 3031 | loff_t len; |
2923 | 3032 | ||
3033 | trace_mark(ext4_journalled_writepage, | ||
3034 | "dev %s ino %lu page_index %lu", | ||
3035 | inode->i_sb->s_id, inode->i_ino, page->index); | ||
2924 | J_ASSERT(PageLocked(page)); | 3036 | J_ASSERT(PageLocked(page)); |
2925 | if (page->index == size >> PAGE_CACHE_SHIFT) | 3037 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2926 | len = size & ~PAGE_CACHE_MASK; | 3038 | len = size & ~PAGE_CACHE_MASK; |
@@ -2989,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) | |||
2989 | if (offset == 0) | 3101 | if (offset == 0) |
2990 | ClearPageChecked(page); | 3102 | ClearPageChecked(page); |
2991 | 3103 | ||
2992 | jbd2_journal_invalidatepage(journal, page, offset); | 3104 | if (journal) |
3105 | jbd2_journal_invalidatepage(journal, page, offset); | ||
3106 | else | ||
3107 | block_invalidatepage(page, offset); | ||
2993 | } | 3108 | } |
2994 | 3109 | ||
2995 | static int ext4_releasepage(struct page *page, gfp_t wait) | 3110 | static int ext4_releasepage(struct page *page, gfp_t wait) |
@@ -2999,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
2999 | WARN_ON(PageChecked(page)); | 3114 | WARN_ON(PageChecked(page)); |
3000 | if (!page_has_buffers(page)) | 3115 | if (!page_has_buffers(page)) |
3001 | return 0; | 3116 | return 0; |
3002 | return jbd2_journal_try_to_free_buffers(journal, page, wait); | 3117 | if (journal) |
3118 | return jbd2_journal_try_to_free_buffers(journal, page, wait); | ||
3119 | else | ||
3120 | return try_to_free_buffers(page); | ||
3003 | } | 3121 | } |
3004 | 3122 | ||
3005 | /* | 3123 | /* |
@@ -3271,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle, | |||
3271 | 3389 | ||
3272 | err = 0; | 3390 | err = 0; |
3273 | if (ext4_should_journal_data(inode)) { | 3391 | if (ext4_should_journal_data(inode)) { |
3274 | err = ext4_journal_dirty_metadata(handle, bh); | 3392 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
3275 | } else { | 3393 | } else { |
3276 | if (ext4_should_order_data(inode)) | 3394 | if (ext4_should_order_data(inode)) |
3277 | err = ext4_jbd2_file_inode(handle, inode); | 3395 | err = ext4_jbd2_file_inode(handle, inode); |
@@ -3395,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
3395 | __le32 *p; | 3513 | __le32 *p; |
3396 | if (try_to_extend_transaction(handle, inode)) { | 3514 | if (try_to_extend_transaction(handle, inode)) { |
3397 | if (bh) { | 3515 | if (bh) { |
3398 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); | 3516 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
3399 | ext4_journal_dirty_metadata(handle, bh); | 3517 | ext4_handle_dirty_metadata(handle, inode, bh); |
3400 | } | 3518 | } |
3401 | ext4_mark_inode_dirty(handle, inode); | 3519 | ext4_mark_inode_dirty(handle, inode); |
3402 | ext4_journal_test_restart(handle, inode); | 3520 | ext4_journal_test_restart(handle, inode); |
@@ -3496,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
3496 | count, block_to_free_p, p); | 3614 | count, block_to_free_p, p); |
3497 | 3615 | ||
3498 | if (this_bh) { | 3616 | if (this_bh) { |
3499 | BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); | 3617 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); |
3500 | 3618 | ||
3501 | /* | 3619 | /* |
3502 | * The buffer head should have an attached journal head at this | 3620 | * The buffer head should have an attached journal head at this |
@@ -3505,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
3505 | * the block was cleared. Check for this instead of OOPSing. | 3623 | * the block was cleared. Check for this instead of OOPSing. |
3506 | */ | 3624 | */ |
3507 | if (bh2jh(this_bh)) | 3625 | if (bh2jh(this_bh)) |
3508 | ext4_journal_dirty_metadata(handle, this_bh); | 3626 | ext4_handle_dirty_metadata(handle, inode, this_bh); |
3509 | else | 3627 | else |
3510 | ext4_error(inode->i_sb, __func__, | 3628 | ext4_error(inode->i_sb, __func__, |
3511 | "circular indirect block detected, " | 3629 | "circular indirect block detected, " |
@@ -3535,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3535 | ext4_fsblk_t nr; | 3653 | ext4_fsblk_t nr; |
3536 | __le32 *p; | 3654 | __le32 *p; |
3537 | 3655 | ||
3538 | if (is_handle_aborted(handle)) | 3656 | if (ext4_handle_is_aborted(handle)) |
3539 | return; | 3657 | return; |
3540 | 3658 | ||
3541 | if (depth--) { | 3659 | if (depth--) { |
@@ -3605,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3605 | * will merely complain about releasing a free block, | 3723 | * will merely complain about releasing a free block, |
3606 | * rather than leaking blocks. | 3724 | * rather than leaking blocks. |
3607 | */ | 3725 | */ |
3608 | if (is_handle_aborted(handle)) | 3726 | if (ext4_handle_is_aborted(handle)) |
3609 | return; | 3727 | return; |
3610 | if (try_to_extend_transaction(handle, inode)) { | 3728 | if (try_to_extend_transaction(handle, inode)) { |
3611 | ext4_mark_inode_dirty(handle, inode); | 3729 | ext4_mark_inode_dirty(handle, inode); |
@@ -3624,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3624 | parent_bh)){ | 3742 | parent_bh)){ |
3625 | *p = 0; | 3743 | *p = 0; |
3626 | BUFFER_TRACE(parent_bh, | 3744 | BUFFER_TRACE(parent_bh, |
3627 | "call ext4_journal_dirty_metadata"); | 3745 | "call ext4_handle_dirty_metadata"); |
3628 | ext4_journal_dirty_metadata(handle, | 3746 | ext4_handle_dirty_metadata(handle, |
3629 | parent_bh); | 3747 | inode, |
3748 | parent_bh); | ||
3630 | } | 3749 | } |
3631 | } | 3750 | } |
3632 | } | 3751 | } |
@@ -3814,7 +3933,7 @@ do_indirects: | |||
3814 | * synchronous | 3933 | * synchronous |
3815 | */ | 3934 | */ |
3816 | if (IS_SYNC(inode)) | 3935 | if (IS_SYNC(inode)) |
3817 | handle->h_sync = 1; | 3936 | ext4_handle_sync(handle); |
3818 | out_stop: | 3937 | out_stop: |
3819 | /* | 3938 | /* |
3820 | * If this was a simple ftruncate(), and the file will remain alive | 3939 | * If this was a simple ftruncate(), and the file will remain alive |
@@ -3844,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode, | |||
3844 | ext4_fsblk_t block; | 3963 | ext4_fsblk_t block; |
3845 | int inodes_per_block, inode_offset; | 3964 | int inodes_per_block, inode_offset; |
3846 | 3965 | ||
3847 | iloc->bh = 0; | 3966 | iloc->bh = NULL; |
3848 | if (!ext4_valid_inum(sb, inode->i_ino)) | 3967 | if (!ext4_valid_inum(sb, inode->i_ino)) |
3849 | return -EIO; | 3968 | return -EIO; |
3850 | 3969 | ||
@@ -3951,7 +4070,7 @@ make_io: | |||
3951 | num = EXT4_INODES_PER_GROUP(sb); | 4070 | num = EXT4_INODES_PER_GROUP(sb); |
3952 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 4071 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
3953 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) | 4072 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) |
3954 | num -= le16_to_cpu(gdp->bg_itable_unused); | 4073 | num -= ext4_itable_unused_count(sb, gdp); |
3955 | table += num / inodes_per_block; | 4074 | table += num / inodes_per_block; |
3956 | if (end > table) | 4075 | if (end > table) |
3957 | end = table; | 4076 | end = table; |
@@ -4313,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4313 | EXT4_SET_RO_COMPAT_FEATURE(sb, | 4432 | EXT4_SET_RO_COMPAT_FEATURE(sb, |
4314 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); | 4433 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); |
4315 | sb->s_dirt = 1; | 4434 | sb->s_dirt = 1; |
4316 | handle->h_sync = 1; | 4435 | ext4_handle_sync(handle); |
4317 | err = ext4_journal_dirty_metadata(handle, | 4436 | err = ext4_handle_dirty_metadata(handle, inode, |
4318 | EXT4_SB(sb)->s_sbh); | 4437 | EXT4_SB(sb)->s_sbh); |
4319 | } | 4438 | } |
4320 | } | 4439 | } |
@@ -4341,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4341 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); | 4460 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); |
4342 | } | 4461 | } |
4343 | 4462 | ||
4344 | 4463 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | |
4345 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); | 4464 | rc = ext4_handle_dirty_metadata(handle, inode, bh); |
4346 | rc = ext4_journal_dirty_metadata(handle, bh); | ||
4347 | if (!err) | 4465 | if (!err) |
4348 | err = rc; | 4466 | err = rc; |
4349 | ei->i_state &= ~EXT4_STATE_NEW; | 4467 | ei->i_state &= ~EXT4_STATE_NEW; |
@@ -4406,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait) | |||
4406 | return ext4_force_commit(inode->i_sb); | 4524 | return ext4_force_commit(inode->i_sb); |
4407 | } | 4525 | } |
4408 | 4526 | ||
4527 | int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) | ||
4528 | { | ||
4529 | int err = 0; | ||
4530 | |||
4531 | mark_buffer_dirty(bh); | ||
4532 | if (inode && inode_needs_sync(inode)) { | ||
4533 | sync_dirty_buffer(bh); | ||
4534 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | ||
4535 | ext4_error(inode->i_sb, __func__, | ||
4536 | "IO error syncing inode, " | ||
4537 | "inode=%lu, block=%llu", | ||
4538 | inode->i_ino, | ||
4539 | (unsigned long long)bh->b_blocknr); | ||
4540 | err = -EIO; | ||
4541 | } | ||
4542 | } | ||
4543 | return err; | ||
4544 | } | ||
4545 | |||
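__ext4_write_dirty_metadata() is the no-journal fallback for metadata writes: dirty the buffer, and wait on it only when the inode demands synchronous semantics. inode_needs_sync() is the generic VFS test it leans on, roughly (from include/linux/fs.h of this era):

    static inline int inode_needs_sync(struct inode *inode)
    {
            if (IS_SYNC(inode))
                    return 1;
            if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
                    return 1;
            return 0;
    }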
4409 | /* | 4546 | /* |
4410 | * ext4_setattr() | 4547 | * ext4_setattr() |
4411 | * | 4548 | * |
@@ -4710,16 +4847,15 @@ int | |||
4710 | ext4_reserve_inode_write(handle_t *handle, struct inode *inode, | 4847 | ext4_reserve_inode_write(handle_t *handle, struct inode *inode, |
4711 | struct ext4_iloc *iloc) | 4848 | struct ext4_iloc *iloc) |
4712 | { | 4849 | { |
4713 | int err = 0; | 4850 | int err; |
4714 | if (handle) { | 4851 | |
4715 | err = ext4_get_inode_loc(inode, iloc); | 4852 | err = ext4_get_inode_loc(inode, iloc); |
4716 | if (!err) { | 4853 | if (!err) { |
4717 | BUFFER_TRACE(iloc->bh, "get_write_access"); | 4854 | BUFFER_TRACE(iloc->bh, "get_write_access"); |
4718 | err = ext4_journal_get_write_access(handle, iloc->bh); | 4855 | err = ext4_journal_get_write_access(handle, iloc->bh); |
4719 | if (err) { | 4856 | if (err) { |
4720 | brelse(iloc->bh); | 4857 | brelse(iloc->bh); |
4721 | iloc->bh = NULL; | 4858 | iloc->bh = NULL; |
4722 | } | ||
4723 | } | 4859 | } |
4724 | } | 4860 | } |
4725 | ext4_std_error(inode->i_sb, err); | 4861 | ext4_std_error(inode->i_sb, err); |
@@ -4791,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | |||
4791 | 4927 | ||
4792 | might_sleep(); | 4928 | might_sleep(); |
4793 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 4929 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
4794 | if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && | 4930 | if (ext4_handle_valid(handle) && |
4931 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && | ||
4795 | !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { | 4932 | !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { |
4796 | /* | 4933 | /* |
4797 | * We need extra buffer credits since we may write into EA block | 4934 | * We need extra buffer credits since we may write into EA block |
@@ -4843,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode) | |||
4843 | handle_t *current_handle = ext4_journal_current_handle(); | 4980 | handle_t *current_handle = ext4_journal_current_handle(); |
4844 | handle_t *handle; | 4981 | handle_t *handle; |
4845 | 4982 | ||
4983 | if (!ext4_handle_valid(current_handle)) { | ||
4984 | ext4_mark_inode_dirty(current_handle, inode); | ||
4985 | return; | ||
4986 | } | ||
4987 | |||
4846 | handle = ext4_journal_start(inode, 2); | 4988 | handle = ext4_journal_start(inode, 2); |
4847 | if (IS_ERR(handle)) | 4989 | if (IS_ERR(handle)) |
4848 | goto out; | 4990 | goto out; |
@@ -4880,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) | |||
4880 | BUFFER_TRACE(iloc.bh, "get_write_access"); | 5022 | BUFFER_TRACE(iloc.bh, "get_write_access"); |
4881 | err = jbd2_journal_get_write_access(handle, iloc.bh); | 5023 | err = jbd2_journal_get_write_access(handle, iloc.bh); |
4882 | if (!err) | 5024 | if (!err) |
4883 | err = ext4_journal_dirty_metadata(handle, | 5025 | err = ext4_handle_dirty_metadata(handle, |
4884 | iloc.bh); | 5026 | inode, |
5027 | iloc.bh); | ||
4885 | brelse(iloc.bh); | 5028 | brelse(iloc.bh); |
4886 | } | 5029 | } |
4887 | } | 5030 | } |
@@ -4907,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
4907 | */ | 5050 | */ |
4908 | 5051 | ||
4909 | journal = EXT4_JOURNAL(inode); | 5052 | journal = EXT4_JOURNAL(inode); |
5053 | if (!journal) | ||
5054 | return 0; | ||
4910 | if (is_journal_aborted(journal)) | 5055 | if (is_journal_aborted(journal)) |
4911 | return -EROFS; | 5056 | return -EROFS; |
4912 | 5057 | ||
@@ -4936,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
4936 | return PTR_ERR(handle); | 5081 | return PTR_ERR(handle); |
4937 | 5082 | ||
4938 | err = ext4_mark_inode_dirty(handle, inode); | 5083 | err = ext4_mark_inode_dirty(handle, inode); |
4939 | handle->h_sync = 1; | 5084 | ext4_handle_sync(handle); |
4940 | ext4_journal_stop(handle); | 5085 | ext4_journal_stop(handle); |
4941 | ext4_std_error(inode->i_sb, err); | 5086 | ext4_std_error(inode->i_sb, err); |
4942 | 5087 | ||
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index dc99b4776d58..42dc83fb247a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
99 | goto flags_out; | 99 | goto flags_out; |
100 | } | 100 | } |
101 | if (IS_SYNC(inode)) | 101 | if (IS_SYNC(inode)) |
102 | handle->h_sync = 1; | 102 | ext4_handle_sync(handle); |
103 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 103 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
104 | if (err) | 104 | if (err) |
105 | goto flags_err; | 105 | goto flags_err; |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 444ad998f72e..918aec0c8a11 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -100,7 +100,7 @@ | |||
100 | * inode as: | 100 | * inode as: |
101 | * | 101 | * |
102 | * { page } | 102 | * { page } |
103 | * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | 103 | * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... |
104 | * | 104 | * |
105 | * | 105 | * |
106 | * one block each for bitmap and buddy information. So for each group we | 106 | * one block each for bitmap and buddy information. So for each group we |
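The corrected comment reflects the actual order: for each group, block 2*group of the buddy-cache inode holds the on-disk bitmap copy and block 2*group+1 holds the generated buddy. The page/offset mapping used throughout mballoc (and spelled out again in ext4_mb_get_buddy_cache_lock at the end of this file) falls out directly:

    int blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
    int block = group * 2;                  /* bitmap block; +1 = buddy */
    int pnum  = block / blocks_per_page;    /* page index holding it */
    int poff  = block % blocks_per_page;    /* block offset in that page */
    /* e.g. 4K blocks on 4K pages: group 7 -> bitmap in page 14,
     * buddy in page 15; with 1K blocks both land in page 3 */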
@@ -330,6 +330,18 @@ | |||
330 | * object | 330 | * object |
331 | * | 331 | * |
332 | */ | 332 | */ |
333 | static struct kmem_cache *ext4_pspace_cachep; | ||
334 | static struct kmem_cache *ext4_ac_cachep; | ||
335 | static struct kmem_cache *ext4_free_ext_cachep; | ||
336 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
337 | ext4_group_t group); | ||
338 | static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | ||
339 | ext4_group_t group); | ||
340 | static int ext4_mb_init_per_dev_proc(struct super_block *sb); | ||
341 | static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); | ||
342 | static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); | ||
343 | |||
344 | |||
333 | 345 | ||
334 | static inline void *mb_correct_addr_and_bit(int *bit, void *addr) | 346 | static inline void *mb_correct_addr_and_bit(int *bit, void *addr) |
335 | { | 347 | { |
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, | |||
445 | blocknr += first + i; | 457 | blocknr += first + i; |
446 | blocknr += | 458 | blocknr += |
447 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 459 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
448 | 460 | ext4_grp_locked_error(sb, e4b->bd_group, | |
449 | ext4_error(sb, __func__, "double-free of inode" | 461 | __func__, "double-free of inode" |
450 | " %lu's block %llu(bit %u in group %lu)\n", | 462 | " %lu's block %llu(bit %u in group %u)", |
451 | inode ? inode->i_ino : 0, blocknr, | 463 | inode ? inode->i_ino : 0, blocknr, |
452 | first + i, e4b->bd_group); | 464 | first + i, e4b->bd_group); |
453 | } | 465 | } |
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | |||
477 | b2 = (unsigned char *) bitmap; | 489 | b2 = (unsigned char *) bitmap; |
478 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { | 490 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { |
479 | if (b1[i] != b2[i]) { | 491 | if (b1[i] != b2[i]) { |
480 | printk(KERN_ERR "corruption in group %lu " | 492 | printk(KERN_ERR "corruption in group %u " |
481 | "at byte %u(%u): %x in copy != %x " | 493 | "at byte %u(%u): %x in copy != %x " |
482 | "on disk/prealloc\n", | 494 | "on disk/prealloc\n", |
483 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | 495 | e4b->bd_group, i, i * 8, b1[i], b2[i]); |
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb, | |||
690 | grp->bb_fragments = fragments; | 702 | grp->bb_fragments = fragments; |
691 | 703 | ||
692 | if (free != grp->bb_free) { | 704 | if (free != grp->bb_free) { |
693 | ext4_error(sb, __func__, | 705 | ext4_grp_locked_error(sb, group, __func__, |
694 | "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", | 706 | "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", |
695 | group, free, grp->bb_free); | 707 | group, free, grp->bb_free); |
696 | /* | 708 | /* |
697 | * If we intend to continue, we consider the group descriptor | 709 | * If we intend to continue, we consider the group descriptor |
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, | |||
716 | * stored in the inode as | 728 | * stored in the inode as |
717 | * | 729 | * |
718 | * { page } | 730 | * { page } |
719 | * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | 731 | * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... |
720 | * | 732 | * |
721 | * | 733 | * |
722 | * one block each for bitmap and buddy information. | 734 | * one block each for bitmap and buddy information. |
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
782 | if (bh[i] == NULL) | 794 | if (bh[i] == NULL) |
783 | goto out; | 795 | goto out; |
784 | 796 | ||
785 | if (buffer_uptodate(bh[i]) && | 797 | if (bitmap_uptodate(bh[i])) |
786 | !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) | ||
787 | continue; | 798 | continue; |
788 | 799 | ||
789 | lock_buffer(bh[i]); | 800 | lock_buffer(bh[i]); |
801 | if (bitmap_uptodate(bh[i])) { | ||
802 | unlock_buffer(bh[i]); | ||
803 | continue; | ||
804 | } | ||
790 | spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | 805 | spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); |
791 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 806 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
792 | ext4_init_block_bitmap(sb, bh[i], | 807 | ext4_init_block_bitmap(sb, bh[i], |
793 | first_group + i, desc); | 808 | first_group + i, desc); |
809 | set_bitmap_uptodate(bh[i]); | ||
794 | set_buffer_uptodate(bh[i]); | 810 | set_buffer_uptodate(bh[i]); |
795 | unlock_buffer(bh[i]); | ||
796 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | 811 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); |
812 | unlock_buffer(bh[i]); | ||
797 | continue; | 813 | continue; |
798 | } | 814 | } |
799 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | 815 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); |
816 | if (buffer_uptodate(bh[i])) { | ||
817 | /* | ||
818 | * if the group is not uninit and the bh is | ||
819 | * uptodate, the bitmap is also uptodate | ||
820 | */ | ||
821 | set_bitmap_uptodate(bh[i]); | ||
822 | unlock_buffer(bh[i]); | ||
823 | continue; | ||
824 | } | ||
800 | get_bh(bh[i]); | 825 | get_bh(bh[i]); |
826 | /* | ||
827 | * submit the buffer_head for read. We can | ||
828 | * safely mark the bitmap as uptodate now. | ||
829 | * We do it here so that the bitmap uptodate bit | ||
830 | * gets set with the buffer lock held. | ||
831 | */ | ||
832 | set_bitmap_uptodate(bh[i]); | ||
801 | bh[i]->b_end_io = end_buffer_read_sync; | 833 | bh[i]->b_end_io = end_buffer_read_sync; |
802 | submit_bh(READ, bh[i]); | 834 | submit_bh(READ, bh[i]); |
803 | mb_debug("read bitmap for group %lu\n", first_group + i); | 835 | mb_debug("read bitmap for group %u\n", first_group + i); |
804 | } | 836 | } |
805 | 837 | ||
806 | /* wait for I/O completion */ | 838 | /* wait for I/O completion */ |
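buffer_uptodate() alone is no longer enough for bitmap buffers: with uninit_bg, an uptodate buffer may still hold stale on-disk bytes that ext4_init_block_bitmap() has to overwrite in memory. The new predicate therefore pairs the generic flag with a private buffer_head state bit, along these lines (bit name as used by this series' headers; treat the exact definition as a sketch):

    enum ext4_bh_state_bits {
            /* first bh bit free for filesystem-private use */
            BH_BitmapUptodate = BH_JBDPrivateStart,
    };

    static inline int bitmap_uptodate(struct buffer_head *bh)
    {
            return buffer_uptodate(bh) &&
                   test_bit(BH_BitmapUptodate, &bh->b_state);
    }

    static inline void set_bitmap_uptodate(struct buffer_head *bh)
    {
            set_bit(BH_BitmapUptodate, &bh->b_state);
    }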
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
814 | 846 | ||
815 | err = 0; | 847 | err = 0; |
816 | first_block = page->index * blocks_per_page; | 848 | first_block = page->index * blocks_per_page; |
849 | /* init the page */ | ||
850 | memset(page_address(page), 0xff, PAGE_CACHE_SIZE); | ||
817 | for (i = 0; i < blocks_per_page; i++) { | 851 | for (i = 0; i < blocks_per_page; i++) { |
818 | int group; | 852 | int group; |
819 | struct ext4_group_info *grinfo; | 853 | struct ext4_group_info *grinfo; |
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
840 | BUG_ON(incore == NULL); | 874 | BUG_ON(incore == NULL); |
841 | mb_debug("put buddy for group %u in page %lu/%x\n", | 875 | mb_debug("put buddy for group %u in page %lu/%x\n", |
842 | group, page->index, i * blocksize); | 876 | group, page->index, i * blocksize); |
843 | memset(data, 0xff, blocksize); | ||
844 | grinfo = ext4_get_group_info(sb, group); | 877 | grinfo = ext4_get_group_info(sb, group); |
845 | grinfo->bb_fragments = 0; | 878 | grinfo->bb_fragments = 0; |
846 | memset(grinfo->bb_counters, 0, | 879 | memset(grinfo->bb_counters, 0, |
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
848 | /* | 881 | /* |
849 | * incore got set to the group block bitmap below | 882 | * incore got set to the group block bitmap below |
850 | */ | 883 | */ |
884 | ext4_lock_group(sb, group); | ||
851 | ext4_mb_generate_buddy(sb, data, incore, group); | 885 | ext4_mb_generate_buddy(sb, data, incore, group); |
886 | ext4_unlock_group(sb, group); | ||
852 | incore = NULL; | 887 | incore = NULL; |
853 | } else { | 888 | } else { |
854 | /* this is block of bitmap */ | 889 | /* this is block of bitmap */ |
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
862 | 897 | ||
863 | /* mark all preallocated blks used in in-core bitmap */ | 898 | /* mark all preallocated blks used in in-core bitmap */ |
864 | ext4_mb_generate_from_pa(sb, data, group); | 899 | ext4_mb_generate_from_pa(sb, data, group); |
900 | ext4_mb_generate_from_freelist(sb, data, group); | ||
865 | ext4_unlock_group(sb, group); | 901 | ext4_unlock_group(sb, group); |
866 | 902 | ||
867 | /* set incore so that the buddy information can be | 903 | /* set incore so that the buddy information can be |
@@ -886,18 +922,20 @@ static noinline_for_stack int | |||
886 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | 922 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, |
887 | struct ext4_buddy *e4b) | 923 | struct ext4_buddy *e4b) |
888 | { | 924 | { |
889 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
890 | struct inode *inode = sbi->s_buddy_cache; | ||
891 | int blocks_per_page; | 925 | int blocks_per_page; |
892 | int block; | 926 | int block; |
893 | int pnum; | 927 | int pnum; |
894 | int poff; | 928 | int poff; |
895 | struct page *page; | 929 | struct page *page; |
896 | int ret; | 930 | int ret; |
931 | struct ext4_group_info *grp; | ||
932 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
933 | struct inode *inode = sbi->s_buddy_cache; | ||
897 | 934 | ||
898 | mb_debug("load group %lu\n", group); | 935 | mb_debug("load group %u\n", group); |
899 | 936 | ||
900 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | 937 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; |
938 | grp = ext4_get_group_info(sb, group); | ||
901 | 939 | ||
902 | e4b->bd_blkbits = sb->s_blocksize_bits; | 940 | e4b->bd_blkbits = sb->s_blocksize_bits; |
903 | e4b->bd_info = ext4_get_group_info(sb, group); | 941 | e4b->bd_info = ext4_get_group_info(sb, group); |
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
905 | e4b->bd_group = group; | 943 | e4b->bd_group = group; |
906 | e4b->bd_buddy_page = NULL; | 944 | e4b->bd_buddy_page = NULL; |
907 | e4b->bd_bitmap_page = NULL; | 945 | e4b->bd_bitmap_page = NULL; |
946 | e4b->alloc_semp = &grp->alloc_sem; | ||
947 | |||
948 | /* Take the read lock on the group alloc | ||
949 | * sem. This makes sure a parallel | ||
950 | * ext4_mb_init_group happening on other | ||
951 | * groups mapped by the page is blocked | ||
952 | * till we are done with allocation | ||
953 | */ | ||
954 | down_read(e4b->alloc_semp); | ||
908 | 955 | ||
909 | /* | 956 | /* |
910 | * the buddy cache inode stores the block bitmap | 957 | * the buddy cache inode stores the block bitmap |
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
920 | page = find_get_page(inode->i_mapping, pnum); | 967 | page = find_get_page(inode->i_mapping, pnum); |
921 | if (page == NULL || !PageUptodate(page)) { | 968 | if (page == NULL || !PageUptodate(page)) { |
922 | if (page) | 969 | if (page) |
970 | /* | ||
971 | * drop the page reference and try | ||
972 | * to get the page with lock. If we | ||
973 | * are not uptodate that implies | ||
974 | * somebody just created the page but | ||
975 | * has not yet initialized it. So | ||
976 | * wait for it to be initialized. | ||
977 | */ | ||
923 | page_cache_release(page); | 978 | page_cache_release(page); |
924 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | 979 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); |
925 | if (page) { | 980 | if (page) { |
@@ -985,6 +1040,9 @@ err: | |||
985 | page_cache_release(e4b->bd_buddy_page); | 1040 | page_cache_release(e4b->bd_buddy_page); |
986 | e4b->bd_buddy = NULL; | 1041 | e4b->bd_buddy = NULL; |
987 | e4b->bd_bitmap = NULL; | 1042 | e4b->bd_bitmap = NULL; |
1043 | |||
1044 | /* Done with the buddy cache */ | ||
1045 | up_read(e4b->alloc_semp); | ||
988 | return ret; | 1046 | return ret; |
989 | } | 1047 | } |
990 | 1048 | ||
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b) | |||
994 | page_cache_release(e4b->bd_bitmap_page); | 1052 | page_cache_release(e4b->bd_bitmap_page); |
995 | if (e4b->bd_buddy_page) | 1053 | if (e4b->bd_buddy_page) |
996 | page_cache_release(e4b->bd_buddy_page); | 1054 | page_cache_release(e4b->bd_buddy_page); |
1055 | /* Done with the buddy cache */ | ||
1056 | if (e4b->alloc_semp) | ||
1057 | up_read(e4b->alloc_semp); | ||
997 | } | 1058 | } |
998 | 1059 | ||
999 | 1060 | ||
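Every ext4_group_info now carries an alloc_sem rw_semaphore. Allocators hold it shared from ext4_mb_load_buddy() until ext4_mb_release_desc() or, when an allocation succeeded, until the context is released via ac->alloc_semp, while a rebuild of a buddy page takes the semaphore of every group that page maps exclusively. A sketch of the exclusive side, with a hypothetical helper name (the real locking lives in ext4_mb_get_buddy_cache_lock, begun at the end of this file):

    static void ext4_mb_page_groups_write_lock(struct super_block *sb,
                                               ext4_group_t first_group,
                                               int groups_per_page)
    {
            int i;

            /* block all shared holders in every group backed by
             * the buddy-cache page before regenerating it */
            for (i = 0; i < groups_per_page; i++)
                    down_write(&ext4_get_group_info(sb,
                                    first_group + i)->alloc_sem);
    }

A mirror loop of up_write() calls releases the groups once the page has been rebuilt.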
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) | |||
1031 | cur += 32; | 1092 | cur += 32; |
1032 | continue; | 1093 | continue; |
1033 | } | 1094 | } |
1034 | mb_clear_bit_atomic(lock, cur, bm); | 1095 | if (lock) |
1096 | mb_clear_bit_atomic(lock, cur, bm); | ||
1097 | else | ||
1098 | mb_clear_bit(cur, bm); | ||
1035 | cur++; | 1099 | cur++; |
1036 | } | 1100 | } |
1037 | } | 1101 | } |
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | |||
1049 | cur += 32; | 1113 | cur += 32; |
1050 | continue; | 1114 | continue; |
1051 | } | 1115 | } |
1052 | mb_set_bit_atomic(lock, cur, bm); | 1116 | if (lock) |
1117 | mb_set_bit_atomic(lock, cur, bm); | ||
1118 | else | ||
1119 | mb_set_bit(cur, bm); | ||
1053 | cur++; | 1120 | cur++; |
1054 | } | 1121 | } |
1055 | } | 1122 | } |
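Passing a NULL spinlock lets callers that already serialize access another way (for example under ext4_lock_group(), or while owning the page exclusively in ext4_mb_init_cache()) skip the atomic operations. The two underlying primitives differ only in the bit-op flavour; a sketch matching this era's mballoc helpers:

    static inline void mb_set_bit(int bit, void *addr)
    {
            addr = mb_correct_addr_and_bit(&bit, addr);
            ext4_set_bit(bit, addr);                /* plain, non-atomic */
    }

    static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
    {
            addr = mb_correct_addr_and_bit(&bit, addr);
            ext4_set_bit_atomic(lock, bit, addr);   /* serialized by *lock */
    }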
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1094 | blocknr += block; | 1161 | blocknr += block; |
1095 | blocknr += | 1162 | blocknr += |
1096 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 1163 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
1097 | ext4_unlock_group(sb, e4b->bd_group); | 1164 | ext4_grp_locked_error(sb, e4b->bd_group, |
1098 | ext4_error(sb, __func__, "double-free of inode" | 1165 | __func__, "double-free of inode" |
1099 | " %lu's block %llu(bit %u in group %lu)\n", | 1166 | " %lu's block %llu(bit %u in group %u)", |
1100 | inode ? inode->i_ino : 0, blocknr, block, | 1167 | inode ? inode->i_ino : 0, blocknr, block, |
1101 | e4b->bd_group); | 1168 | e4b->bd_group); |
1102 | ext4_lock_group(sb, e4b->bd_group); | ||
1103 | } | 1169 | } |
1104 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); | 1170 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); |
1105 | e4b->bd_info->bb_counters[order]++; | 1171 | e4b->bd_info->bb_counters[order]++; |
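ext4_error() may sleep, remount, or even panic, so it cannot run with the group spinlock held; the old code above had to drop and retake the lock around it. ext4_grp_locked_error() is the printf-style variant that is safe to call under ext4_lock_group(), which is why these hunks can delete the unlock/relock dance. Its prototype, as these call sites use it:

    void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
                               const char *function, const char *fmt, ...)
            __attribute__ ((format (printf, 4, 5)));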
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, | |||
1296 | ac->ac_tail = ret & 0xffff; | 1362 | ac->ac_tail = ret & 0xffff; |
1297 | ac->ac_buddy = ret >> 16; | 1363 | ac->ac_buddy = ret >> 16; |
1298 | 1364 | ||
1299 | /* XXXXXXX: SUCH A HORRIBLE **CK */ | 1365 | /* |
1300 | /*FIXME!! Why ? */ | 1366 | * take the page reference. We want the page to be pinned |
1367 | * so that we don't get an ext4_mb_init_cache call for this | ||
1368 | * group until we update the bitmap. That would mean we | ||
1369 | * double allocate blocks. The reference is dropped | ||
1370 | * in ext4_mb_release_context | ||
1371 | */ | ||
1301 | ac->ac_bitmap_page = e4b->bd_bitmap_page; | 1372 | ac->ac_bitmap_page = e4b->bd_bitmap_page; |
1302 | get_page(ac->ac_bitmap_page); | 1373 | get_page(ac->ac_bitmap_page); |
1303 | ac->ac_buddy_page = e4b->bd_buddy_page; | 1374 | ac->ac_buddy_page = e4b->bd_buddy_page; |
1304 | get_page(ac->ac_buddy_page); | 1375 | get_page(ac->ac_buddy_page); |
1305 | 1376 | /* on allocation we use ac to track the held semaphore */ | |
1377 | ac->alloc_semp = e4b->alloc_semp; | ||
1378 | e4b->alloc_semp = NULL; | ||
1306 | /* store last allocated for subsequent stream allocation */ | 1379 | /* store last allocated for subsequent stream allocation */ |
1307 | if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { | 1380 | if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { |
1308 | spin_lock(&sbi->s_md_lock); | 1381 | spin_lock(&sbi->s_md_lock); |
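[Editor's note] Handing e4b->alloc_semp to the allocation context and clearing the source pointer is a plain ownership transfer: from here on exactly one of the two cleanup paths (ext4_mb_release_desc or ext4_mb_release_context) sees a non-NULL pointer and calls up_read(). A small sketch of the idiom, with a hypothetical holder type:

    #include <linux/rwsem.h>

    struct sem_holder {
            struct rw_semaphore *sem;  /* non-NULL iff we own a read lock */
    };

    /* Move lock ownership from src to dst; src no longer releases it. */
    static void transfer_read_lock(struct sem_holder *dst,
                                   struct sem_holder *src)
    {
            dst->sem = src->sem;
            src->sem = NULL;
    }

    /* Safe on either holder; only the current owner drops the lock. */
    static void release_read_lock(struct sem_holder *h)
    {
            if (h->sem) {
                    up_read(h->sem);
                    h->sem = NULL;
            }
    }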
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, | |||
1326 | struct ext4_free_extent ex; | 1399 | struct ext4_free_extent ex; |
1327 | int max; | 1400 | int max; |
1328 | 1401 | ||
1402 | if (ac->ac_status == AC_STATUS_FOUND) | ||
1403 | return; | ||
1329 | /* | 1404 | /* |
1330 | * We don't want to scan for a whole year | 1405 | * We don't want to scan for a whole year |
1331 | */ | 1406 | */ |
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, | |||
1575 | * free blocks even though group info says we | 1650 | * free blocks even though group info says we |
1576 | * have free blocks | 1651 | * have free blocks |
1577 | */ | 1652 | */ |
1578 | ext4_error(sb, __func__, "%d free blocks as per " | 1653 | ext4_grp_locked_error(sb, e4b->bd_group, |
1579 | "group info. But bitmap says 0\n", | 1654 | __func__, "%d free blocks as per " |
1655 | "group info. But bitmap says 0", | ||
1580 | free); | 1656 | free); |
1581 | break; | 1657 | break; |
1582 | } | 1658 | } |
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, | |||
1584 | mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); | 1660 | mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); |
1585 | BUG_ON(ex.fe_len <= 0); | 1661 | BUG_ON(ex.fe_len <= 0); |
1586 | if (free < ex.fe_len) { | 1662 | if (free < ex.fe_len) { |
1587 | ext4_error(sb, __func__, "%d free blocks as per " | 1663 | ext4_grp_locked_error(sb, e4b->bd_group, |
1588 | "group info. But got %d blocks\n", | 1664 | __func__, "%d free blocks as per " |
1665 | "group info. But got %d blocks", | ||
1589 | free, ex.fe_len); | 1666 | free, ex.fe_len); |
1590 | /* | 1667 | /* |
1591 | * The number of free blocks differs. This mostly | 1668 | * The number of free blocks differs. This mostly |
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, | |||
1692 | return 0; | 1769 | return 0; |
1693 | } | 1770 | } |
1694 | 1771 | ||
1772 | /* | ||
1773 | * lock the group_info alloc_sem of all the groups | ||
1774 | * belonging to the same buddy cache page. This | ||
1775 | * make sure other parallel operation on the buddy | ||
1776 | * cache doesn't happen whild holding the buddy cache | ||
1777 | * lock | ||
1778 | */ | ||
1779 | int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) | ||
1780 | { | ||
1781 | int i; | ||
1782 | int block, pnum; | ||
1783 | int blocks_per_page; | ||
1784 | int groups_per_page; | ||
1785 | ext4_group_t first_group; | ||
1786 | struct ext4_group_info *grp; | ||
1787 | |||
1788 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1789 | /* | ||
1790 | * the buddy cache inode stores the block bitmap | ||
1791 | * and buddy information in consecutive blocks. | ||
1792 | * So for each group we need two blocks. | ||
1793 | */ | ||
1794 | block = group * 2; | ||
1795 | pnum = block / blocks_per_page; | ||
1796 | first_group = pnum * blocks_per_page / 2; | ||
1797 | |||
1798 | groups_per_page = blocks_per_page >> 1; | ||
1799 | if (groups_per_page == 0) | ||
1800 | groups_per_page = 1; | ||
1801 | /* read all groups the page covers into the cache */ | ||
1802 | for (i = 0; i < groups_per_page; i++) { | ||
1803 | |||
1804 | if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) | ||
1805 | break; | ||
1806 | grp = ext4_get_group_info(sb, first_group + i); | ||
1807 | /* take each group's write allocation | ||
1808 | * semaphore. This makes sure there is | ||
1809 | * no block allocation going on in any | ||
1810 | * of those groups | ||
1811 | */ | ||
1812 | down_write_nested(&grp->alloc_sem, i); | ||
1813 | } | ||
1814 | return i; | ||
1815 | } | ||
1816 | |||
1817 | void ext4_mb_put_buddy_cache_lock(struct super_block *sb, | ||
1818 | ext4_group_t group, int locked_group) | ||
1819 | { | ||
1820 | int i; | ||
1821 | int block, pnum; | ||
1822 | int blocks_per_page; | ||
1823 | ext4_group_t first_group; | ||
1824 | struct ext4_group_info *grp; | ||
1825 | |||
1826 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1827 | /* | ||
1828 | * the buddy cache inode stores the block bitmap | ||
1829 | * and buddy information in consecutive blocks. | ||
1830 | * So for each group we need two blocks. | ||
1831 | */ | ||
1832 | block = group * 2; | ||
1833 | pnum = block / blocks_per_page; | ||
1834 | first_group = pnum * blocks_per_page / 2; | ||
1835 | /* release locks on all the groups */ | ||
1836 | for (i = 0; i < locked_group; i++) { | ||
1837 | |||
1838 | grp = ext4_get_group_info(sb, first_group + i); | ||
1839 | /* release each group's write allocation | ||
1840 | * semaphore. This allows block | ||
1841 | * allocation to resume in any | ||
1842 | * of those groups | ||
1843 | */ | ||
1844 | up_write(&grp->alloc_sem); | ||
1845 | } | ||
1846 | |||
1847 | } | ||
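[Editor's note] Both helpers above recompute the same mapping: each group owns two consecutive blocks (bitmap, then buddy) in the buddy-cache inode, so a page holds blocks_per_page / 2 groups and first_group is the first group whose bitmap block lands on that page. A standalone arithmetic check of that mapping; the 4K page and 1K block sizes are illustrative values only:

    #include <stdio.h>

    int main(void)
    {
            int page_size = 4096, blocksize = 1024;
            int blocks_per_page = page_size / blocksize;  /* 4 blocks/page */
            int group;

            for (group = 0; group < 6; group++) {
                    int block = group * 2;                /* bitmap block  */
                    int pnum = block / blocks_per_page;   /* page index    */
                    int first_group = pnum * blocks_per_page / 2;

                    printf("group %d: bitmap block %d on page %d, "
                           "first group on that page %d\n",
                           group, block, pnum, first_group);
            }
            return 0;  /* groups 0-1 share page 0, 2-3 page 1, 4-5 page 2 */
    }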
1848 | |||
1849 | static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | ||
1850 | { | ||
1851 | |||
1852 | int ret; | ||
1853 | void *bitmap; | ||
1854 | int blocks_per_page; | ||
1855 | int block, pnum, poff; | ||
1856 | int num_grp_locked = 0; | ||
1857 | struct ext4_group_info *this_grp; | ||
1858 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1859 | struct inode *inode = sbi->s_buddy_cache; | ||
1860 | struct page *page = NULL, *bitmap_page = NULL; | ||
1861 | |||
1862 | mb_debug("init group %lu\n", group); | ||
1863 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1864 | this_grp = ext4_get_group_info(sb, group); | ||
1865 | /* | ||
1866 | * This ensures we don't add a group | ||
1867 | * to this buddy cache via resize | ||
1868 | */ | ||
1869 | num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); | ||
1870 | if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { | ||
1871 | /* | ||
1872 | * somebody already initialized the group; | ||
1873 | * return without doing anything | ||
1874 | */ | ||
1875 | ret = 0; | ||
1876 | goto err; | ||
1877 | } | ||
1878 | /* | ||
1879 | * the buddy cache inode stores the block bitmap | ||
1880 | * and buddy information in consecutive blocks. | ||
1881 | * So for each group we need two blocks. | ||
1882 | */ | ||
1883 | block = group * 2; | ||
1884 | pnum = block / blocks_per_page; | ||
1885 | poff = block % blocks_per_page; | ||
1886 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1887 | if (page) { | ||
1888 | BUG_ON(page->mapping != inode->i_mapping); | ||
1889 | ret = ext4_mb_init_cache(page, NULL); | ||
1890 | if (ret) { | ||
1891 | unlock_page(page); | ||
1892 | goto err; | ||
1893 | } | ||
1894 | unlock_page(page); | ||
1895 | } | ||
1896 | if (page == NULL || !PageUptodate(page)) { | ||
1897 | ret = -EIO; | ||
1898 | goto err; | ||
1899 | } | ||
1900 | mark_page_accessed(page); | ||
1901 | bitmap_page = page; | ||
1902 | bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
1903 | |||
1904 | /* init buddy cache */ | ||
1905 | block++; | ||
1906 | pnum = block / blocks_per_page; | ||
1907 | poff = block % blocks_per_page; | ||
1908 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1909 | if (page == bitmap_page) { | ||
1910 | /* | ||
1911 | * If both the bitmap and buddy are in | ||
1912 | * the same page we don't need to force | ||
1913 | * init the buddy | ||
1914 | */ | ||
1915 | unlock_page(page); | ||
1916 | } else if (page) { | ||
1917 | BUG_ON(page->mapping != inode->i_mapping); | ||
1918 | ret = ext4_mb_init_cache(page, bitmap); | ||
1919 | if (ret) { | ||
1920 | unlock_page(page); | ||
1921 | goto err; | ||
1922 | } | ||
1923 | unlock_page(page); | ||
1924 | } | ||
1925 | if (page == NULL || !PageUptodate(page)) { | ||
1926 | ret = -EIO; | ||
1927 | goto err; | ||
1928 | } | ||
1929 | mark_page_accessed(page); | ||
1930 | err: | ||
1931 | ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); | ||
1932 | if (bitmap_page) | ||
1933 | page_cache_release(bitmap_page); | ||
1934 | if (page) | ||
1935 | page_cache_release(page); | ||
1936 | return ret; | ||
1937 | } | ||
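[Editor's note] ext4_mb_init_group() is a check-lock-recheck construction: EXT4_MB_GRP_NEED_INIT is tested again once the write semaphores are held, so a racing initializer costs only a redundant lock round-trip rather than a double initialization. A userspace sketch of the pattern, with a pthread rwlock standing in for the group's alloc_sem (hypothetical names, not the kernel code):

    #include <pthread.h>

    struct group_state {
            pthread_rwlock_t alloc_sem;
            int initialized;           /* plays the role of !NEED_INIT */
    };

    static int expensive_init(struct group_state *g)
    {
            /* stands in for reading the bitmap and building the buddy */
            g->initialized = 1;
            return 0;
    }

    int init_group_once(struct group_state *g)
    {
            int ret = 0;

            pthread_rwlock_wrlock(&g->alloc_sem); /* exclude allocators too */
            if (!g->initialized)                  /* recheck under the lock */
                    ret = expensive_init(g);
            pthread_rwlock_unlock(&g->alloc_sem);
            return ret;
    }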
1938 | |||
1695 | static noinline_for_stack int | 1939 | static noinline_for_stack int |
1696 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | 1940 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) |
1697 | { | 1941 | { |
@@ -1775,7 +2019,7 @@ repeat: | |||
1775 | group = 0; | 2019 | group = 0; |
1776 | 2020 | ||
1777 | /* quick check to skip empty groups */ | 2021 | /* quick check to skip empty groups */ |
1778 | grp = ext4_get_group_info(ac->ac_sb, group); | 2022 | grp = ext4_get_group_info(sb, group); |
1779 | if (grp->bb_free == 0) | 2023 | if (grp->bb_free == 0) |
1780 | continue; | 2024 | continue; |
1781 | 2025 | ||
@@ -1788,10 +2032,9 @@ repeat: | |||
1788 | * we need full data about the group | 2032 | * we need full data about the group |
1789 | * to make a good selection | 2033 | * to make a good selection |
1790 | */ | 2034 | */ |
1791 | err = ext4_mb_load_buddy(sb, group, &e4b); | 2035 | err = ext4_mb_init_group(sb, group); |
1792 | if (err) | 2036 | if (err) |
1793 | goto out; | 2037 | goto out; |
1794 | ext4_mb_release_desc(&e4b); | ||
1795 | } | 2038 | } |
1796 | 2039 | ||
1797 | /* | 2040 | /* |
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) | |||
1932 | if (hs->op == EXT4_MB_HISTORY_ALLOC) { | 2175 | if (hs->op == EXT4_MB_HISTORY_ALLOC) { |
1933 | fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " | 2176 | fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " |
1934 | "%-5u %-5s %-5u %-6u\n"; | 2177 | "%-5u %-5s %-5u %-6u\n"; |
1935 | sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, | 2178 | sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, |
1936 | hs->result.fe_start, hs->result.fe_len, | 2179 | hs->result.fe_start, hs->result.fe_len, |
1937 | hs->result.fe_logical); | 2180 | hs->result.fe_logical); |
1938 | sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, | 2181 | sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, |
1939 | hs->orig.fe_start, hs->orig.fe_len, | 2182 | hs->orig.fe_start, hs->orig.fe_len, |
1940 | hs->orig.fe_logical); | 2183 | hs->orig.fe_logical); |
1941 | sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, | 2184 | sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, |
1942 | hs->goal.fe_start, hs->goal.fe_len, | 2185 | hs->goal.fe_start, hs->goal.fe_len, |
1943 | hs->goal.fe_logical); | 2186 | hs->goal.fe_logical); |
1944 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, | 2187 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, |
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) | |||
1947 | hs->buddy ? 1 << hs->buddy : 0); | 2190 | hs->buddy ? 1 << hs->buddy : 0); |
1948 | } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { | 2191 | } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { |
1949 | fmt = "%-5u %-8u %-23s %-23s %-23s\n"; | 2192 | fmt = "%-5u %-8u %-23s %-23s %-23s\n"; |
1950 | sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, | 2193 | sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, |
1951 | hs->result.fe_start, hs->result.fe_len, | 2194 | hs->result.fe_start, hs->result.fe_len, |
1952 | hs->result.fe_logical); | 2195 | hs->result.fe_logical); |
1953 | sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, | 2196 | sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, |
1954 | hs->orig.fe_start, hs->orig.fe_len, | 2197 | hs->orig.fe_start, hs->orig.fe_len, |
1955 | hs->orig.fe_logical); | 2198 | hs->orig.fe_logical); |
1956 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); | 2199 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); |
1957 | } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { | 2200 | } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { |
1958 | sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, | 2201 | sprintf(buf2, "%u/%d/%u", hs->result.fe_group, |
1959 | hs->result.fe_start, hs->result.fe_len); | 2202 | hs->result.fe_start, hs->result.fe_len); |
1960 | seq_printf(seq, "%-5u %-8u %-23s discard\n", | 2203 | seq_printf(seq, "%-5u %-8u %-23s discard\n", |
1961 | hs->pid, hs->ino, buf2); | 2204 | hs->pid, hs->ino, buf2); |
1962 | } else if (hs->op == EXT4_MB_HISTORY_FREE) { | 2205 | } else if (hs->op == EXT4_MB_HISTORY_FREE) { |
1963 | sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, | 2206 | sprintf(buf2, "%u/%d/%u", hs->result.fe_group, |
1964 | hs->result.fe_start, hs->result.fe_len); | 2207 | hs->result.fe_start, hs->result.fe_len); |
1965 | seq_printf(seq, "%-5u %-8u %-23s free\n", | 2208 | seq_printf(seq, "%-5u %-8u %-23s free\n", |
1966 | hs->pid, hs->ino, buf2); | 2209 | hs->pid, hs->ino, buf2); |
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) | |||
2073 | return NULL; | 2316 | return NULL; |
2074 | 2317 | ||
2075 | group = *pos + 1; | 2318 | group = *pos + 1; |
2076 | return (void *) group; | 2319 | return (void *) ((unsigned long) group); |
2077 | } | 2320 | } |
2078 | 2321 | ||
2079 | static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) | 2322 | static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) |
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2086 | if (*pos < 0 || *pos >= sbi->s_groups_count) | 2329 | if (*pos < 0 || *pos >= sbi->s_groups_count) |
2087 | return NULL; | 2330 | return NULL; |
2088 | group = *pos + 1; | 2331 | group = *pos + 1; |
2089 | return (void *) group;; | 2332 | return (void *) ((unsigned long) group); |
2090 | } | 2333 | } |
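[Editor's note] The double cast here is the portable way to stash a 32-bit group number in a seq_file iterator cookie: converting through unsigned long keeps the int-to-pointer conversion well defined (and warning-free) on 64-bit builds, and the +1 bias in these iterators keeps group 0 from looking like a NULL cookie. A quick standalone demonstration:

    #include <stdio.h>

    int main(void)
    {
            unsigned int group = 1234;

            /* pack: widen to unsigned long first, then to a pointer */
            void *cookie = (void *)((unsigned long)(group + 1));

            /* unpack: reverse the two steps and undo the +1 bias */
            unsigned int back = (unsigned int)((unsigned long)cookie - 1);

            printf("%u\n", back);   /* prints 1234 */
            return 0;
    }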
2091 | 2334 | ||
2092 | static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) | 2335 | static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) |
2093 | { | 2336 | { |
2094 | struct super_block *sb = seq->private; | 2337 | struct super_block *sb = seq->private; |
2095 | long group = (long) v; | 2338 | ext4_group_t group = (ext4_group_t) ((unsigned long) v); |
2096 | int i; | 2339 | int i; |
2097 | int err; | 2340 | int err; |
2098 | struct ext4_buddy e4b; | 2341 | struct ext4_buddy e4b; |
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) | |||
2114 | sizeof(struct ext4_group_info); | 2357 | sizeof(struct ext4_group_info); |
2115 | err = ext4_mb_load_buddy(sb, group, &e4b); | 2358 | err = ext4_mb_load_buddy(sb, group, &e4b); |
2116 | if (err) { | 2359 | if (err) { |
2117 | seq_printf(seq, "#%-5lu: I/O error\n", group); | 2360 | seq_printf(seq, "#%-5u: I/O error\n", group); |
2118 | return 0; | 2361 | return 0; |
2119 | } | 2362 | } |
2120 | ext4_lock_group(sb, group); | 2363 | ext4_lock_group(sb, group); |
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) | |||
2122 | ext4_unlock_group(sb, group); | 2365 | ext4_unlock_group(sb, group); |
2123 | ext4_mb_release_desc(&e4b); | 2366 | ext4_mb_release_desc(&e4b); |
2124 | 2367 | ||
2125 | seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, | 2368 | seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, |
2126 | sg.info.bb_fragments, sg.info.bb_first_free); | 2369 | sg.info.bb_fragments, sg.info.bb_first_free); |
2127 | for (i = 0; i <= 13; i++) | 2370 | for (i = 0; i <= 13; i++) |
2128 | seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? | 2371 | seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? |
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2296 | ext4_free_blocks_after_init(sb, group, desc); | 2539 | ext4_free_blocks_after_init(sb, group, desc); |
2297 | } else { | 2540 | } else { |
2298 | meta_group_info[i]->bb_free = | 2541 | meta_group_info[i]->bb_free = |
2299 | le16_to_cpu(desc->bg_free_blocks_count); | 2542 | ext4_free_blks_count(sb, desc); |
2300 | } | 2543 | } |
2301 | 2544 | ||
2302 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | 2545 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); |
2546 | init_rwsem(&meta_group_info[i]->alloc_sem); | ||
2303 | meta_group_info[i]->bb_free_root.rb_node = NULL;; | 2547 | meta_group_info[i]->bb_free_root.rb_node = NULL;; |
2304 | 2548 | ||
2305 | #ifdef DOUBLE_CHECK | 2549 | #ifdef DOUBLE_CHECK |
@@ -2327,54 +2571,6 @@ exit_meta_group_info: | |||
2327 | } /* ext4_mb_add_groupinfo */ | 2571 | } /* ext4_mb_add_groupinfo */ |
2328 | 2572 | ||
2329 | /* | 2573 | /* |
2330 | * Add a group to the existing groups. | ||
2331 | * This function is used for online resize | ||
2332 | */ | ||
2333 | int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, | ||
2334 | struct ext4_group_desc *desc) | ||
2335 | { | ||
2336 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2337 | struct inode *inode = sbi->s_buddy_cache; | ||
2338 | int blocks_per_page; | ||
2339 | int block; | ||
2340 | int pnum; | ||
2341 | struct page *page; | ||
2342 | int err; | ||
2343 | |||
2344 | /* Add group based on group descriptor*/ | ||
2345 | err = ext4_mb_add_groupinfo(sb, group, desc); | ||
2346 | if (err) | ||
2347 | return err; | ||
2348 | |||
2349 | /* | ||
2350 | * Cache pages containing dynamic mb_alloc datas (buddy and bitmap | ||
2351 | * datas) are set not up to date so that they will be re-initilaized | ||
2352 | * during the next call to ext4_mb_load_buddy | ||
2353 | */ | ||
2354 | |||
2355 | /* Set buddy page as not up to date */ | ||
2356 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
2357 | block = group * 2; | ||
2358 | pnum = block / blocks_per_page; | ||
2359 | page = find_get_page(inode->i_mapping, pnum); | ||
2360 | if (page != NULL) { | ||
2361 | ClearPageUptodate(page); | ||
2362 | page_cache_release(page); | ||
2363 | } | ||
2364 | |||
2365 | /* Set bitmap page as not up to date */ | ||
2366 | block++; | ||
2367 | pnum = block / blocks_per_page; | ||
2368 | page = find_get_page(inode->i_mapping, pnum); | ||
2369 | if (page != NULL) { | ||
2370 | ClearPageUptodate(page); | ||
2371 | page_cache_release(page); | ||
2372 | } | ||
2373 | |||
2374 | return 0; | ||
2375 | } | ||
2376 | |||
2377 | /* | ||
2378 | * Update an existing group. | 2574 | * Update an existing group. |
2379 | * This function is used for online resize | 2575 | * This function is used for online resize |
2380 | */ | 2576 | */ |
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
2457 | desc = ext4_get_group_desc(sb, i, NULL); | 2653 | desc = ext4_get_group_desc(sb, i, NULL); |
2458 | if (desc == NULL) { | 2654 | if (desc == NULL) { |
2459 | printk(KERN_ERR | 2655 | printk(KERN_ERR |
2460 | "EXT4-fs: can't read descriptor %lu\n", i); | 2656 | "EXT4-fs: can't read descriptor %u\n", i); |
2461 | goto err_freebuddy; | 2657 | goto err_freebuddy; |
2462 | } | 2658 | } |
2463 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) | 2659 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) |
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2493 | if (sbi->s_mb_offsets == NULL) { | 2689 | if (sbi->s_mb_offsets == NULL) { |
2494 | return -ENOMEM; | 2690 | return -ENOMEM; |
2495 | } | 2691 | } |
2692 | |||
2693 | i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); | ||
2496 | sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); | 2694 | sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); |
2497 | if (sbi->s_mb_maxs == NULL) { | 2695 | if (sbi->s_mb_maxs == NULL) { |
2498 | kfree(sbi->s_mb_maxs); | 2696 | kfree(sbi->s_mb_maxs); |
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2551 | ext4_mb_init_per_dev_proc(sb); | 2749 | ext4_mb_init_per_dev_proc(sb); |
2552 | ext4_mb_history_init(sb); | 2750 | ext4_mb_history_init(sb); |
2553 | 2751 | ||
2554 | sbi->s_journal->j_commit_callback = release_blocks_on_commit; | 2752 | if (sbi->s_journal) |
2753 | sbi->s_journal->j_commit_callback = release_blocks_on_commit; | ||
2555 | 2754 | ||
2556 | printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); | 2755 | printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); |
2557 | return 0; | 2756 | return 0; |
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | |||
2652 | list_for_each_safe(l, ltmp, &txn->t_private_list) { | 2851 | list_for_each_safe(l, ltmp, &txn->t_private_list) { |
2653 | entry = list_entry(l, struct ext4_free_data, list); | 2852 | entry = list_entry(l, struct ext4_free_data, list); |
2654 | 2853 | ||
2655 | mb_debug("gonna free %u blocks in group %lu (0x%p):", | 2854 | mb_debug("gonna free %u blocks in group %u (0x%p):", |
2656 | entry->count, entry->group, entry); | 2855 | entry->count, entry->group, entry); |
2657 | 2856 | ||
2658 | err = ext4_mb_load_buddy(sb, entry->group, &e4b); | 2857 | err = ext4_mb_load_buddy(sb, entry->group, &e4b); |
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | |||
2679 | discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) | 2878 | discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) |
2680 | + entry->start_blk | 2879 | + entry->start_blk |
2681 | + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 2880 | + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
2682 | trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, | 2881 | trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", |
2683 | (unsigned long long) discard_block, entry->count); | 2882 | sb->s_id, (unsigned long long) discard_block, |
2883 | entry->count); | ||
2684 | sb_issue_discard(sb, discard_block, entry->count); | 2884 | sb_issue_discard(sb, discard_block, entry->count); |
2685 | 2885 | ||
2686 | kmem_cache_free(ext4_free_ext_cachep, entry); | 2886 | kmem_cache_free(ext4_free_ext_cachep, entry); |
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void) | |||
2791 | */ | 2991 | */ |
2792 | static noinline_for_stack int | 2992 | static noinline_for_stack int |
2793 | ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | 2993 | ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, |
2794 | handle_t *handle, unsigned long reserv_blks) | 2994 | handle_t *handle, unsigned int reserv_blks) |
2795 | { | 2995 | { |
2796 | struct buffer_head *bitmap_bh = NULL; | 2996 | struct buffer_head *bitmap_bh = NULL; |
2797 | struct ext4_super_block *es; | 2997 | struct ext4_super_block *es; |
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2824 | if (!gdp) | 3024 | if (!gdp) |
2825 | goto out_err; | 3025 | goto out_err; |
2826 | 3026 | ||
2827 | ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, | 3027 | ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, |
2828 | gdp->bg_free_blocks_count); | 3028 | gdp->bg_free_blocks_count); |
2829 | 3029 | ||
2830 | err = ext4_journal_get_write_access(handle, gdp_bh); | 3030 | err = ext4_journal_get_write_access(handle, gdp_bh); |
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2843 | in_range(block + len - 1, ext4_inode_table(sb, gdp), | 3043 | in_range(block + len - 1, ext4_inode_table(sb, gdp), |
2844 | EXT4_SB(sb)->s_itb_per_group)) { | 3044 | EXT4_SB(sb)->s_itb_per_group)) { |
2845 | ext4_error(sb, __func__, | 3045 | ext4_error(sb, __func__, |
2846 | "Allocating block in system zone - block = %llu", | 3046 | "Allocating block %llu in system zone of %d group\n", |
2847 | block); | 3047 | block, ac->ac_b_ex.fe_group); |
2848 | /* File system mounted not to panic on error | 3048 | /* File system mounted not to panic on error |
2849 | * Fix the bitmap and repeat the block allocation | 3049 | * Fix the bitmap and repeat the block allocation |
2850 | * We leak some of the blocks here. | 3050 | * We leak some of the blocks here. |
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2852 | mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), | 3052 | mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), |
2853 | bitmap_bh->b_data, ac->ac_b_ex.fe_start, | 3053 | bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
2854 | ac->ac_b_ex.fe_len); | 3054 | ac->ac_b_ex.fe_len); |
2855 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | 3055 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
2856 | if (!err) | 3056 | if (!err) |
2857 | err = -EAGAIN; | 3057 | err = -EAGAIN; |
2858 | goto out_err; | 3058 | goto out_err; |
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2866 | } | 3066 | } |
2867 | } | 3067 | } |
2868 | #endif | 3068 | #endif |
2869 | mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, | ||
2870 | ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); | ||
2871 | |||
2872 | spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | 3069 | spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); |
3070 | mb_set_bits(NULL, bitmap_bh->b_data, | ||
3071 | ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); | ||
2873 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 3072 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
2874 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | 3073 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
2875 | gdp->bg_free_blocks_count = | 3074 | ext4_free_blks_set(sb, gdp, |
2876 | cpu_to_le16(ext4_free_blocks_after_init(sb, | 3075 | ext4_free_blocks_after_init(sb, |
2877 | ac->ac_b_ex.fe_group, | 3076 | ac->ac_b_ex.fe_group, gdp)); |
2878 | gdp)); | ||
2879 | } | 3077 | } |
2880 | le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); | 3078 | len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; |
3079 | ext4_free_blks_set(sb, gdp, len); | ||
2881 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); | 3080 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); |
2882 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | 3081 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); |
2883 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); | 3082 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); |
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2899 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | 3098 | spin_unlock(sb_bgl_lock(sbi, flex_group)); |
2900 | } | 3099 | } |
2901 | 3100 | ||
2902 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | 3101 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
2903 | if (err) | 3102 | if (err) |
2904 | goto out_err; | 3103 | goto out_err; |
2905 | err = ext4_journal_dirty_metadata(handle, gdp_bh); | 3104 | err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); |
2906 | 3105 | ||
2907 | out_err: | 3106 | out_err: |
2908 | sb->s_dirt = 1; | 3107 | sb->s_dirt = 1; |
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
3031 | /* check we don't cross already preallocated blocks */ | 3230 | /* check we don't cross already preallocated blocks */ |
3032 | rcu_read_lock(); | 3231 | rcu_read_lock(); |
3033 | list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { | 3232 | list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { |
3034 | unsigned long pa_end; | 3233 | ext4_lblk_t pa_end; |
3035 | 3234 | ||
3036 | if (pa->pa_deleted) | 3235 | if (pa->pa_deleted) |
3037 | continue; | 3236 | continue; |
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
3075 | /* XXX: extra loop to check we really don't overlap preallocations */ | 3274 | /* XXX: extra loop to check we really don't overlap preallocations */ |
3076 | rcu_read_lock(); | 3275 | rcu_read_lock(); |
3077 | list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { | 3276 | list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { |
3078 | unsigned long pa_end; | 3277 | ext4_lblk_t pa_end; |
3079 | spin_lock(&pa->pa_lock); | 3278 | spin_lock(&pa->pa_lock); |
3080 | if (pa->pa_deleted == 0) { | 3279 | if (pa->pa_deleted == 0) { |
3081 | pa_end = pa->pa_lstart + pa->pa_len; | 3280 | pa_end = pa->pa_lstart + pa->pa_len; |
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | |||
3307 | } | 3506 | } |
3308 | 3507 | ||
3309 | /* | 3508 | /* |
3509 | * the function goes through all blocks freed in the group | ||
3510 | * but not yet committed and marks them used in the in-core bitmap. | ||
3511 | * the buddy must be generated from this bitmap. | ||
3512 | * Needs to be called with the ext4 group lock held (ext4_lock_group) | ||
3513 | */ | ||
3514 | static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | ||
3515 | ext4_group_t group) | ||
3516 | { | ||
3517 | struct rb_node *n; | ||
3518 | struct ext4_group_info *grp; | ||
3519 | struct ext4_free_data *entry; | ||
3520 | |||
3521 | grp = ext4_get_group_info(sb, group); | ||
3522 | n = rb_first(&(grp->bb_free_root)); | ||
3523 | |||
3524 | while (n) { | ||
3525 | entry = rb_entry(n, struct ext4_free_data, node); | ||
3526 | mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), | ||
3527 | bitmap, entry->start_blk, | ||
3528 | entry->count); | ||
3529 | n = rb_next(n); | ||
3530 | } | ||
3531 | return; | ||
3532 | } | ||
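[Editor's note] The loop above is the stock in-order traversal of a kernel rbtree: rb_first() yields the leftmost node, rb_entry() recovers the containing structure, and rb_next() advances in key order. A minimal sketch of the idiom with an illustrative item type (kernel rbtree API, so not runnable in userspace as-is):

    #include <linux/rbtree.h>

    struct freed_range {
            struct rb_node node;
            int start_blk;
            int count;
    };

    /* Visit every freed range in ascending block order, the same shape
     * as the ext4_mb_generate_from_freelist() walk above. */
    static void walk_freed(struct rb_root *root,
                           void (*mark)(int start, int count))
    {
            struct rb_node *n;

            for (n = rb_first(root); n; n = rb_next(n)) {
                    struct freed_range *r =
                            rb_entry(n, struct freed_range, node);
                    mark(r->start_blk, r->count);
            }
    }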
3533 | |||
3534 | /* | ||
3310 | * the function goes through all preallocations in this group and marks them | 3535 | * the function goes through all preallocations in this group and marks them |
3311 | * used in the in-core bitmap. The buddy must be generated from this bitmap | 3536 | * used in the in-core bitmap. The buddy must be generated from this bitmap |
3312 | * Needs to be called with the ext4 group lock (ext4_lock_group) | 3537 | * Needs to be called with the ext4 group lock (ext4_lock_group) |
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |||
3346 | preallocated += len; | 3571 | preallocated += len; |
3347 | count++; | 3572 | count++; |
3348 | } | 3573 | } |
3349 | mb_debug("prellocated %u for group %lu\n", preallocated, group); | 3574 | mb_debug("prellocated %u for group %u\n", preallocated, group); |
3350 | } | 3575 | } |
3351 | 3576 | ||
3352 | static void ext4_mb_pa_callback(struct rcu_head *head) | 3577 | static void ext4_mb_pa_callback(struct rcu_head *head) |
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head) | |||
3363 | static void ext4_mb_put_pa(struct ext4_allocation_context *ac, | 3588 | static void ext4_mb_put_pa(struct ext4_allocation_context *ac, |
3364 | struct super_block *sb, struct ext4_prealloc_space *pa) | 3589 | struct super_block *sb, struct ext4_prealloc_space *pa) |
3365 | { | 3590 | { |
3366 | unsigned long grp; | 3591 | ext4_group_t grp; |
3367 | 3592 | ||
3368 | if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) | 3593 | if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) |
3369 | return; | 3594 | return; |
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) | |||
3473 | 3698 | ||
3474 | mb_debug("new inode pa %p: %llu/%u for %u\n", pa, | 3699 | mb_debug("new inode pa %p: %llu/%u for %u\n", pa, |
3475 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | 3700 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); |
3701 | trace_mark(ext4_mb_new_inode_pa, | ||
3702 | "dev %s ino %lu pstart %llu len %u lstart %u", | ||
3703 | sb->s_id, ac->ac_inode->i_ino, | ||
3704 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | ||
3476 | 3705 | ||
3477 | ext4_mb_use_inode_pa(ac, pa); | 3706 | ext4_mb_use_inode_pa(ac, pa); |
3478 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); | 3707 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); |
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | |||
3530 | pa->pa_linear = 1; | 3759 | pa->pa_linear = 1; |
3531 | 3760 | ||
3532 | mb_debug("new group pa %p: %llu/%u for %u\n", pa, | 3761 | mb_debug("new group pa %p: %llu/%u for %u\n", pa, |
3533 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | 3762 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); |
3763 | trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", | ||
3764 | sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); | ||
3534 | 3765 | ||
3535 | ext4_mb_use_group_pa(ac, pa); | 3766 | ext4_mb_use_group_pa(ac, pa); |
3536 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); | 3767 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); |
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3579 | { | 3810 | { |
3580 | struct super_block *sb = e4b->bd_sb; | 3811 | struct super_block *sb = e4b->bd_sb; |
3581 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3812 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
3582 | unsigned long end; | 3813 | unsigned int end; |
3583 | unsigned long next; | 3814 | unsigned int next; |
3584 | ext4_group_t group; | 3815 | ext4_group_t group; |
3585 | ext4_grpblk_t bit; | 3816 | ext4_grpblk_t bit; |
3817 | unsigned long long grp_blk_start; | ||
3586 | sector_t start; | 3818 | sector_t start; |
3587 | int err = 0; | 3819 | int err = 0; |
3588 | int free = 0; | 3820 | int free = 0; |
3589 | 3821 | ||
3590 | BUG_ON(pa->pa_deleted == 0); | 3822 | BUG_ON(pa->pa_deleted == 0); |
3591 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); | 3823 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); |
3824 | grp_blk_start = pa->pa_pstart - bit; | ||
3592 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); | 3825 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); |
3593 | end = bit + pa->pa_len; | 3826 | end = bit + pa->pa_len; |
3594 | 3827 | ||
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3618 | ext4_mb_store_history(ac); | 3851 | ext4_mb_store_history(ac); |
3619 | } | 3852 | } |
3620 | 3853 | ||
3854 | trace_mark(ext4_mb_release_inode_pa, | ||
3855 | "dev %s ino %lu block %llu count %u", | ||
3856 | sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, | ||
3857 | next - bit); | ||
3621 | mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); | 3858 | mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); |
3622 | bit = next + 1; | 3859 | bit = next + 1; |
3623 | } | 3860 | } |
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3626 | pa, (unsigned long) pa->pa_lstart, | 3863 | pa, (unsigned long) pa->pa_lstart, |
3627 | (unsigned long) pa->pa_pstart, | 3864 | (unsigned long) pa->pa_pstart, |
3628 | (unsigned long) pa->pa_len); | 3865 | (unsigned long) pa->pa_len); |
3629 | ext4_error(sb, __func__, "free %u, pa_free %u\n", | 3866 | ext4_grp_locked_error(sb, group, |
3630 | free, pa->pa_free); | 3867 | __func__, "free %u, pa_free %u", |
3868 | free, pa->pa_free); | ||
3631 | /* | 3869 | /* |
3632 | * pa is already deleted so we use the value obtained | 3870 | * pa is already deleted so we use the value obtained |
3633 | * from the bitmap and continue. | 3871 | * from the bitmap and continue. |
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, | |||
3650 | if (ac) | 3888 | if (ac) |
3651 | ac->ac_op = EXT4_MB_HISTORY_DISCARD; | 3889 | ac->ac_op = EXT4_MB_HISTORY_DISCARD; |
3652 | 3890 | ||
3891 | trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", | ||
3892 | sb->s_id, pa->pa_pstart, pa->pa_len); | ||
3653 | BUG_ON(pa->pa_deleted == 0); | 3893 | BUG_ON(pa->pa_deleted == 0); |
3654 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); | 3894 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); |
3655 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); | 3895 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); |
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, | |||
3692 | int busy = 0; | 3932 | int busy = 0; |
3693 | int free = 0; | 3933 | int free = 0; |
3694 | 3934 | ||
3695 | mb_debug("discard preallocation for group %lu\n", group); | 3935 | mb_debug("discard preallocation for group %u\n", group); |
3696 | 3936 | ||
3697 | if (list_empty(&grp->bb_prealloc_list)) | 3937 | if (list_empty(&grp->bb_prealloc_list)) |
3698 | return 0; | 3938 | return 0; |
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, | |||
3700 | bitmap_bh = ext4_read_block_bitmap(sb, group); | 3940 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
3701 | if (bitmap_bh == NULL) { | 3941 | if (bitmap_bh == NULL) { |
3702 | ext4_error(sb, __func__, "Error in reading block " | 3942 | ext4_error(sb, __func__, "Error in reading block " |
3703 | "bitmap for %lu\n", group); | 3943 | "bitmap for %u", group); |
3704 | return 0; | 3944 | return 0; |
3705 | } | 3945 | } |
3706 | 3946 | ||
3707 | err = ext4_mb_load_buddy(sb, group, &e4b); | 3947 | err = ext4_mb_load_buddy(sb, group, &e4b); |
3708 | if (err) { | 3948 | if (err) { |
3709 | ext4_error(sb, __func__, "Error in loading buddy " | 3949 | ext4_error(sb, __func__, "Error in loading buddy " |
3710 | "information for %lu\n", group); | 3950 | "information for %u", group); |
3711 | put_bh(bitmap_bh); | 3951 | put_bh(bitmap_bh); |
3712 | return 0; | 3952 | return 0; |
3713 | } | 3953 | } |
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode) | |||
3815 | } | 4055 | } |
3816 | 4056 | ||
3817 | mb_debug("discard preallocation for inode %lu\n", inode->i_ino); | 4057 | mb_debug("discard preallocation for inode %lu\n", inode->i_ino); |
4058 | trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, | ||
4059 | inode->i_ino); | ||
3818 | 4060 | ||
3819 | INIT_LIST_HEAD(&list); | 4061 | INIT_LIST_HEAD(&list); |
3820 | 4062 | ||
@@ -3874,14 +4116,14 @@ repeat: | |||
3874 | err = ext4_mb_load_buddy(sb, group, &e4b); | 4116 | err = ext4_mb_load_buddy(sb, group, &e4b); |
3875 | if (err) { | 4117 | if (err) { |
3876 | ext4_error(sb, __func__, "Error in loading buddy " | 4118 | ext4_error(sb, __func__, "Error in loading buddy " |
3877 | "information for %lu\n", group); | 4119 | "information for %u", group); |
3878 | continue; | 4120 | continue; |
3879 | } | 4121 | } |
3880 | 4122 | ||
3881 | bitmap_bh = ext4_read_block_bitmap(sb, group); | 4123 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
3882 | if (bitmap_bh == NULL) { | 4124 | if (bitmap_bh == NULL) { |
3883 | ext4_error(sb, __func__, "Error in reading block " | 4125 | ext4_error(sb, __func__, "Error in reading block " |
3884 | "bitmap for %lu\n", group); | 4126 | "bitmap for %u", group); |
3885 | ext4_mb_release_desc(&e4b); | 4127 | ext4_mb_release_desc(&e4b); |
3886 | continue; | 4128 | continue; |
3887 | } | 4129 | } |
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, | |||
4024 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4266 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4025 | struct ext4_super_block *es = sbi->s_es; | 4267 | struct ext4_super_block *es = sbi->s_es; |
4026 | ext4_group_t group; | 4268 | ext4_group_t group; |
4027 | unsigned long len; | 4269 | unsigned int len; |
4028 | unsigned long goal; | 4270 | ext4_fsblk_t goal; |
4029 | ext4_grpblk_t block; | 4271 | ext4_grpblk_t block; |
4030 | 4272 | ||
4031 | /* we can't allocate > group size */ | 4273 | /* we can't allocate > group size */ |
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, | |||
4068 | ac->ac_pa = NULL; | 4310 | ac->ac_pa = NULL; |
4069 | ac->ac_bitmap_page = NULL; | 4311 | ac->ac_bitmap_page = NULL; |
4070 | ac->ac_buddy_page = NULL; | 4312 | ac->ac_buddy_page = NULL; |
4313 | ac->alloc_semp = NULL; | ||
4071 | ac->ac_lg = NULL; | 4314 | ac->ac_lg = NULL; |
4072 | 4315 | ||
4073 | /* we have to define context: we'll we work with a file or | 4316 | /* we have to define context: we'll we work with a file or |
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, | |||
4146 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | 4389 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); |
4147 | if (ext4_mb_load_buddy(sb, group, &e4b)) { | 4390 | if (ext4_mb_load_buddy(sb, group, &e4b)) { |
4148 | ext4_error(sb, __func__, "Error in loading buddy " | 4391 | ext4_error(sb, __func__, "Error in loading buddy " |
4149 | "information for %lu\n", group); | 4392 | "information for %u", group); |
4150 | continue; | 4393 | continue; |
4151 | } | 4394 | } |
4152 | ext4_lock_group(sb, group); | 4395 | ext4_lock_group(sb, group); |
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) | |||
4248 | } | 4491 | } |
4249 | ext4_mb_put_pa(ac, ac->ac_sb, pa); | 4492 | ext4_mb_put_pa(ac, ac->ac_sb, pa); |
4250 | } | 4493 | } |
4494 | if (ac->alloc_semp) | ||
4495 | up_read(ac->alloc_semp); | ||
4251 | if (ac->ac_bitmap_page) | 4496 | if (ac->ac_bitmap_page) |
4252 | page_cache_release(ac->ac_bitmap_page); | 4497 | page_cache_release(ac->ac_bitmap_page); |
4253 | if (ac->ac_buddy_page) | 4498 | if (ac->ac_buddy_page) |
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) | |||
4264 | int ret; | 4509 | int ret; |
4265 | int freed = 0; | 4510 | int freed = 0; |
4266 | 4511 | ||
4512 | trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", | ||
4513 | sb->s_id, needed); | ||
4267 | for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { | 4514 | for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { |
4268 | ret = ext4_mb_discard_group_preallocations(sb, i, needed); | 4515 | ret = ext4_mb_discard_group_preallocations(sb, i, needed); |
4269 | freed += ret; | 4516 | freed += ret; |
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4286 | struct ext4_sb_info *sbi; | 4533 | struct ext4_sb_info *sbi; |
4287 | struct super_block *sb; | 4534 | struct super_block *sb; |
4288 | ext4_fsblk_t block = 0; | 4535 | ext4_fsblk_t block = 0; |
4289 | unsigned long inquota; | 4536 | unsigned int inquota; |
4290 | unsigned long reserv_blks = 0; | 4537 | unsigned int reserv_blks = 0; |
4291 | 4538 | ||
4292 | sb = ar->inode->i_sb; | 4539 | sb = ar->inode->i_sb; |
4293 | sbi = EXT4_SB(sb); | 4540 | sbi = EXT4_SB(sb); |
4294 | 4541 | ||
4542 | trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " | ||
4543 | "lblk %llu goal %llu lleft %llu lright %llu " | ||
4544 | "pleft %llu pright %llu ", | ||
4545 | sb->s_id, ar->flags, ar->len, | ||
4546 | ar->inode ? ar->inode->i_ino : 0, | ||
4547 | (unsigned long long) ar->logical, | ||
4548 | (unsigned long long) ar->goal, | ||
4549 | (unsigned long long) ar->lleft, | ||
4550 | (unsigned long long) ar->lright, | ||
4551 | (unsigned long long) ar->pleft, | ||
4552 | (unsigned long long) ar->pright); | ||
4553 | |||
4295 | if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { | 4554 | if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { |
4296 | /* | 4555 | /* |
4297 | * With delalloc we already reserved the blocks | 4556 | * With delalloc we already reserved the blocks |
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4313 | } | 4572 | } |
4314 | if (ar->len == 0) { | 4573 | if (ar->len == 0) { |
4315 | *errp = -EDQUOT; | 4574 | *errp = -EDQUOT; |
4316 | return 0; | 4575 | goto out3; |
4317 | } | 4576 | } |
4318 | inquota = ar->len; | 4577 | inquota = ar->len; |
4319 | 4578 | ||
@@ -4348,10 +4607,14 @@ repeat: | |||
4348 | ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) | 4607 | ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) |
4349 | ext4_mb_new_preallocation(ac); | 4608 | ext4_mb_new_preallocation(ac); |
4350 | } | 4609 | } |
4351 | |||
4352 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { | 4610 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { |
4353 | *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); | 4611 | *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); |
4354 | if (*errp == -EAGAIN) { | 4612 | if (*errp == -EAGAIN) { |
4613 | /* | ||
4614 | * drop the reference that we took | ||
4615 | * in ext4_mb_use_best_found | ||
4616 | */ | ||
4617 | ext4_mb_release_context(ac); | ||
4355 | ac->ac_b_ex.fe_group = 0; | 4618 | ac->ac_b_ex.fe_group = 0; |
4356 | ac->ac_b_ex.fe_start = 0; | 4619 | ac->ac_b_ex.fe_start = 0; |
4357 | ac->ac_b_ex.fe_len = 0; | 4620 | ac->ac_b_ex.fe_len = 0; |
@@ -4382,6 +4645,26 @@ out2: | |||
4382 | out1: | 4645 | out1: |
4383 | if (ar->len < inquota) | 4646 | if (ar->len < inquota) |
4384 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); | 4647 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); |
4648 | out3: | ||
4649 | if (!ar->len) { | ||
4650 | if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) | ||
4651 | /* release all the reserved blocks if non delalloc */ | ||
4652 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | ||
4653 | reserv_blks); | ||
4654 | } | ||
4655 | |||
4656 | trace_mark(ext4_allocate_blocks, | ||
4657 | "dev %s block %llu flags %u len %u ino %lu " | ||
4658 | "logical %llu goal %llu lleft %llu lright %llu " | ||
4659 | "pleft %llu pright %llu ", | ||
4660 | sb->s_id, (unsigned long long) block, | ||
4661 | ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0, | ||
4662 | (unsigned long long) ar->logical, | ||
4663 | (unsigned long long) ar->goal, | ||
4664 | (unsigned long long) ar->lleft, | ||
4665 | (unsigned long long) ar->lright, | ||
4666 | (unsigned long long) ar->pleft, | ||
4667 | (unsigned long long) ar->pright); | ||
4385 | 4668 | ||
4386 | return block; | 4669 | return block; |
4387 | } | 4670 | } |
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1, | |||
4403 | 4686 | ||
4404 | static noinline_for_stack int | 4687 | static noinline_for_stack int |
4405 | ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | 4688 | ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, |
4406 | ext4_group_t group, ext4_grpblk_t block, int count) | 4689 | struct ext4_free_data *new_entry) |
4407 | { | 4690 | { |
4691 | ext4_grpblk_t block; | ||
4692 | struct ext4_free_data *entry; | ||
4408 | struct ext4_group_info *db = e4b->bd_info; | 4693 | struct ext4_group_info *db = e4b->bd_info; |
4409 | struct super_block *sb = e4b->bd_sb; | 4694 | struct super_block *sb = e4b->bd_sb; |
4410 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 4695 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
4411 | struct ext4_free_data *entry, *new_entry; | ||
4412 | struct rb_node **n = &db->bb_free_root.rb_node, *node; | 4696 | struct rb_node **n = &db->bb_free_root.rb_node, *node; |
4413 | struct rb_node *parent = NULL, *new_node; | 4697 | struct rb_node *parent = NULL, *new_node; |
4414 | 4698 | ||
4415 | 4699 | BUG_ON(!ext4_handle_valid(handle)); | |
4416 | BUG_ON(e4b->bd_bitmap_page == NULL); | 4700 | BUG_ON(e4b->bd_bitmap_page == NULL); |
4417 | BUG_ON(e4b->bd_buddy_page == NULL); | 4701 | BUG_ON(e4b->bd_buddy_page == NULL); |
4418 | 4702 | ||
4419 | new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); | ||
4420 | new_entry->start_blk = block; | ||
4421 | new_entry->group = group; | ||
4422 | new_entry->count = count; | ||
4423 | new_entry->t_tid = handle->h_transaction->t_tid; | ||
4424 | new_node = &new_entry->node; | 4703 | new_node = &new_entry->node; |
4704 | block = new_entry->start_blk; | ||
4425 | 4705 | ||
4426 | ext4_lock_group(sb, group); | ||
4427 | if (!*n) { | 4706 | if (!*n) { |
4428 | /* first free block exent. We need to | 4707 | /* first free block exent. We need to |
4429 | protect buddy cache from being freed, | 4708 | protect buddy cache from being freed, |
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4441 | else if (block >= (entry->start_blk + entry->count)) | 4720 | else if (block >= (entry->start_blk + entry->count)) |
4442 | n = &(*n)->rb_right; | 4721 | n = &(*n)->rb_right; |
4443 | else { | 4722 | else { |
4444 | ext4_unlock_group(sb, group); | 4723 | ext4_grp_locked_error(sb, e4b->bd_group, __func__, |
4445 | ext4_error(sb, __func__, | 4724 | "Double free of blocks %d (%d %d)", |
4446 | "Double free of blocks %d (%d %d)\n", | 4725 | block, entry->start_blk, entry->count); |
4447 | block, entry->start_blk, entry->count); | ||
4448 | return 0; | 4726 | return 0; |
4449 | } | 4727 | } |
4450 | } | 4728 | } |
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | |||
4483 | spin_lock(&sbi->s_md_lock); | 4761 | spin_lock(&sbi->s_md_lock); |
4484 | list_add(&new_entry->list, &handle->h_transaction->t_private_list); | 4762 | list_add(&new_entry->list, &handle->h_transaction->t_private_list); |
4485 | spin_unlock(&sbi->s_md_lock); | 4763 | spin_unlock(&sbi->s_md_lock); |
4486 | ext4_unlock_group(sb, group); | ||
4487 | return 0; | 4764 | return 0; |
4488 | } | 4765 | } |
4489 | 4766 | ||
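[Editor's note] The descent in ext4_mb_free_metadata() is the usual manual rbtree insertion: walk rb_left/rb_right while remembering the parent slot, then rb_link_node() and rb_insert_color() splice and rebalance. A sketch of that idiom with an illustrative extent type, including the overlap check that triggers the double-free error above:

    #include <linux/rbtree.h>
    #include <linux/errno.h>

    struct extent {
            struct rb_node node;
            int start;
            int count;
    };

    /* Insert `new` keyed by start; overlapping an existing extent means
     * a block was freed twice, mirroring the error path in the patch. */
    static int extent_insert(struct rb_root *root, struct extent *new)
    {
            struct rb_node **n = &root->rb_node, *parent = NULL;

            while (*n) {
                    struct extent *e = rb_entry(*n, struct extent, node);

                    parent = *n;
                    if (new->start < e->start)
                            n = &(*n)->rb_left;
                    else if (new->start >= e->start + e->count)
                            n = &(*n)->rb_right;
                    else
                            return -EEXIST;     /* double free */
            }
            rb_link_node(&new->node, parent, n);
            rb_insert_color(&new->node, root);
            return 0;
    }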
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, | |||
4499 | struct ext4_allocation_context *ac = NULL; | 4776 | struct ext4_allocation_context *ac = NULL; |
4500 | struct ext4_group_desc *gdp; | 4777 | struct ext4_group_desc *gdp; |
4501 | struct ext4_super_block *es; | 4778 | struct ext4_super_block *es; |
4502 | unsigned long overflow; | 4779 | unsigned int overflow; |
4503 | ext4_grpblk_t bit; | 4780 | ext4_grpblk_t bit; |
4504 | struct buffer_head *gd_bh; | 4781 | struct buffer_head *gd_bh; |
4505 | ext4_group_t block_group; | 4782 | ext4_group_t block_group; |
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, | |||
4522 | } | 4799 | } |
4523 | 4800 | ||
4524 | ext4_debug("freeing block %lu\n", block); | 4801 | ext4_debug("freeing block %lu\n", block); |
4802 | trace_mark(ext4_free_blocks, | ||
4803 | "dev %s block %llu count %lu metadata %d ino %lu", | ||
4804 | sb->s_id, (unsigned long long) block, count, metadata, | ||
4805 | inode ? inode->i_ino : 0); | ||
4525 | 4806 | ||
4526 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | 4807 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); |
4527 | if (ac) { | 4808 | if (ac) { |
@@ -4581,11 +4862,6 @@ do_more: | |||
4581 | err = ext4_journal_get_write_access(handle, gd_bh); | 4862 | err = ext4_journal_get_write_access(handle, gd_bh); |
4582 | if (err) | 4863 | if (err) |
4583 | goto error_return; | 4864 | goto error_return; |
4584 | |||
4585 | err = ext4_mb_load_buddy(sb, block_group, &e4b); | ||
4586 | if (err) | ||
4587 | goto error_return; | ||
4588 | |||
4589 | #ifdef AGGRESSIVE_CHECK | 4865 | #ifdef AGGRESSIVE_CHECK |
4590 | { | 4866 | { |
4591 | int i; | 4867 | int i; |
@@ -4593,13 +4869,6 @@ do_more: | |||
4593 | BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); | 4869 | BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); |
4594 | } | 4870 | } |
4595 | #endif | 4871 | #endif |
4596 | mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
4597 | bit, count); | ||
4598 | |||
4599 | /* We dirtied the bitmap block */ | ||
4600 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | ||
4601 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
4602 | |||
4603 | if (ac) { | 4872 | if (ac) { |
4604 | ac->ac_b_ex.fe_group = block_group; | 4873 | ac->ac_b_ex.fe_group = block_group; |
4605 | ac->ac_b_ex.fe_start = bit; | 4874 | ac->ac_b_ex.fe_start = bit; |
@@ -4607,19 +4876,41 @@ do_more: | |||
4607 | ext4_mb_store_history(ac); | 4876 | ext4_mb_store_history(ac); |
4608 | } | 4877 | } |
4609 | 4878 | ||
4610 | if (metadata) { | 4879 | err = ext4_mb_load_buddy(sb, block_group, &e4b); |
4611 | /* blocks being freed are metadata. these blocks shouldn't | 4880 | if (err) |
4612 | * be used until this transaction is committed */ | 4881 | goto error_return; |
4613 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); | 4882 | if (metadata && ext4_handle_valid(handle)) { |
4883 | struct ext4_free_data *new_entry; | ||
4884 | /* | ||
4885 | * blocks being freed are metadata. these blocks shouldn't | ||
4886 | * be used until this transaction is committed | ||
4887 | */ | ||
4888 | new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); | ||
4889 | new_entry->start_blk = bit; | ||
4890 | new_entry->group = block_group; | ||
4891 | new_entry->count = count; | ||
4892 | new_entry->t_tid = handle->h_transaction->t_tid; | ||
4893 | ext4_lock_group(sb, block_group); | ||
4894 | mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
4895 | bit, count); | ||
4896 | ext4_mb_free_metadata(handle, &e4b, new_entry); | ||
4897 | ext4_unlock_group(sb, block_group); | ||
4614 | } else { | 4898 | } else { |
4615 | ext4_lock_group(sb, block_group); | 4899 | ext4_lock_group(sb, block_group); |
4900 | /* need to update group_info->bb_free and bitmap | ||
4901 | * with the group lock held. generate_buddy looks at | ||
4902 | * them with the group lock held | ||
4903 | */ | ||
4904 | mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
4905 | bit, count); | ||
4616 | mb_free_blocks(inode, &e4b, bit, count); | 4906 | mb_free_blocks(inode, &e4b, bit, count); |
4617 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | 4907 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); |
4618 | ext4_unlock_group(sb, block_group); | 4908 | ext4_unlock_group(sb, block_group); |
4619 | } | 4909 | } |
4620 | 4910 | ||
4621 | spin_lock(sb_bgl_lock(sbi, block_group)); | 4911 | spin_lock(sb_bgl_lock(sbi, block_group)); |
4622 | le16_add_cpu(&gdp->bg_free_blocks_count, count); | 4912 | ret = ext4_free_blks_count(sb, gdp) + count; |
4913 | ext4_free_blks_set(sb, gdp, ret); | ||
4623 | gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); | 4914 | gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); |
4624 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 4915 | spin_unlock(sb_bgl_lock(sbi, block_group)); |
4625 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | 4916 | percpu_counter_add(&sbi->s_freeblocks_counter, count); |
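[Editor's note] The split above is the heart of the metadata path: freed metadata blocks are parked in the per-group rbtree and only returned to the buddy when the transaction commits (release_blocks_on_commit), since reusing them earlier could clobber blocks a crash-recovered journal still needs. A toy userspace model of the defer-then-commit flow (illustrative types, no locking):

    #include <stdlib.h>

    struct deferred {
            struct deferred *next;
            int start, count;
    };

    static struct deferred *pending;    /* per-transaction free list */

    /* Freeing metadata: remember the range, don't make it allocatable
     * yet. */
    static void defer_free(int start, int count)
    {
            struct deferred *d = malloc(sizeof(*d));

            if (!d)
                    return;             /* toy model: just drop it */
            d->start = start;
            d->count = count;
            d->next = pending;
            pending = d;
    }

    /* "Commit callback": only now do the ranges become reusable. */
    static void commit_frees(void (*make_free)(int start, int count))
    {
            while (pending) {
                    struct deferred *d = pending;

                    pending = d->next;
                    make_free(d->start, d->count);
                    free(d);
            }
    }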
@@ -4635,9 +4926,13 @@ do_more: | |||
4635 | 4926 | ||
4636 | *freed += count; | 4927 | *freed += count; |
4637 | 4928 | ||
4929 | /* We dirtied the bitmap block */ | ||
4930 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | ||
4931 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); | ||
4932 | |||
4638 | /* And the group descriptor block */ | 4933 | /* And the group descriptor block */ |
4639 | BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); | 4934 | BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); |
4640 | ret = ext4_journal_dirty_metadata(handle, gd_bh); | 4935 | ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); |
4641 | if (!err) | 4936 | if (!err) |
4642 | err = ret; | 4937 | err = ret; |
4643 | 4938 | ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b5dff1fff1e5..10a2921baf14 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/version.h> | 20 | #include <linux/version.h> |
21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/marker.h> | 22 | #include <linux/marker.h> |
23 | #include <linux/mutex.h> | ||
23 | #include "ext4_jbd2.h" | 24 | #include "ext4_jbd2.h" |
24 | #include "ext4.h" | 25 | #include "ext4.h" |
25 | #include "group.h" | 26 | #include "group.h" |
@@ -98,9 +99,6 @@ | |||
98 | */ | 99 | */ |
99 | #define MB_DEFAULT_GROUP_PREALLOC 512 | 100 | #define MB_DEFAULT_GROUP_PREALLOC 512 |
100 | 101 | ||
101 | static struct kmem_cache *ext4_pspace_cachep; | ||
102 | static struct kmem_cache *ext4_ac_cachep; | ||
103 | static struct kmem_cache *ext4_free_ext_cachep; | ||
104 | 102 | ||
105 | struct ext4_free_data { | 103 | struct ext4_free_data { |
106 | /* this links the free block information from group_info */ | 104 | /* this links the free block information from group_info */ |
@@ -120,26 +118,6 @@ struct ext4_free_data { | |||
120 | tid_t t_tid; | 118 | tid_t t_tid; |
121 | }; | 119 | }; |
122 | 120 | ||
123 | struct ext4_group_info { | ||
124 | unsigned long bb_state; | ||
125 | struct rb_root bb_free_root; | ||
126 | unsigned short bb_first_free; | ||
127 | unsigned short bb_free; | ||
128 | unsigned short bb_fragments; | ||
129 | struct list_head bb_prealloc_list; | ||
130 | #ifdef DOUBLE_CHECK | ||
131 | void *bb_bitmap; | ||
132 | #endif | ||
133 | unsigned short bb_counters[]; | ||
134 | }; | ||
135 | |||
136 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | ||
137 | #define EXT4_GROUP_INFO_LOCKED_BIT 1 | ||
138 | |||
139 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | ||
140 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | ||
141 | |||
142 | |||
143 | struct ext4_prealloc_space { | 121 | struct ext4_prealloc_space { |
144 | struct list_head pa_inode_list; | 122 | struct list_head pa_inode_list; |
145 | struct list_head pa_group_list; | 123 | struct list_head pa_group_list; |
@@ -217,6 +195,11 @@ struct ext4_allocation_context { | |||
217 | __u8 ac_op; /* operation, for history only */ | 195 | __u8 ac_op; /* operation, for history only */ |
218 | struct page *ac_bitmap_page; | 196 | struct page *ac_bitmap_page; |
219 | struct page *ac_buddy_page; | 197 | struct page *ac_buddy_page; |
198 | /* | ||
199 | * pointer to the held semaphore upon successful | ||
200 | * block allocation | ||
201 | */ | ||
202 | struct rw_semaphore *alloc_semp; | ||
220 | struct ext4_prealloc_space *ac_pa; | 203 | struct ext4_prealloc_space *ac_pa; |
221 | struct ext4_locality_group *ac_lg; | 204 | struct ext4_locality_group *ac_lg; |
222 | }; | 205 | }; |
@@ -250,6 +233,7 @@ struct ext4_buddy { | |||
250 | struct super_block *bd_sb; | 233 | struct super_block *bd_sb; |
251 | __u16 bd_blkbits; | 234 | __u16 bd_blkbits; |
252 | ext4_group_t bd_group; | 235 | ext4_group_t bd_group; |
236 | struct rw_semaphore *alloc_semp; | ||
253 | }; | 237 | }; |
254 | #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) | 238 | #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) |
255 | #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) | 239 | #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) |
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) | |||
259 | { | 243 | { |
260 | return; | 244 | return; |
261 | } | 245 | } |
262 | #else | ||
263 | static void ext4_mb_store_history(struct ext4_allocation_context *ac); | ||
264 | #endif | 246 | #endif |
265 | 247 | ||
266 | #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) | 248 | #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) |
267 | 249 | ||
268 | struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); | 250 | struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); |
269 | 251 | static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, | |
270 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
271 | ext4_group_t group); | ||
272 | static void ext4_mb_return_to_preallocation(struct inode *inode, | ||
273 | struct ext4_buddy *e4b, sector_t block, | ||
274 | int count); | ||
275 | static void ext4_mb_put_pa(struct ext4_allocation_context *, | ||
276 | struct super_block *, struct ext4_prealloc_space *pa); | ||
277 | static int ext4_mb_init_per_dev_proc(struct super_block *sb); | ||
278 | static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); | ||
279 | static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); | ||
280 | |||
281 | |||
282 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | ||
283 | { | ||
284 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
285 | |||
286 | bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
287 | } | ||
288 | |||
289 | static inline void ext4_unlock_group(struct super_block *sb, | ||
290 | ext4_group_t group) | ||
291 | { | ||
292 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
293 | |||
294 | bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
295 | } | ||
296 | |||
297 | static inline int ext4_is_group_locked(struct super_block *sb, | ||
298 | ext4_group_t group) | ||
299 | { | ||
300 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
301 | |||
302 | return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, | ||
303 | &(grinfo->bb_state)); | ||
304 | } | ||
305 | |||
306 | static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, | ||
307 | struct ext4_free_extent *fex) | 252 | struct ext4_free_extent *fex) |
308 | { | 253 | { |
309 | ext4_fsblk_t block; | 254 | ext4_fsblk_t block; |
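
The body of ext4_grp_offs_to_block() is truncated by the hunk, but its job follows from the names: map a (group, offset-in-group) pair from a struct ext4_free_extent to an absolute filesystem block number. A sketch of the likely computation (reconstructed, not copied from this diff):

    static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                                      struct ext4_free_extent *fex)
    {
            ext4_fsblk_t block;

            /* absolute block = group * blocks-per-group + offset + first data block */
            block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
                    + fex->fe_start
                    + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
            return block;
    }
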
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f2a9cf498ecd..734abca25e35 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode, | |||
59 | /* | 59 | /* |
60 | * Make sure the credit we accumulated is not really high | 60 | * Make sure the credit we accumulated is not really high |
61 | */ | 61 | */ |
62 | if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { | 62 | if (needed && ext4_handle_has_enough_credits(handle, |
63 | EXT4_RESERVE_TRANS_BLOCKS)) { | ||
63 | retval = ext4_journal_restart(handle, needed); | 64 | retval = ext4_journal_restart(handle, needed); |
64 | if (retval) | 65 | if (retval) |
65 | goto err_out; | 66 | goto err_out; |
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) | |||
229 | { | 230 | { |
230 | int retval = 0, needed; | 231 | int retval = 0, needed; |
231 | 232 | ||
232 | if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) | 233 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) |
233 | return 0; | 234 | return 0; |
234 | /* | 235 | /* |
235 | * We are freeing blocks. During this we touch | 236 | * We are freeing blocks. During this we touch |
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode) | |||
458 | struct list_blocks_struct lb; | 459 | struct list_blocks_struct lb; |
459 | unsigned long max_entries; | 460 | unsigned long max_entries; |
460 | 461 | ||
461 | if (!test_opt(inode->i_sb, EXTENTS)) | 462 | /* |
462 | /* | 463 | * If the filesystem does not support extents, or the inode |
463 | * if mounted with noextents we don't allow the migrate | 464 | * already is extent-based, error out. |
464 | */ | 465 | */ |
465 | return -EINVAL; | 466 | if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, |
466 | 467 | EXT4_FEATURE_INCOMPAT_EXTENTS) || | |
467 | if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | 468 | (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) |
468 | return -EINVAL; | 469 | return -EINVAL; |
469 | 470 | ||
470 | if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) | 471 | if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) |
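
Both migrate.c call sites stop inspecting handle->h_buffer_credits directly and go through ext4_handle_has_enough_credits() instead, which stays correct when the filesystem runs without a journal and the handle is only a sentinel (see the super.c hunks below). The helper is presumably a small inline in ext4_jbd2.h, along these lines:

    static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
    {
            /* a no-journal sentinel handle never runs out of credits */
            if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
                    return 0;
            return 1;
    }
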
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9fd2a5e1be4d..fec0b4c2f5f1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle, | |||
74 | #define assert(test) J_ASSERT(test) | 74 | #define assert(test) J_ASSERT(test) |
75 | #endif | 75 | #endif |
76 | 76 | ||
77 | #ifndef swap | ||
78 | #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) | ||
79 | #endif | ||
80 | |||
81 | #ifdef DX_DEBUG | 77 | #ifdef DX_DEBUG |
82 | #define dxtrace(command) command | 78 | #define dxtrace(command) command |
83 | #else | 79 | #else |
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir, | |||
372 | goto fail; | 368 | goto fail; |
373 | } | 369 | } |
374 | hinfo->hash_version = root->info.hash_version; | 370 | hinfo->hash_version = root->info.hash_version; |
371 | if (hinfo->hash_version <= DX_HASH_TEA) | ||
372 | hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; | ||
375 | hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; | 373 | hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; |
376 | if (d_name) | 374 | if (d_name) |
377 | ext4fs_dirhash(d_name->name, d_name->len, hinfo); | 375 | ext4fs_dirhash(d_name->name, d_name->len, hinfo); |
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, | |||
641 | dir = dir_file->f_path.dentry->d_inode; | 639 | dir = dir_file->f_path.dentry->d_inode; |
642 | if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { | 640 | if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { |
643 | hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; | 641 | hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; |
642 | if (hinfo.hash_version <= DX_HASH_TEA) | ||
643 | hinfo.hash_version += | ||
644 | EXT4_SB(dir->i_sb)->s_hash_unsigned; | ||
644 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; | 645 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; |
645 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, | 646 | count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, |
646 | start_hash, start_minor_hash); | 647 | start_hash, start_minor_hash); |
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name, | |||
806 | static inline int search_dirblock(struct buffer_head *bh, | 807 | static inline int search_dirblock(struct buffer_head *bh, |
807 | struct inode *dir, | 808 | struct inode *dir, |
808 | const struct qstr *d_name, | 809 | const struct qstr *d_name, |
809 | unsigned long offset, | 810 | unsigned int offset, |
810 | struct ext4_dir_entry_2 ** res_dir) | 811 | struct ext4_dir_entry_2 ** res_dir) |
811 | { | 812 | { |
812 | struct ext4_dir_entry_2 * de; | 813 | struct ext4_dir_entry_2 * de; |
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru | |||
1043 | bh = ext4_find_entry(dir, &dentry->d_name, &de); | 1044 | bh = ext4_find_entry(dir, &dentry->d_name, &de); |
1044 | inode = NULL; | 1045 | inode = NULL; |
1045 | if (bh) { | 1046 | if (bh) { |
1046 | unsigned long ino = le32_to_cpu(de->inode); | 1047 | __u32 ino = le32_to_cpu(de->inode); |
1047 | brelse(bh); | 1048 | brelse(bh); |
1048 | if (!ext4_valid_inum(dir->i_sb, ino)) { | 1049 | if (!ext4_valid_inum(dir->i_sb, ino)) { |
1049 | ext4_error(dir->i_sb, "ext4_lookup", | 1050 | ext4_error(dir->i_sb, "ext4_lookup", |
1050 | "bad inode number: %lu", ino); | 1051 | "bad inode number: %u", ino); |
1051 | return ERR_PTR(-EIO); | 1052 | return ERR_PTR(-EIO); |
1052 | } | 1053 | } |
1053 | inode = ext4_iget(dir->i_sb, ino); | 1054 | inode = ext4_iget(dir->i_sb, ino); |
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru | |||
1060 | 1061 | ||
1061 | struct dentry *ext4_get_parent(struct dentry *child) | 1062 | struct dentry *ext4_get_parent(struct dentry *child) |
1062 | { | 1063 | { |
1063 | unsigned long ino; | 1064 | __u32 ino; |
1064 | struct inode *inode; | 1065 | struct inode *inode; |
1065 | static const struct qstr dotdot = { | 1066 | static const struct qstr dotdot = { |
1066 | .name = "..", | 1067 | .name = "..", |
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child) | |||
1078 | 1079 | ||
1079 | if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { | 1080 | if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { |
1080 | ext4_error(child->d_inode->i_sb, "ext4_get_parent", | 1081 | ext4_error(child->d_inode->i_sb, "ext4_get_parent", |
1081 | "bad inode number: %lu", ino); | 1082 | "bad inode number: %u", ino); |
1082 | return ERR_PTR(-EIO); | 1083 | return ERR_PTR(-EIO); |
1083 | } | 1084 | } |
1084 | 1085 | ||
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | |||
1166 | u32 hash2; | 1167 | u32 hash2; |
1167 | struct dx_map_entry *map; | 1168 | struct dx_map_entry *map; |
1168 | char *data1 = (*bh)->b_data, *data2; | 1169 | char *data1 = (*bh)->b_data, *data2; |
1169 | unsigned split, move, size, i; | 1170 | unsigned split, move, size; |
1170 | struct ext4_dir_entry_2 *de = NULL, *de2; | 1171 | struct ext4_dir_entry_2 *de = NULL, *de2; |
1171 | int err = 0; | 1172 | int err = 0, i; |
1172 | 1173 | ||
1173 | bh2 = ext4_append (handle, dir, &newblock, &err); | 1174 | bh2 = ext4_append (handle, dir, &newblock, &err); |
1174 | if (!(bh2)) { | 1175 | if (!(bh2)) { |
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | |||
1228 | de = de2; | 1229 | de = de2; |
1229 | } | 1230 | } |
1230 | dx_insert_block(frame, hash2 + continued, newblock); | 1231 | dx_insert_block(frame, hash2 + continued, newblock); |
1231 | err = ext4_journal_dirty_metadata(handle, bh2); | 1232 | err = ext4_handle_dirty_metadata(handle, dir, bh2); |
1232 | if (err) | 1233 | if (err) |
1233 | goto journal_error; | 1234 | goto journal_error; |
1234 | err = ext4_journal_dirty_metadata(handle, frame->bh); | 1235 | err = ext4_handle_dirty_metadata(handle, dir, frame->bh); |
1235 | if (err) | 1236 | if (err) |
1236 | goto journal_error; | 1237 | goto journal_error; |
1237 | brelse(bh2); | 1238 | brelse(bh2); |
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | |||
1266 | struct inode *dir = dentry->d_parent->d_inode; | 1267 | struct inode *dir = dentry->d_parent->d_inode; |
1267 | const char *name = dentry->d_name.name; | 1268 | const char *name = dentry->d_name.name; |
1268 | int namelen = dentry->d_name.len; | 1269 | int namelen = dentry->d_name.len; |
1269 | unsigned long offset = 0; | 1270 | unsigned int offset = 0; |
1270 | unsigned short reclen; | 1271 | unsigned short reclen; |
1271 | int nlen, rlen, err; | 1272 | int nlen, rlen, err; |
1272 | char *top; | 1273 | char *top; |
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | |||
1335 | ext4_update_dx_flag(dir); | 1336 | ext4_update_dx_flag(dir); |
1336 | dir->i_version++; | 1337 | dir->i_version++; |
1337 | ext4_mark_inode_dirty(handle, dir); | 1338 | ext4_mark_inode_dirty(handle, dir); |
1338 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); | 1339 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
1339 | err = ext4_journal_dirty_metadata(handle, bh); | 1340 | err = ext4_handle_dirty_metadata(handle, dir, bh); |
1340 | if (err) | 1341 | if (err) |
1341 | ext4_std_error(dir->i_sb, err); | 1342 | ext4_std_error(dir->i_sb, err); |
1342 | brelse(bh); | 1343 | brelse(bh); |
@@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | |||
1408 | 1409 | ||
1409 | /* Initialize as for dx_probe */ | 1410 | /* Initialize as for dx_probe */ |
1410 | hinfo.hash_version = root->info.hash_version; | 1411 | hinfo.hash_version = root->info.hash_version; |
1412 | if (hinfo.hash_version <= DX_HASH_TEA) | ||
1413 | hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; | ||
1411 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; | 1414 | hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; |
1412 | ext4fs_dirhash(name, namelen, &hinfo); | 1415 | ext4fs_dirhash(name, namelen, &hinfo); |
1413 | frame = frames; | 1416 | frame = frames; |
@@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, | |||
1437 | struct inode *inode) | 1440 | struct inode *inode) |
1438 | { | 1441 | { |
1439 | struct inode *dir = dentry->d_parent->d_inode; | 1442 | struct inode *dir = dentry->d_parent->d_inode; |
1440 | unsigned long offset; | ||
1441 | struct buffer_head *bh; | 1443 | struct buffer_head *bh; |
1442 | struct ext4_dir_entry_2 *de; | 1444 | struct ext4_dir_entry_2 *de; |
1443 | struct super_block *sb; | 1445 | struct super_block *sb; |
@@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, | |||
1459 | ext4_mark_inode_dirty(handle, dir); | 1461 | ext4_mark_inode_dirty(handle, dir); |
1460 | } | 1462 | } |
1461 | blocks = dir->i_size >> sb->s_blocksize_bits; | 1463 | blocks = dir->i_size >> sb->s_blocksize_bits; |
1462 | for (block = 0, offset = 0; block < blocks; block++) { | 1464 | for (block = 0; block < blocks; block++) { |
1463 | bh = ext4_bread(handle, dir, block, 0, &retval); | 1465 | bh = ext4_bread(handle, dir, block, 0, &retval); |
1464 | if(!bh) | 1466 | if(!bh) |
1465 | return retval; | 1467 | return retval; |
@@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1574 | dxtrace(dx_show_index("node", frames[1].entries)); | 1576 | dxtrace(dx_show_index("node", frames[1].entries)); |
1575 | dxtrace(dx_show_index("node", | 1577 | dxtrace(dx_show_index("node", |
1576 | ((struct dx_node *) bh2->b_data)->entries)); | 1578 | ((struct dx_node *) bh2->b_data)->entries)); |
1577 | err = ext4_journal_dirty_metadata(handle, bh2); | 1579 | err = ext4_handle_dirty_metadata(handle, inode, bh2); |
1578 | if (err) | 1580 | if (err) |
1579 | goto journal_error; | 1581 | goto journal_error; |
1580 | brelse (bh2); | 1582 | brelse (bh2); |
@@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1600 | if (err) | 1602 | if (err) |
1601 | goto journal_error; | 1603 | goto journal_error; |
1602 | } | 1604 | } |
1603 | ext4_journal_dirty_metadata(handle, frames[0].bh); | 1605 | ext4_handle_dirty_metadata(handle, inode, frames[0].bh); |
1604 | } | 1606 | } |
1605 | de = do_split(handle, dir, &bh, frame, &hinfo, &err); | 1607 | de = do_split(handle, dir, &bh, frame, &hinfo, &err); |
1606 | if (!de) | 1608 | if (!de) |
@@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle, | |||
1646 | else | 1648 | else |
1647 | de->inode = 0; | 1649 | de->inode = 0; |
1648 | dir->i_version++; | 1650 | dir->i_version++; |
1649 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); | 1651 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
1650 | ext4_journal_dirty_metadata(handle, bh); | 1652 | ext4_handle_dirty_metadata(handle, dir, bh); |
1651 | return 0; | 1653 | return 0; |
1652 | } | 1654 | } |
1653 | i += ext4_rec_len_from_disk(de->rec_len); | 1655 | i += ext4_rec_len_from_disk(de->rec_len); |
@@ -1725,7 +1727,7 @@ retry: | |||
1725 | return PTR_ERR(handle); | 1727 | return PTR_ERR(handle); |
1726 | 1728 | ||
1727 | if (IS_DIRSYNC(dir)) | 1729 | if (IS_DIRSYNC(dir)) |
1728 | handle->h_sync = 1; | 1730 | ext4_handle_sync(handle); |
1729 | 1731 | ||
1730 | inode = ext4_new_inode (handle, dir, mode); | 1732 | inode = ext4_new_inode (handle, dir, mode); |
1731 | err = PTR_ERR(inode); | 1733 | err = PTR_ERR(inode); |
@@ -1759,7 +1761,7 @@ retry: | |||
1759 | return PTR_ERR(handle); | 1761 | return PTR_ERR(handle); |
1760 | 1762 | ||
1761 | if (IS_DIRSYNC(dir)) | 1763 | if (IS_DIRSYNC(dir)) |
1762 | handle->h_sync = 1; | 1764 | ext4_handle_sync(handle); |
1763 | 1765 | ||
1764 | inode = ext4_new_inode(handle, dir, mode); | 1766 | inode = ext4_new_inode(handle, dir, mode); |
1765 | err = PTR_ERR(inode); | 1767 | err = PTR_ERR(inode); |
@@ -1795,7 +1797,7 @@ retry: | |||
1795 | return PTR_ERR(handle); | 1797 | return PTR_ERR(handle); |
1796 | 1798 | ||
1797 | if (IS_DIRSYNC(dir)) | 1799 | if (IS_DIRSYNC(dir)) |
1798 | handle->h_sync = 1; | 1800 | ext4_handle_sync(handle); |
1799 | 1801 | ||
1800 | inode = ext4_new_inode(handle, dir, S_IFDIR | mode); | 1802 | inode = ext4_new_inode(handle, dir, S_IFDIR | mode); |
1801 | err = PTR_ERR(inode); | 1803 | err = PTR_ERR(inode); |
@@ -1824,8 +1826,8 @@ retry: | |||
1824 | strcpy(de->name, ".."); | 1826 | strcpy(de->name, ".."); |
1825 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); | 1827 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); |
1826 | inode->i_nlink = 2; | 1828 | inode->i_nlink = 2; |
1827 | BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); | 1829 | BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); |
1828 | ext4_journal_dirty_metadata(handle, dir_block); | 1830 | ext4_handle_dirty_metadata(handle, dir, dir_block); |
1829 | brelse(dir_block); | 1831 | brelse(dir_block); |
1830 | ext4_mark_inode_dirty(handle, inode); | 1832 | ext4_mark_inode_dirty(handle, inode); |
1831 | err = ext4_add_entry(handle, dentry, inode); | 1833 | err = ext4_add_entry(handle, dentry, inode); |
@@ -1854,7 +1856,7 @@ out_stop: | |||
1854 | */ | 1856 | */ |
1855 | static int empty_dir(struct inode *inode) | 1857 | static int empty_dir(struct inode *inode) |
1856 | { | 1858 | { |
1857 | unsigned long offset; | 1859 | unsigned int offset; |
1858 | struct buffer_head *bh; | 1860 | struct buffer_head *bh; |
1859 | struct ext4_dir_entry_2 *de, *de1; | 1861 | struct ext4_dir_entry_2 *de, *de1; |
1860 | struct super_block *sb; | 1862 | struct super_block *sb; |
@@ -1899,7 +1901,7 @@ static int empty_dir(struct inode *inode) | |||
1899 | if (err) | 1901 | if (err) |
1900 | ext4_error(sb, __func__, | 1902 | ext4_error(sb, __func__, |
1901 | "error %d reading directory" | 1903 | "error %d reading directory" |
1902 | " #%lu offset %lu", | 1904 | " #%lu offset %u", |
1903 | err, inode->i_ino, offset); | 1905 | err, inode->i_ino, offset); |
1904 | offset += sb->s_blocksize; | 1906 | offset += sb->s_blocksize; |
1905 | continue; | 1907 | continue; |
@@ -1937,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
1937 | struct ext4_iloc iloc; | 1939 | struct ext4_iloc iloc; |
1938 | int err = 0, rc; | 1940 | int err = 0, rc; |
1939 | 1941 | ||
1942 | if (!ext4_handle_valid(handle)) | ||
1943 | return 0; | ||
1944 | |||
1940 | lock_super(sb); | 1945 | lock_super(sb); |
1941 | if (!list_empty(&EXT4_I(inode)->i_orphan)) | 1946 | if (!list_empty(&EXT4_I(inode)->i_orphan)) |
1942 | goto out_unlock; | 1947 | goto out_unlock; |
@@ -1965,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
1965 | /* Insert this inode at the head of the on-disk orphan list... */ | 1970 | /* Insert this inode at the head of the on-disk orphan list... */ |
1966 | NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); | 1971 | NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); |
1967 | EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); | 1972 | EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); |
1968 | err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); | 1973 | err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); |
1969 | rc = ext4_mark_iloc_dirty(handle, inode, &iloc); | 1974 | rc = ext4_mark_iloc_dirty(handle, inode, &iloc); |
1970 | if (!err) | 1975 | if (!err) |
1971 | err = rc; | 1976 | err = rc; |
@@ -1999,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) | |||
1999 | struct list_head *prev; | 2004 | struct list_head *prev; |
2000 | struct ext4_inode_info *ei = EXT4_I(inode); | 2005 | struct ext4_inode_info *ei = EXT4_I(inode); |
2001 | struct ext4_sb_info *sbi; | 2006 | struct ext4_sb_info *sbi; |
2002 | unsigned long ino_next; | 2007 | __u32 ino_next; |
2003 | struct ext4_iloc iloc; | 2008 | struct ext4_iloc iloc; |
2004 | int err = 0; | 2009 | int err = 0; |
2005 | 2010 | ||
2011 | if (!ext4_handle_valid(handle)) | ||
2012 | return 0; | ||
2013 | |||
2006 | lock_super(inode->i_sb); | 2014 | lock_super(inode->i_sb); |
2007 | if (list_empty(&ei->i_orphan)) { | 2015 | if (list_empty(&ei->i_orphan)) { |
2008 | unlock_super(inode->i_sb); | 2016 | unlock_super(inode->i_sb); |
@@ -2021,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) | |||
2021 | * transaction handle with which to update the orphan list on | 2029 | * transaction handle with which to update the orphan list on |
2022 | * disk, but we still need to remove the inode from the linked | 2030 | * disk, but we still need to remove the inode from the linked |
2023 | * list in memory. */ | 2031 | * list in memory. */ |
2024 | if (!handle) | 2032 | if (sbi->s_journal && !handle) |
2025 | goto out; | 2033 | goto out; |
2026 | 2034 | ||
2027 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 2035 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
@@ -2029,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) | |||
2029 | goto out_err; | 2037 | goto out_err; |
2030 | 2038 | ||
2031 | if (prev == &sbi->s_orphan) { | 2039 | if (prev == &sbi->s_orphan) { |
2032 | jbd_debug(4, "superblock will point to %lu\n", ino_next); | 2040 | jbd_debug(4, "superblock will point to %u\n", ino_next); |
2033 | BUFFER_TRACE(sbi->s_sbh, "get_write_access"); | 2041 | BUFFER_TRACE(sbi->s_sbh, "get_write_access"); |
2034 | err = ext4_journal_get_write_access(handle, sbi->s_sbh); | 2042 | err = ext4_journal_get_write_access(handle, sbi->s_sbh); |
2035 | if (err) | 2043 | if (err) |
2036 | goto out_brelse; | 2044 | goto out_brelse; |
2037 | sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); | 2045 | sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); |
2038 | err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); | 2046 | err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); |
2039 | } else { | 2047 | } else { |
2040 | struct ext4_iloc iloc2; | 2048 | struct ext4_iloc iloc2; |
2041 | struct inode *i_prev = | 2049 | struct inode *i_prev = |
2042 | &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; | 2050 | &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; |
2043 | 2051 | ||
2044 | jbd_debug(4, "orphan inode %lu will point to %lu\n", | 2052 | jbd_debug(4, "orphan inode %lu will point to %u\n", |
2045 | i_prev->i_ino, ino_next); | 2053 | i_prev->i_ino, ino_next); |
2046 | err = ext4_reserve_inode_write(handle, i_prev, &iloc2); | 2054 | err = ext4_reserve_inode_write(handle, i_prev, &iloc2); |
2047 | if (err) | 2055 | if (err) |
@@ -2086,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) | |||
2086 | goto end_rmdir; | 2094 | goto end_rmdir; |
2087 | 2095 | ||
2088 | if (IS_DIRSYNC(dir)) | 2096 | if (IS_DIRSYNC(dir)) |
2089 | handle->h_sync = 1; | 2097 | ext4_handle_sync(handle); |
2090 | 2098 | ||
2091 | inode = dentry->d_inode; | 2099 | inode = dentry->d_inode; |
2092 | 2100 | ||
@@ -2140,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) | |||
2140 | return PTR_ERR(handle); | 2148 | return PTR_ERR(handle); |
2141 | 2149 | ||
2142 | if (IS_DIRSYNC(dir)) | 2150 | if (IS_DIRSYNC(dir)) |
2143 | handle->h_sync = 1; | 2151 | ext4_handle_sync(handle); |
2144 | 2152 | ||
2145 | retval = -ENOENT; | 2153 | retval = -ENOENT; |
2146 | bh = ext4_find_entry(dir, &dentry->d_name, &de); | 2154 | bh = ext4_find_entry(dir, &dentry->d_name, &de); |
@@ -2197,7 +2205,7 @@ retry: | |||
2197 | return PTR_ERR(handle); | 2205 | return PTR_ERR(handle); |
2198 | 2206 | ||
2199 | if (IS_DIRSYNC(dir)) | 2207 | if (IS_DIRSYNC(dir)) |
2200 | handle->h_sync = 1; | 2208 | ext4_handle_sync(handle); |
2201 | 2209 | ||
2202 | inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); | 2210 | inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); |
2203 | err = PTR_ERR(inode); | 2211 | err = PTR_ERR(inode); |
@@ -2260,7 +2268,7 @@ retry: | |||
2260 | return PTR_ERR(handle); | 2268 | return PTR_ERR(handle); |
2261 | 2269 | ||
2262 | if (IS_DIRSYNC(dir)) | 2270 | if (IS_DIRSYNC(dir)) |
2263 | handle->h_sync = 1; | 2271 | ext4_handle_sync(handle); |
2264 | 2272 | ||
2265 | inode->i_ctime = ext4_current_time(inode); | 2273 | inode->i_ctime = ext4_current_time(inode); |
2266 | ext4_inc_count(handle, inode); | 2274 | ext4_inc_count(handle, inode); |
@@ -2309,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2309 | return PTR_ERR(handle); | 2317 | return PTR_ERR(handle); |
2310 | 2318 | ||
2311 | if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) | 2319 | if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) |
2312 | handle->h_sync = 1; | 2320 | ext4_handle_sync(handle); |
2313 | 2321 | ||
2314 | old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); | 2322 | old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); |
2315 | /* | 2323 | /* |
@@ -2363,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2363 | new_dir->i_ctime = new_dir->i_mtime = | 2371 | new_dir->i_ctime = new_dir->i_mtime = |
2364 | ext4_current_time(new_dir); | 2372 | ext4_current_time(new_dir); |
2365 | ext4_mark_inode_dirty(handle, new_dir); | 2373 | ext4_mark_inode_dirty(handle, new_dir); |
2366 | BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); | 2374 | BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); |
2367 | ext4_journal_dirty_metadata(handle, new_bh); | 2375 | ext4_handle_dirty_metadata(handle, new_dir, new_bh); |
2368 | brelse(new_bh); | 2376 | brelse(new_bh); |
2369 | new_bh = NULL; | 2377 | new_bh = NULL; |
2370 | } | 2378 | } |
@@ -2414,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2414 | BUFFER_TRACE(dir_bh, "get_write_access"); | 2422 | BUFFER_TRACE(dir_bh, "get_write_access"); |
2415 | ext4_journal_get_write_access(handle, dir_bh); | 2423 | ext4_journal_get_write_access(handle, dir_bh); |
2416 | PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); | 2424 | PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); |
2417 | BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); | 2425 | BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); |
2418 | ext4_journal_dirty_metadata(handle, dir_bh); | 2426 | ext4_handle_dirty_metadata(handle, old_dir, dir_bh); |
2419 | ext4_dec_count(handle, old_dir); | 2427 | ext4_dec_count(handle, old_dir); |
2420 | if (new_inode) { | 2428 | if (new_inode) { |
2421 | /* checked empty_dir above, can't have another parent, | 2429 | /* checked empty_dir above, can't have another parent, |
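
The three identical `hash_version <= DX_HASH_TEA` fix-ups in namei.c implement the signed/unsigned char dirhash repair: on architectures where plain char is unsigned, the legacy hash functions historically produced different values, so such systems are steered to dedicated *_UNSIGNED hash versions. The numbering below is inferred from the `+=` trick rather than shown in this diff; s_hash_unsigned is assumed to be 3 on unsigned-char platforms and 0 elsewhere:

    #define DX_HASH_LEGACY             0
    #define DX_HASH_HALF_MD4           1
    #define DX_HASH_TEA                2
    #define DX_HASH_LEGACY_UNSIGNED    3
    #define DX_HASH_HALF_MD4_UNSIGNED  4
    #define DX_HASH_TEA_UNSIGNED       5

    /* remap a signed-char hash version to its unsigned twin when needed */
    if (hinfo->hash_version <= DX_HASH_TEA)
            hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
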
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6ec1843a015..c328be5d6885 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb, | |||
50 | ext4_get_group_no_and_offset(sb, start, NULL, &offset); | 50 | ext4_get_group_no_and_offset(sb, start, NULL, &offset); |
51 | if (group != sbi->s_groups_count) | 51 | if (group != sbi->s_groups_count) |
52 | ext4_warning(sb, __func__, | 52 | ext4_warning(sb, __func__, |
53 | "Cannot add at group %u (only %lu groups)", | 53 | "Cannot add at group %u (only %u groups)", |
54 | input->group, sbi->s_groups_count); | 54 | input->group, sbi->s_groups_count); |
55 | else if (offset != 0) | 55 | else if (offset != 0) |
56 | ext4_warning(sb, __func__, "Last group not full"); | 56 | ext4_warning(sb, __func__, "Last group not full"); |
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, | |||
149 | { | 149 | { |
150 | int err; | 150 | int err; |
151 | 151 | ||
152 | if (handle->h_buffer_credits >= thresh) | 152 | if (ext4_handle_has_enough_credits(handle, thresh)) |
153 | return 0; | 153 | return 0; |
154 | 154 | ||
155 | err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); | 155 | err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); |
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
232 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); | 232 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); |
233 | set_buffer_uptodate(gdb); | 233 | set_buffer_uptodate(gdb); |
234 | unlock_buffer(gdb); | 234 | unlock_buffer(gdb); |
235 | ext4_journal_dirty_metadata(handle, gdb); | 235 | ext4_handle_dirty_metadata(handle, NULL, gdb); |
236 | ext4_set_bit(bit, bh->b_data); | 236 | ext4_set_bit(bit, bh->b_data); |
237 | brelse(gdb); | 237 | brelse(gdb); |
238 | } | 238 | } |
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
251 | err = PTR_ERR(bh); | 251 | err = PTR_ERR(bh); |
252 | goto exit_bh; | 252 | goto exit_bh; |
253 | } | 253 | } |
254 | ext4_journal_dirty_metadata(handle, gdb); | 254 | ext4_handle_dirty_metadata(handle, NULL, gdb); |
255 | ext4_set_bit(bit, bh->b_data); | 255 | ext4_set_bit(bit, bh->b_data); |
256 | brelse(gdb); | 256 | brelse(gdb); |
257 | } | 257 | } |
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
276 | err = PTR_ERR(it); | 276 | err = PTR_ERR(it); |
277 | goto exit_bh; | 277 | goto exit_bh; |
278 | } | 278 | } |
279 | ext4_journal_dirty_metadata(handle, it); | 279 | ext4_handle_dirty_metadata(handle, NULL, it); |
280 | brelse(it); | 280 | brelse(it); |
281 | ext4_set_bit(bit, bh->b_data); | 281 | ext4_set_bit(bit, bh->b_data); |
282 | } | 282 | } |
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
284 | if ((err = extend_or_restart_transaction(handle, 2, bh))) | 284 | if ((err = extend_or_restart_transaction(handle, 2, bh))) |
285 | goto exit_bh; | 285 | goto exit_bh; |
286 | 286 | ||
287 | mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), | 287 | mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); |
288 | bh->b_data); | 288 | ext4_handle_dirty_metadata(handle, NULL, bh); |
289 | ext4_journal_dirty_metadata(handle, bh); | ||
290 | brelse(bh); | 289 | brelse(bh); |
291 | |||
292 | /* Mark unused entries in inode bitmap used */ | 290 | /* Mark unused entries in inode bitmap used */ |
293 | ext4_debug("clear inode bitmap %#04llx (+%llu)\n", | 291 | ext4_debug("clear inode bitmap %#04llx (+%llu)\n", |
294 | input->inode_bitmap, input->inode_bitmap - start); | 292 | input->inode_bitmap, input->inode_bitmap - start); |
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
297 | goto exit_journal; | 295 | goto exit_journal; |
298 | } | 296 | } |
299 | 297 | ||
300 | mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), | 298 | mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, |
301 | bh->b_data); | 299 | bh->b_data); |
302 | ext4_journal_dirty_metadata(handle, bh); | 300 | ext4_handle_dirty_metadata(handle, NULL, bh); |
303 | exit_bh: | 301 | exit_bh: |
304 | brelse(bh); | 302 | brelse(bh); |
305 | 303 | ||
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
486 | * reserved inode, and will become GDT blocks (primary and backup). | 484 | * reserved inode, and will become GDT blocks (primary and backup). |
487 | */ | 485 | */ |
488 | data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; | 486 | data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; |
489 | ext4_journal_dirty_metadata(handle, dind); | 487 | ext4_handle_dirty_metadata(handle, NULL, dind); |
490 | brelse(dind); | 488 | brelse(dind); |
491 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; | 489 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; |
492 | ext4_mark_iloc_dirty(handle, inode, &iloc); | 490 | ext4_mark_iloc_dirty(handle, inode, &iloc); |
493 | memset((*primary)->b_data, 0, sb->s_blocksize); | 491 | memset((*primary)->b_data, 0, sb->s_blocksize); |
494 | ext4_journal_dirty_metadata(handle, *primary); | 492 | ext4_handle_dirty_metadata(handle, NULL, *primary); |
495 | 493 | ||
496 | o_group_desc = EXT4_SB(sb)->s_group_desc; | 494 | o_group_desc = EXT4_SB(sb)->s_group_desc; |
497 | memcpy(n_group_desc, o_group_desc, | 495 | memcpy(n_group_desc, o_group_desc, |
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
502 | kfree(o_group_desc); | 500 | kfree(o_group_desc); |
503 | 501 | ||
504 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); | 502 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); |
505 | ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); | 503 | ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); |
506 | 504 | ||
507 | return 0; | 505 | return 0; |
508 | 506 | ||
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | |||
618 | primary[i]->b_blocknr, gdbackups, | 616 | primary[i]->b_blocknr, gdbackups, |
619 | blk + primary[i]->b_blocknr); */ | 617 | blk + primary[i]->b_blocknr); */ |
620 | data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); | 618 | data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); |
621 | err2 = ext4_journal_dirty_metadata(handle, primary[i]); | 619 | err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); |
622 | if (!err) | 620 | if (!err) |
623 | err = err2; | 621 | err = err2; |
624 | } | 622 | } |
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb, | |||
676 | struct buffer_head *bh; | 674 | struct buffer_head *bh; |
677 | 675 | ||
678 | /* Out of journal space, and can't get more - abort - so sad */ | 676 | /* Out of journal space, and can't get more - abort - so sad */ |
679 | if (handle->h_buffer_credits == 0 && | 677 | if (ext4_handle_valid(handle) && |
678 | handle->h_buffer_credits == 0 && | ||
680 | ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && | 679 | ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && |
681 | (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) | 680 | (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) |
682 | break; | 681 | break; |
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb, | |||
696 | memset(bh->b_data + size, 0, rest); | 695 | memset(bh->b_data + size, 0, rest); |
697 | set_buffer_uptodate(bh); | 696 | set_buffer_uptodate(bh); |
698 | unlock_buffer(bh); | 697 | unlock_buffer(bh); |
699 | ext4_journal_dirty_metadata(handle, bh); | 698 | ext4_handle_dirty_metadata(handle, NULL, bh); |
700 | brelse(bh); | 699 | brelse(bh); |
701 | } | 700 | } |
702 | if ((err2 = ext4_journal_stop(handle)) && !err) | 701 | if ((err2 = ext4_journal_stop(handle)) && !err) |
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb, | |||
715 | exit_err: | 714 | exit_err: |
716 | if (err) { | 715 | if (err) { |
717 | ext4_warning(sb, __func__, | 716 | ext4_warning(sb, __func__, |
718 | "can't update backup for group %lu (err %d), " | 717 | "can't update backup for group %u (err %d), " |
719 | "forcing fsck on next reboot", group, err); | 718 | "forcing fsck on next reboot", group, err); |
720 | sbi->s_mount_state &= ~EXT4_VALID_FS; | 719 | sbi->s_mount_state &= ~EXT4_VALID_FS; |
721 | sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); | 720 | sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); |
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
747 | struct inode *inode = NULL; | 746 | struct inode *inode = NULL; |
748 | handle_t *handle; | 747 | handle_t *handle; |
749 | int gdb_off, gdb_num; | 748 | int gdb_off, gdb_num; |
749 | int num_grp_locked = 0; | ||
750 | int err, err2; | 750 | int err, err2; |
751 | 751 | ||
752 | gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | 752 | gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); |
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
761 | 761 | ||
762 | if (ext4_blocks_count(es) + input->blocks_count < | 762 | if (ext4_blocks_count(es) + input->blocks_count < |
763 | ext4_blocks_count(es)) { | 763 | ext4_blocks_count(es)) { |
764 | ext4_warning(sb, __func__, "blocks_count overflow\n"); | 764 | ext4_warning(sb, __func__, "blocks_count overflow"); |
765 | return -EINVAL; | 765 | return -EINVAL; |
766 | } | 766 | } |
767 | 767 | ||
768 | if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < | 768 | if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < |
769 | le32_to_cpu(es->s_inodes_count)) { | 769 | le32_to_cpu(es->s_inodes_count)) { |
770 | ext4_warning(sb, __func__, "inodes_count overflow\n"); | 770 | ext4_warning(sb, __func__, "inodes_count overflow"); |
771 | return -EINVAL; | 771 | return -EINVAL; |
772 | } | 772 | } |
773 | 773 | ||
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
787 | } | 787 | } |
788 | } | 788 | } |
789 | 789 | ||
790 | |||
790 | if ((err = verify_group_input(sb, input))) | 791 | if ((err = verify_group_input(sb, input))) |
791 | goto exit_put; | 792 | goto exit_put; |
792 | 793 | ||
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
855 | * using the new disk blocks. | 856 | * using the new disk blocks. |
856 | */ | 857 | */ |
857 | 858 | ||
859 | num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); | ||
858 | /* Update group descriptor block for new group */ | 860 | /* Update group descriptor block for new group */ |
859 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + | 861 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + |
860 | gdb_off * EXT4_DESC_SIZE(sb)); | 862 | gdb_off * EXT4_DESC_SIZE(sb)); |
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
862 | ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ | 864 | ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ |
863 | ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ | 865 | ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ |
864 | ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ | 866 | ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ |
865 | gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); | 867 | ext4_free_blks_set(sb, gdp, input->free_blocks_count); |
866 | gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); | 868 | ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); |
869 | gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); | ||
867 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); | 870 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); |
868 | 871 | ||
869 | /* | 872 | /* |
870 | * We can allocate memory for mb_alloc based on the new group | 873 | * We can allocate memory for mb_alloc based on the new group |
871 | * descriptor | 874 | * descriptor |
872 | */ | 875 | */ |
873 | err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); | 876 | err = ext4_mb_add_groupinfo(sb, input->group, gdp); |
874 | if (err) | 877 | if (err) { |
878 | ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); | ||
875 | goto exit_journal; | 879 | goto exit_journal; |
880 | } | ||
876 | 881 | ||
877 | /* | 882 | /* |
878 | * Make the new blocks and inodes valid next. We do this before | 883 | * Make the new blocks and inodes valid next. We do this before |
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
914 | 919 | ||
915 | /* Update the global fs size fields */ | 920 | /* Update the global fs size fields */ |
916 | sbi->s_groups_count++; | 921 | sbi->s_groups_count++; |
922 | ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); | ||
917 | 923 | ||
918 | ext4_journal_dirty_metadata(handle, primary); | 924 | ext4_handle_dirty_metadata(handle, NULL, primary); |
919 | 925 | ||
920 | /* Update the reserved block counts only once the new group is | 926 | /* Update the reserved block counts only once the new group is |
921 | * active. */ | 927 | * active. */ |
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
937 | EXT4_INODES_PER_GROUP(sb); | 943 | EXT4_INODES_PER_GROUP(sb); |
938 | } | 944 | } |
939 | 945 | ||
940 | ext4_journal_dirty_metadata(handle, sbi->s_sbh); | 946 | ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); |
941 | sb->s_dirt = 1; | 947 | sb->s_dirt = 1; |
942 | 948 | ||
943 | exit_journal: | 949 | exit_journal: |
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
975 | struct buffer_head *bh; | 981 | struct buffer_head *bh; |
976 | handle_t *handle; | 982 | handle_t *handle; |
977 | int err; | 983 | int err; |
978 | unsigned long freed_blocks; | ||
979 | ext4_group_t group; | 984 | ext4_group_t group; |
980 | struct ext4_group_info *grp; | ||
981 | 985 | ||
982 | /* We don't need to worry about locking wrt other resizers just | 986 | /* We don't need to worry about locking wrt other resizers just |
983 | * yet: we're going to revalidate es->s_blocks_count after | 987 | * yet: we're going to revalidate es->s_blocks_count after |
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
997 | " too large to resize to %llu blocks safely\n", | 1001 | " too large to resize to %llu blocks safely\n", |
998 | sb->s_id, n_blocks_count); | 1002 | sb->s_id, n_blocks_count); |
999 | if (sizeof(sector_t) < 8) | 1003 | if (sizeof(sector_t) < 8) |
1000 | ext4_warning(sb, __func__, | 1004 | ext4_warning(sb, __func__, "CONFIG_LBD not enabled"); |
1001 | "CONFIG_LBD not enabled\n"); | ||
1002 | return -EINVAL; | 1005 | return -EINVAL; |
1003 | } | 1006 | } |
1004 | 1007 | ||
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
1071 | goto exit_put; | 1074 | goto exit_put; |
1072 | } | 1075 | } |
1073 | ext4_blocks_count_set(es, o_blocks_count + add); | 1076 | ext4_blocks_count_set(es, o_blocks_count + add); |
1074 | ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); | 1077 | ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); |
1075 | sb->s_dirt = 1; | 1078 | sb->s_dirt = 1; |
1076 | unlock_super(sb); | 1079 | unlock_super(sb); |
1077 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, | 1080 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, |
1078 | o_blocks_count + add); | 1081 | o_blocks_count + add); |
1079 | ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); | 1082 | /* We add the blocks to the bitmap and set the group need init bit */ |
1083 | ext4_add_groupblocks(handle, sb, o_blocks_count, add); | ||
1080 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, | 1084 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, |
1081 | o_blocks_count + add); | 1085 | o_blocks_count + add); |
1082 | if ((err = ext4_journal_stop(handle))) | 1086 | if ((err = ext4_journal_stop(handle))) |
1083 | goto exit_put; | 1087 | goto exit_put; |
1084 | 1088 | ||
1085 | /* | ||
1086 | * Mark mballoc pages as not up to date so that they will be updated | ||
1087 | * next time they are loaded by ext4_mb_load_buddy. | ||
1088 | * | ||
1089 | * XXX Bad, Bad, BAD!!! We should not be overloading the | ||
1090 | * Uptodate flag, particularly on the bitmap bh, as a way of | ||
1091 | * hinting to ext4_mb_load_buddy() that it needs to be | ||
1092 | * overloaded. A user could take an LVM snapshot, then do an | ||
1093 | * on-line fsck, and clear the uptodate flag, and this would | ||
1094 | * not be a bug in userspace, but a bug in the kernel. FIXME!!! | ||
1095 | */ | ||
1096 | { | ||
1097 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1098 | struct inode *inode = sbi->s_buddy_cache; | ||
1099 | int blocks_per_page; | ||
1100 | int block; | ||
1101 | int pnum; | ||
1102 | struct page *page; | ||
1103 | |||
1104 | /* Set buddy page as not up to date */ | ||
1105 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1106 | block = group * 2; | ||
1107 | pnum = block / blocks_per_page; | ||
1108 | page = find_get_page(inode->i_mapping, pnum); | ||
1109 | if (page != NULL) { | ||
1110 | ClearPageUptodate(page); | ||
1111 | page_cache_release(page); | ||
1112 | } | ||
1113 | |||
1114 | /* Set bitmap page as not up to date */ | ||
1115 | block++; | ||
1116 | pnum = block / blocks_per_page; | ||
1117 | page = find_get_page(inode->i_mapping, pnum); | ||
1118 | if (page != NULL) { | ||
1119 | ClearPageUptodate(page); | ||
1120 | page_cache_release(page); | ||
1121 | } | ||
1122 | |||
1123 | /* Get the info on the last group */ | ||
1124 | grp = ext4_get_group_info(sb, group); | ||
1125 | |||
1126 | /* Update free blocks in group info */ | ||
1127 | ext4_mb_update_group_info(grp, add); | ||
1128 | } | ||
1129 | |||
1130 | if (test_opt(sb, DEBUG)) | 1089 | if (test_opt(sb, DEBUG)) |
1131 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", | 1090 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", |
1132 | ext4_blocks_count(es)); | 1091 | ext4_blocks_count(es)); |
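
Among the resize.c changes, note the mark_bitmap_end() calls: the padding now runs to sb->s_blocksize * 8 bits instead of EXT4_BLOCKS_PER_GROUP(sb), because the unused tail of a bitmap must be marked in-use to the end of the bitmap block, and the two bounds differ whenever a group describes fewer blocks than one block's worth of bits. mark_bitmap_end() itself is outside this diff; a plausible shape, assuming it simply sets every bit in [start_bit, end_bit):

    void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
    {
            int i;

            if (start_bit >= end_bit)
                    return;

            /* set stragglers bit-by-bit up to the next byte boundary */
            for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
                    ext4_set_bit(i, bitmap);
            /* then fill whole bytes at once */
            if (i < end_bit)
                    memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
    }
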
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9494bb249390..e5f06a5f045e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -51,9 +51,7 @@ struct proc_dir_entry *ext4_proc_root; | |||
51 | 51 | ||
52 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | 52 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, |
53 | unsigned long journal_devnum); | 53 | unsigned long journal_devnum); |
54 | static int ext4_create_journal(struct super_block *, struct ext4_super_block *, | 54 | static int ext4_commit_super(struct super_block *sb, |
55 | unsigned int); | ||
56 | static void ext4_commit_super(struct super_block *sb, | ||
57 | struct ext4_super_block *es, int sync); | 55 | struct ext4_super_block *es, int sync); |
58 | static void ext4_mark_recovery_complete(struct super_block *sb, | 56 | static void ext4_mark_recovery_complete(struct super_block *sb, |
59 | struct ext4_super_block *es); | 57 | struct ext4_super_block *es); |
@@ -64,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, | |||
64 | char nbuf[16]); | 62 | char nbuf[16]); |
65 | static int ext4_remount(struct super_block *sb, int *flags, char *data); | 63 | static int ext4_remount(struct super_block *sb, int *flags, char *data); |
66 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); | 64 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); |
67 | static void ext4_unlockfs(struct super_block *sb); | 65 | static int ext4_unfreeze(struct super_block *sb); |
68 | static void ext4_write_super(struct super_block *sb); | 66 | static void ext4_write_super(struct super_block *sb); |
69 | static void ext4_write_super_lockfs(struct super_block *sb); | 67 | static int ext4_freeze(struct super_block *sb); |
70 | 68 | ||
71 | 69 | ||
72 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | 70 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, |
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, | |||
93 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); | 91 | (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); |
94 | } | 92 | } |
95 | 93 | ||
94 | __u32 ext4_free_blks_count(struct super_block *sb, | ||
95 | struct ext4_group_desc *bg) | ||
96 | { | ||
97 | return le16_to_cpu(bg->bg_free_blocks_count_lo) | | ||
98 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | ||
99 | (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); | ||
100 | } | ||
101 | |||
102 | __u32 ext4_free_inodes_count(struct super_block *sb, | ||
103 | struct ext4_group_desc *bg) | ||
104 | { | ||
105 | return le16_to_cpu(bg->bg_free_inodes_count_lo) | | ||
106 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | ||
107 | (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); | ||
108 | } | ||
109 | |||
110 | __u32 ext4_used_dirs_count(struct super_block *sb, | ||
111 | struct ext4_group_desc *bg) | ||
112 | { | ||
113 | return le16_to_cpu(bg->bg_used_dirs_count_lo) | | ||
114 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | ||
115 | (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); | ||
116 | } | ||
117 | |||
118 | __u32 ext4_itable_unused_count(struct super_block *sb, | ||
119 | struct ext4_group_desc *bg) | ||
120 | { | ||
121 | return le16_to_cpu(bg->bg_itable_unused_lo) | | ||
122 | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? | ||
123 | (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); | ||
124 | } | ||
125 | |||
96 | void ext4_block_bitmap_set(struct super_block *sb, | 126 | void ext4_block_bitmap_set(struct super_block *sb, |
97 | struct ext4_group_desc *bg, ext4_fsblk_t blk) | 127 | struct ext4_group_desc *bg, ext4_fsblk_t blk) |
98 | { | 128 | { |
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb, | |||
117 | bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); | 147 | bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); |
118 | } | 148 | } |
119 | 149 | ||
150 | void ext4_free_blks_set(struct super_block *sb, | ||
151 | struct ext4_group_desc *bg, __u32 count) | ||
152 | { | ||
153 | bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); | ||
154 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | ||
155 | bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); | ||
156 | } | ||
157 | |||
158 | void ext4_free_inodes_set(struct super_block *sb, | ||
159 | struct ext4_group_desc *bg, __u32 count) | ||
160 | { | ||
161 | bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); | ||
162 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | ||
163 | bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); | ||
164 | } | ||
165 | |||
166 | void ext4_used_dirs_set(struct super_block *sb, | ||
167 | struct ext4_group_desc *bg, __u32 count) | ||
168 | { | ||
169 | bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); | ||
170 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | ||
171 | bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); | ||
172 | } | ||
173 | |||
174 | void ext4_itable_unused_set(struct super_block *sb, | ||
175 | struct ext4_group_desc *bg, __u32 count) | ||
176 | { | ||
177 | bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); | ||
178 | if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) | ||
179 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); | ||
180 | } | ||
181 | |||
120 | /* | 182 | /* |
121 | * Wrappers for jbd2_journal_start/end. | 183 | * Wrappers for jbd2_journal_start/end. |
122 | * | 184 | * |
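
Each setter mirrors its getter: the low 16 bits always land in the _lo field, and the high half is stored only when the descriptor uses the 64-bit layout (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT). A worked example with an arbitrary count of 70000:

    /* 70000 == 0x11170 splits into hi = 0x0001, lo = 0x1170 */
    bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)70000);   /* 0x1170 */
    bg->bg_free_blocks_count_hi = cpu_to_le16(70000 >> 16);    /* 0x0001 */

    /* reading back: 0x1170 | (0x0001 << 16) == 0x11170 == 70000 */
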
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) | |||
136 | * backs (eg. EIO in the commit thread), then we still need to | 198 | * backs (eg. EIO in the commit thread), then we still need to |
137 | * take the FS itself readonly cleanly. */ | 199 | * take the FS itself readonly cleanly. */ |
138 | journal = EXT4_SB(sb)->s_journal; | 200 | journal = EXT4_SB(sb)->s_journal; |
139 | if (is_journal_aborted(journal)) { | 201 | if (journal) { |
140 | ext4_abort(sb, __func__, | 202 | if (is_journal_aborted(journal)) { |
141 | "Detected aborted journal"); | 203 | ext4_abort(sb, __func__, |
142 | return ERR_PTR(-EROFS); | 204 | "Detected aborted journal"); |
205 | return ERR_PTR(-EROFS); | ||
206 | } | ||
207 | return jbd2_journal_start(journal, nblocks); | ||
143 | } | 208 | } |
144 | 209 | /* | |
145 | return jbd2_journal_start(journal, nblocks); | 210 | * We're not journaling; return the appropriate indication. |
211 | */ | ||
212 | current->journal_info = EXT4_NOJOURNAL_HANDLE; | ||
213 | return current->journal_info; | ||
146 | } | 214 | } |
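
When no journal is present, ext4_journal_start_sb() now returns a sentinel instead of a jbd2 handle and parks it in current->journal_info so handle nesting still behaves. The sentinel's definition is not visible in this diff; a sketch of the assumed pair:

    /* an obviously invalid pointer standing in for a real handle */
    #define EXT4_NOJOURNAL_HANDLE  ((handle_t *) 0x1)

    static inline int ext4_handle_valid(handle_t *handle)
    {
            if (handle == EXT4_NOJOURNAL_HANDLE)
                    return 0;
            return 1;
    }
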
147 | 215 | ||
148 | /* | 216 | /* |
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle) | |||
157 | int err; | 225 | int err; |
158 | int rc; | 226 | int rc; |
159 | 227 | ||
228 | if (!ext4_handle_valid(handle)) { | ||
229 | /* | ||
230 | * Do this here since we don't call jbd2_journal_stop() in | ||
231 | * no-journal mode. | ||
232 | */ | ||
233 | current->journal_info = NULL; | ||
234 | return 0; | ||
235 | } | ||
160 | sb = handle->h_transaction->t_journal->j_private; | 236 | sb = handle->h_transaction->t_journal->j_private; |
161 | err = handle->h_err; | 237 | err = handle->h_err; |
162 | rc = jbd2_journal_stop(handle); | 238 | rc = jbd2_journal_stop(handle); |
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, | |||
174 | char nbuf[16]; | 250 | char nbuf[16]; |
175 | const char *errstr = ext4_decode_error(NULL, err, nbuf); | 251 | const char *errstr = ext4_decode_error(NULL, err, nbuf); |
176 | 252 | ||
253 | BUG_ON(!ext4_handle_valid(handle)); | ||
254 | |||
177 | if (bh) | 255 | if (bh) |
178 | BUFFER_TRACE(bh, "abort"); | 256 | BUFFER_TRACE(bh, "abort"); |
179 | 257 | ||
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function, | |||
350 | va_end(args); | 428 | va_end(args); |
351 | } | 429 | } |
352 | 430 | ||
431 | void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, | ||
432 | const char *function, const char *fmt, ...) | ||
433 | __releases(bitlock) | ||
434 | __acquires(bitlock) | ||
435 | { | ||
436 | va_list args; | ||
437 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | ||
438 | |||
439 | va_start(args, fmt); | ||
440 | printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); | ||
441 | vprintk(fmt, args); | ||
442 | printk("\n"); | ||
443 | va_end(args); | ||
444 | |||
445 | if (test_opt(sb, ERRORS_CONT)) { | ||
446 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | ||
447 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | ||
448 | ext4_commit_super(sb, es, 0); | ||
449 | return; | ||
450 | } | ||
451 | ext4_unlock_group(sb, grp); | ||
452 | ext4_handle_error(sb); | ||
453 | /* | ||
454 | * We only get here in the ERRORS_RO case; relocking the group | ||
455 | * may be dangerous, but nothing bad will happen since the | ||
456 | * filesystem will have already been marked read/only and the | ||
457 | * journal has been aborted. We return 1 as a hint to callers | ||
458 | * who might want to use the return value from | ||
459 | * ext4_grp_locked_error() to distinguish between the | ||
460 | * ERRORS_CONT and ERRORS_RO case, and perhaps return more | ||
461 | * aggressively from the ext4 function in question, with a | ||
462 | * more appropriate error code. | ||
463 | */ | ||
464 | ext4_lock_group(sb, grp); | ||
465 | return; | ||
466 | } | ||
467 | |||
468 | |||
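
ext4_grp_locked_error() exists because calling plain ext4_error() while holding a block-group lock can deadlock in the ERRORS_RO path; the group must be unlocked around ext4_handle_error() and retaken before returning. A hypothetical call site, to show the expected locking discipline (the consistency check and message are illustrative only):

    ext4_lock_group(sb, group);
    if (free != ext4_free_blks_count(sb, gdp))
            /* drops and retakes the group lock internally when needed */
            ext4_grp_locked_error(sb, group, __func__,
                                  "computed free count %u != descriptor %u",
                                  free, ext4_free_blks_count(sb, gdp));
    ext4_unlock_group(sb, group);
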
353 | void ext4_update_dynamic_rev(struct super_block *sb) | 469 | void ext4_update_dynamic_rev(struct super_block *sb) |
354 | { | 470 | { |
355 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 471 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev) | |||
389 | return bdev; | 505 | return bdev; |
390 | 506 | ||
391 | fail: | 507 | fail: |
392 | printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", | 508 | printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", |
393 | __bdevname(dev, b), PTR_ERR(bdev)); | 509 | __bdevname(dev, b), PTR_ERR(bdev)); |
394 | return NULL; | 510 | return NULL; |
395 | } | 511 | } |
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb) | |||
448 | ext4_mb_release(sb); | 564 | ext4_mb_release(sb); |
449 | ext4_ext_release(sb); | 565 | ext4_ext_release(sb); |
450 | ext4_xattr_put_super(sb); | 566 | ext4_xattr_put_super(sb); |
451 | err = jbd2_journal_destroy(sbi->s_journal); | 567 | if (sbi->s_journal) { |
452 | sbi->s_journal = NULL; | 568 | err = jbd2_journal_destroy(sbi->s_journal); |
453 | if (err < 0) | 569 | sbi->s_journal = NULL; |
454 | ext4_abort(sb, __func__, "Couldn't clean up the journal"); | 570 | if (err < 0) |
455 | 571 | ext4_abort(sb, __func__, | |
572 | "Couldn't clean up the journal"); | ||
573 | } | ||
456 | if (!(sb->s_flags & MS_RDONLY)) { | 574 | if (!(sb->s_flags & MS_RDONLY)) { |
457 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 575 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
458 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 576 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
522 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); | 640 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); |
523 | INIT_LIST_HEAD(&ei->i_prealloc_list); | 641 | INIT_LIST_HEAD(&ei->i_prealloc_list); |
524 | spin_lock_init(&ei->i_prealloc_lock); | 642 | spin_lock_init(&ei->i_prealloc_lock); |
643 | /* | ||
644 | * Note: We can be called before EXT4_SB(sb)->s_journal is set, | ||
645 | * therefore it can be null here. Don't check it, just initialize | ||
646 | * jinode. | ||
647 | */ | ||
525 | jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); | 648 | jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); |
526 | ei->i_reserved_data_blocks = 0; | 649 | ei->i_reserved_data_blocks = 0; |
527 | ei->i_reserved_meta_blocks = 0; | 650 | ei->i_reserved_meta_blocks = 0; |
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode) | |||
588 | } | 711 | } |
589 | #endif | 712 | #endif |
590 | ext4_discard_preallocations(inode); | 713 | ext4_discard_preallocations(inode); |
591 | jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, | 714 | if (EXT4_JOURNAL(inode)) |
715 | jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, | ||
592 | &EXT4_I(inode)->jinode); | 716 | &EXT4_I(inode)->jinode); |
593 | } | 717 | } |
594 | 718 | ||
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
681 | #endif | 805 | #endif |
682 | if (!test_opt(sb, RESERVATION)) | 806 | if (!test_opt(sb, RESERVATION)) |
683 | seq_puts(seq, ",noreservation"); | 807 | seq_puts(seq, ",noreservation"); |
684 | if (sbi->s_commit_interval) { | 808 | if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { |
685 | seq_printf(seq, ",commit=%u", | 809 | seq_printf(seq, ",commit=%u", |
686 | (unsigned) (sbi->s_commit_interval / HZ)); | 810 | (unsigned) (sbi->s_commit_interval / HZ)); |
687 | } | 811 | } |
812 | if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { | ||
813 | seq_printf(seq, ",min_batch_time=%u", | ||
814 | (unsigned) sbi->s_min_batch_time); | ||
815 | } | ||
816 | if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { | ||
817 | seq_printf(seq, ",max_batch_time=%u", | ||
818 | (unsigned) sbi->s_max_batch_time); | ||
819 | } | ||
820 | |||
688 | /* | 821 | /* |
689 | * We're changing the default of barrier mount option, so | 822 | * We're changing the default of barrier mount option, so |
690 | * let's always display its mount state so it's clear what its | 823 | * let's always display its mount state so it's clear what its |
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
696 | seq_puts(seq, ",journal_async_commit"); | 829 | seq_puts(seq, ",journal_async_commit"); |
697 | if (test_opt(sb, NOBH)) | 830 | if (test_opt(sb, NOBH)) |
698 | seq_puts(seq, ",nobh"); | 831 | seq_puts(seq, ",nobh"); |
699 | if (!test_opt(sb, EXTENTS)) | ||
700 | seq_puts(seq, ",noextents"); | ||
701 | if (test_opt(sb, I_VERSION)) | 832 | if (test_opt(sb, I_VERSION)) |
702 | seq_puts(seq, ",i_version"); | 833 | seq_puts(seq, ",i_version"); |
703 | if (!test_opt(sb, DELALLOC)) | 834 | if (!test_opt(sb, DELALLOC)) |
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, | |||
772 | ext4_nfs_get_inode); | 903 | ext4_nfs_get_inode); |
773 | } | 904 | } |
774 | 905 | ||
906 | /* | ||
907 | * Try to release metadata pages (indirect blocks, directories) which are | ||
908 | * mapped via the block device. Since these pages could have journal heads | ||
909 | * which would prevent try_to_free_buffers() from freeing them, we must use | ||
910 | * jbd2 layer's try_to_free_buffers() function to release them. | ||
911 | */ | ||
912 | static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) | ||
913 | { | ||
914 | journal_t *journal = EXT4_SB(sb)->s_journal; | ||
915 | |||
916 | WARN_ON(PageChecked(page)); | ||
917 | if (!page_has_buffers(page)) | ||
918 | return 0; | ||
919 | if (journal) | ||
920 | return jbd2_journal_try_to_free_buffers(journal, page, | ||
921 | wait & ~__GFP_WAIT); | ||
922 | return try_to_free_buffers(page); | ||
923 | } | ||
924 | |||
775 | #ifdef CONFIG_QUOTA | 925 | #ifdef CONFIG_QUOTA |
776 | #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") | 926 | #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") |
777 | #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) | 927 | #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) |
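bdev_try_to_free_page() above is wired up as a new super_operations hook (see the ext4_sops table below): the intent is that the block device's releasepage path can hand metadata pages back to the owning filesystem, so buffers pinned by journal heads get freed through jbd2 rather than leaking. A rough sketch of the caller side -- the VFS plumbing here is an assumption, not part of this diff:

    /* Hypothetical caller in the block-device releasepage path. */
    static int blkdev_releasepage(struct page *page, gfp_t wait)
    {
            struct super_block *super =
                    BDEV_I(page->mapping->host)->bdev.bd_super; /* assumed lookup */

            if (super && super->s_op->bdev_try_to_free_page)
                    return super->s_op->bdev_try_to_free_page(super, page, wait);
            return try_to_free_buffers(page);
    }

Note that the hook masks off __GFP_WAIT before calling into jbd2, presumably so the journal layer never tries to sleep on behalf of this reclaim-side call.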
@@ -828,8 +978,8 @@ static const struct super_operations ext4_sops = { | |||
828 | .put_super = ext4_put_super, | 978 | .put_super = ext4_put_super, |
829 | .write_super = ext4_write_super, | 979 | .write_super = ext4_write_super, |
830 | .sync_fs = ext4_sync_fs, | 980 | .sync_fs = ext4_sync_fs, |
831 | .write_super_lockfs = ext4_write_super_lockfs, | 981 | .freeze_fs = ext4_freeze, |
832 | .unlockfs = ext4_unlockfs, | 982 | .unfreeze_fs = ext4_unfreeze, |
833 | .statfs = ext4_statfs, | 983 | .statfs = ext4_statfs, |
834 | .remount_fs = ext4_remount, | 984 | .remount_fs = ext4_remount, |
835 | .clear_inode = ext4_clear_inode, | 985 | .clear_inode = ext4_clear_inode, |
@@ -838,6 +988,7 @@ static const struct super_operations ext4_sops = { | |||
838 | .quota_read = ext4_quota_read, | 988 | .quota_read = ext4_quota_read, |
839 | .quota_write = ext4_quota_write, | 989 | .quota_write = ext4_quota_write, |
840 | #endif | 990 | #endif |
991 | .bdev_try_to_free_page = bdev_try_to_free_page, | ||
841 | }; | 992 | }; |
842 | 993 | ||
843 | static const struct export_operations ext4_export_ops = { | 994 | static const struct export_operations ext4_export_ops = { |
@@ -852,16 +1003,17 @@ enum { | |||
852 | Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, | 1003 | Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, |
853 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, | 1004 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, |
854 | Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, | 1005 | Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, |
855 | Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, | 1006 | Opt_commit, Opt_min_batch_time, Opt_max_batch_time, |
1007 | Opt_journal_update, Opt_journal_dev, | ||
856 | Opt_journal_checksum, Opt_journal_async_commit, | 1008 | Opt_journal_checksum, Opt_journal_async_commit, |
857 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, | 1009 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, |
858 | Opt_data_err_abort, Opt_data_err_ignore, | 1010 | Opt_data_err_abort, Opt_data_err_ignore, |
859 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, | 1011 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, |
860 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | 1012 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, |
861 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, | 1013 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, |
862 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, | 1014 | Opt_grpquota, Opt_i_version, |
863 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, | 1015 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, |
864 | Opt_inode_readahead_blks | 1016 | Opt_inode_readahead_blks, Opt_journal_ioprio |
865 | }; | 1017 | }; |
866 | 1018 | ||
867 | static const match_table_t tokens = { | 1019 | static const match_table_t tokens = { |
@@ -891,8 +1043,9 @@ static const match_table_t tokens = { | |||
891 | {Opt_nobh, "nobh"}, | 1043 | {Opt_nobh, "nobh"}, |
892 | {Opt_bh, "bh"}, | 1044 | {Opt_bh, "bh"}, |
893 | {Opt_commit, "commit=%u"}, | 1045 | {Opt_commit, "commit=%u"}, |
1046 | {Opt_min_batch_time, "min_batch_time=%u"}, | ||
1047 | {Opt_max_batch_time, "max_batch_time=%u"}, | ||
894 | {Opt_journal_update, "journal=update"}, | 1048 | {Opt_journal_update, "journal=update"}, |
895 | {Opt_journal_inum, "journal=%u"}, | ||
896 | {Opt_journal_dev, "journal_dev=%u"}, | 1049 | {Opt_journal_dev, "journal_dev=%u"}, |
897 | {Opt_journal_checksum, "journal_checksum"}, | 1050 | {Opt_journal_checksum, "journal_checksum"}, |
898 | {Opt_journal_async_commit, "journal_async_commit"}, | 1051 | {Opt_journal_async_commit, "journal_async_commit"}, |
@@ -913,14 +1066,13 @@ static const match_table_t tokens = { | |||
913 | {Opt_quota, "quota"}, | 1066 | {Opt_quota, "quota"}, |
914 | {Opt_usrquota, "usrquota"}, | 1067 | {Opt_usrquota, "usrquota"}, |
915 | {Opt_barrier, "barrier=%u"}, | 1068 | {Opt_barrier, "barrier=%u"}, |
916 | {Opt_extents, "extents"}, | ||
917 | {Opt_noextents, "noextents"}, | ||
918 | {Opt_i_version, "i_version"}, | 1069 | {Opt_i_version, "i_version"}, |
919 | {Opt_stripe, "stripe=%u"}, | 1070 | {Opt_stripe, "stripe=%u"}, |
920 | {Opt_resize, "resize"}, | 1071 | {Opt_resize, "resize"}, |
921 | {Opt_delalloc, "delalloc"}, | 1072 | {Opt_delalloc, "delalloc"}, |
922 | {Opt_nodelalloc, "nodelalloc"}, | 1073 | {Opt_nodelalloc, "nodelalloc"}, |
923 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, | 1074 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, |
1075 | {Opt_journal_ioprio, "journal_ioprio=%u"}, | ||
924 | {Opt_err, NULL}, | 1076 | {Opt_err, NULL}, |
925 | }; | 1077 | }; |
926 | 1078 | ||
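The new table entries follow the kernel's match_table_t pattern: a %u in the pattern string captures a substring that match_int() later converts. A minimal standalone example of the flow (the option value and variable names are illustrative):

    substring_t args[MAX_OPT_ARGS];
    char opt[] = "min_batch_time=500";
    int option;

    int token = match_token(opt, tokens, args);
    if (token == Opt_min_batch_time && !match_int(&args[0], &option))
            /* option == 500, in microseconds, per the handler below */;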
@@ -945,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data) | |||
945 | return sb_block; | 1097 | return sb_block; |
946 | } | 1098 | } |
947 | 1099 | ||
1100 | #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) | ||
1101 | |||
948 | static int parse_options(char *options, struct super_block *sb, | 1102 | static int parse_options(char *options, struct super_block *sb, |
949 | unsigned int *inum, unsigned long *journal_devnum, | 1103 | unsigned long *journal_devnum, |
1104 | unsigned int *journal_ioprio, | ||
950 | ext4_fsblk_t *n_blocks_count, int is_remount) | 1105 | ext4_fsblk_t *n_blocks_count, int is_remount) |
951 | { | 1106 | { |
952 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1107 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
@@ -958,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb, | |||
958 | int qtype, qfmt; | 1113 | int qtype, qfmt; |
959 | char *qname; | 1114 | char *qname; |
960 | #endif | 1115 | #endif |
961 | ext4_fsblk_t last_block; | ||
962 | 1116 | ||
963 | if (!options) | 1117 | if (!options) |
964 | return 1; | 1118 | return 1; |
@@ -1070,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb, | |||
1070 | } | 1224 | } |
1071 | set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); | 1225 | set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); |
1072 | break; | 1226 | break; |
1073 | case Opt_journal_inum: | ||
1074 | if (is_remount) { | ||
1075 | printk(KERN_ERR "EXT4-fs: cannot specify " | ||
1076 | "journal on remount\n"); | ||
1077 | return 0; | ||
1078 | } | ||
1079 | if (match_int(&args[0], &option)) | ||
1080 | return 0; | ||
1081 | *inum = option; | ||
1082 | break; | ||
1083 | case Opt_journal_dev: | 1227 | case Opt_journal_dev: |
1084 | if (is_remount) { | 1228 | if (is_remount) { |
1085 | printk(KERN_ERR "EXT4-fs: cannot specify " | 1229 | printk(KERN_ERR "EXT4-fs: cannot specify " |
@@ -1109,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb, | |||
1109 | option = JBD2_DEFAULT_MAX_COMMIT_AGE; | 1253 | option = JBD2_DEFAULT_MAX_COMMIT_AGE; |
1110 | sbi->s_commit_interval = HZ * option; | 1254 | sbi->s_commit_interval = HZ * option; |
1111 | break; | 1255 | break; |
1256 | case Opt_max_batch_time: | ||
1257 | if (match_int(&args[0], &option)) | ||
1258 | return 0; | ||
1259 | if (option < 0) | ||
1260 | return 0; | ||
1261 | if (option == 0) | ||
1262 | option = EXT4_DEF_MAX_BATCH_TIME; | ||
1263 | sbi->s_max_batch_time = option; | ||
1264 | break; | ||
1265 | case Opt_min_batch_time: | ||
1266 | if (match_int(&args[0], &option)) | ||
1267 | return 0; | ||
1268 | if (option < 0) | ||
1269 | return 0; | ||
1270 | sbi->s_min_batch_time = option; | ||
1271 | break; | ||
1112 | case Opt_data_journal: | 1272 | case Opt_data_journal: |
1113 | data_opt = EXT4_MOUNT_JOURNAL_DATA; | 1273 | data_opt = EXT4_MOUNT_JOURNAL_DATA; |
1114 | goto datacheck; | 1274 | goto datacheck; |
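Both knobs are in microseconds and feed jbd2's commit batching: when a synchronous handle completes, the task may briefly sleep so other tasks can fold their updates into the same transaction. min_batch_time forces at least that much batching; max_batch_time caps it. The clamp below models how the two bound the sleep -- the measured-average input is an assumption about jbd2's internals, not shown in this diff:

    /* Model of how the knobs bound the batching sleep (microseconds). */
    static unsigned long batch_sleep_us(unsigned long avg_commit_us,
                                        unsigned long min_us,
                                        unsigned long max_us)
    {
            if (avg_commit_us < min_us)
                    return min_us;
            if (avg_commit_us > max_us)
                    return max_us;
            return avg_commit_us;
    }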
@@ -1279,33 +1439,6 @@ set_qf_format: | |||
1279 | case Opt_bh: | 1439 | case Opt_bh: |
1280 | clear_opt(sbi->s_mount_opt, NOBH); | 1440 | clear_opt(sbi->s_mount_opt, NOBH); |
1281 | break; | 1441 | break; |
1282 | case Opt_extents: | ||
1283 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, | ||
1284 | EXT4_FEATURE_INCOMPAT_EXTENTS)) { | ||
1285 | ext4_warning(sb, __func__, | ||
1286 | "extents feature not enabled " | ||
1287 | "on this filesystem, use tune2fs\n"); | ||
1288 | return 0; | ||
1289 | } | ||
1290 | set_opt(sbi->s_mount_opt, EXTENTS); | ||
1291 | break; | ||
1292 | case Opt_noextents: | ||
1293 | /* | ||
1294 | * When e2fsprogs support resizing an already existing | ||
1295 | * ext3 file system to greater than 2**32 we need to | ||
1296 | * add support to block allocator to handle growing | ||
1297 | * already existing block mapped inode so that blocks | ||
1298 | * allocated for them fall within 2**32 | ||
1299 | */ | ||
1300 | last_block = ext4_blocks_count(sbi->s_es) - 1; | ||
1301 | if (last_block > 0xffffffffULL) { | ||
1302 | printk(KERN_ERR "EXT4-fs: Filesystem too " | ||
1303 | "large to mount with " | ||
1304 | "-o noextents options\n"); | ||
1305 | return 0; | ||
1306 | } | ||
1307 | clear_opt(sbi->s_mount_opt, EXTENTS); | ||
1308 | break; | ||
1309 | case Opt_i_version: | 1442 | case Opt_i_version: |
1310 | set_opt(sbi->s_mount_opt, I_VERSION); | 1443 | set_opt(sbi->s_mount_opt, I_VERSION); |
1311 | sb->s_flags |= MS_I_VERSION; | 1444 | sb->s_flags |= MS_I_VERSION; |
@@ -1330,6 +1463,14 @@ set_qf_format: | |||
1330 | return 0; | 1463 | return 0; |
1331 | sbi->s_inode_readahead_blks = option; | 1464 | sbi->s_inode_readahead_blks = option; |
1332 | break; | 1465 | break; |
1466 | case Opt_journal_ioprio: | ||
1467 | if (match_int(&args[0], &option)) | ||
1468 | return 0; | ||
1469 | if (option < 0 || option > 7) | ||
1470 | break; | ||
1471 | *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, | ||
1472 | option); | ||
1473 | break; | ||
1333 | default: | 1474 | default: |
1334 | printk(KERN_ERR | 1475 | printk(KERN_ERR |
1335 | "EXT4-fs: Unrecognized mount option \"%s\" " | 1476 | "EXT4-fs: Unrecognized mount option \"%s\" " |
@@ -1405,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
1405 | printk(KERN_WARNING | 1546 | printk(KERN_WARNING |
1406 | "EXT4-fs warning: checktime reached, " | 1547 | "EXT4-fs warning: checktime reached, " |
1407 | "running e2fsck is recommended\n"); | 1548 | "running e2fsck is recommended\n"); |
1408 | #if 0 | 1549 | if (!sbi->s_journal) |
1409 | /* @@@ We _will_ want to clear the valid bit if we find | 1550 | es->s_state &= cpu_to_le16(~EXT4_VALID_FS); |
1410 | * inconsistencies, to force a fsck at reboot. But for | ||
1411 | * a plain journaled filesystem we can keep it set as | ||
1412 | * valid forever! :) | ||
1413 | */ | ||
1414 | es->s_state &= cpu_to_le16(~EXT4_VALID_FS); | ||
1415 | #endif | ||
1416 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) | 1551 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) |
1417 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); | 1552 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); |
1418 | le16_add_cpu(&es->s_mnt_count, 1); | 1553 | le16_add_cpu(&es->s_mnt_count, 1); |
1419 | es->s_mtime = cpu_to_le32(get_seconds()); | 1554 | es->s_mtime = cpu_to_le32(get_seconds()); |
1420 | ext4_update_dynamic_rev(sb); | 1555 | ext4_update_dynamic_rev(sb); |
1421 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 1556 | if (sbi->s_journal) |
1557 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | ||
1422 | 1558 | ||
1423 | ext4_commit_super(sb, es, 1); | 1559 | ext4_commit_super(sb, es, 1); |
1424 | if (test_opt(sb, DEBUG)) | 1560 | if (test_opt(sb, DEBUG)) |
1425 | printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " | 1561 | printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " |
1426 | "bpg=%lu, ipg=%lu, mo=%04lx]\n", | 1562 | "bpg=%lu, ipg=%lu, mo=%04lx]\n", |
1427 | sb->s_blocksize, | 1563 | sb->s_blocksize, |
1428 | sbi->s_groups_count, | 1564 | sbi->s_groups_count, |
@@ -1430,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
1430 | EXT4_INODES_PER_GROUP(sb), | 1566 | EXT4_INODES_PER_GROUP(sb), |
1431 | sbi->s_mount_opt); | 1567 | sbi->s_mount_opt); |
1432 | 1568 | ||
1433 | printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", | 1569 | if (EXT4_SB(sb)->s_journal) { |
1434 | sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : | 1570 | printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", |
1435 | "external", EXT4_SB(sb)->s_journal->j_devname); | 1571 | sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : |
1572 | "external", EXT4_SB(sb)->s_journal->j_devname); | ||
1573 | } else { | ||
1574 | printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); | ||
1575 | } | ||
1436 | return res; | 1576 | return res; |
1437 | } | 1577 | } |
1438 | 1578 | ||
@@ -1444,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1444 | ext4_group_t flex_group_count; | 1584 | ext4_group_t flex_group_count; |
1445 | ext4_group_t flex_group; | 1585 | ext4_group_t flex_group; |
1446 | int groups_per_flex = 0; | 1586 | int groups_per_flex = 0; |
1447 | __u64 block_bitmap = 0; | ||
1448 | int i; | 1587 | int i; |
1449 | 1588 | ||
1450 | if (!sbi->s_es->s_log_groups_per_flex) { | 1589 | if (!sbi->s_es->s_log_groups_per_flex) { |
@@ -1463,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1463 | sizeof(struct flex_groups), GFP_KERNEL); | 1602 | sizeof(struct flex_groups), GFP_KERNEL); |
1464 | if (sbi->s_flex_groups == NULL) { | 1603 | if (sbi->s_flex_groups == NULL) { |
1465 | printk(KERN_ERR "EXT4-fs: not enough memory for " | 1604 | printk(KERN_ERR "EXT4-fs: not enough memory for " |
1466 | "%lu flex groups\n", flex_group_count); | 1605 | "%u flex groups\n", flex_group_count); |
1467 | goto failed; | 1606 | goto failed; |
1468 | } | 1607 | } |
1469 | 1608 | ||
1470 | gdp = ext4_get_group_desc(sb, 1, &bh); | ||
1471 | block_bitmap = ext4_block_bitmap(sb, gdp) - 1; | ||
1472 | |||
1473 | for (i = 0; i < sbi->s_groups_count; i++) { | 1609 | for (i = 0; i < sbi->s_groups_count; i++) { |
1474 | gdp = ext4_get_group_desc(sb, i, &bh); | 1610 | gdp = ext4_get_group_desc(sb, i, &bh); |
1475 | 1611 | ||
1476 | flex_group = ext4_flex_group(sbi, i); | 1612 | flex_group = ext4_flex_group(sbi, i); |
1477 | sbi->s_flex_groups[flex_group].free_inodes += | 1613 | sbi->s_flex_groups[flex_group].free_inodes += |
1478 | le16_to_cpu(gdp->bg_free_inodes_count); | 1614 | ext4_free_inodes_count(sb, gdp); |
1479 | sbi->s_flex_groups[flex_group].free_blocks += | 1615 | sbi->s_flex_groups[flex_group].free_blocks += |
1480 | le16_to_cpu(gdp->bg_free_blocks_count); | 1616 | ext4_free_blks_count(sb, gdp); |
1481 | } | 1617 | } |
1482 | 1618 | ||
1483 | return 1; | 1619 | return 1; |
@@ -1551,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb) | |||
1551 | block_bitmap = ext4_block_bitmap(sb, gdp); | 1687 | block_bitmap = ext4_block_bitmap(sb, gdp); |
1552 | if (block_bitmap < first_block || block_bitmap > last_block) { | 1688 | if (block_bitmap < first_block || block_bitmap > last_block) { |
1553 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1689 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
1554 | "Block bitmap for group %lu not in group " | 1690 | "Block bitmap for group %u not in group " |
1555 | "(block %llu)!\n", i, block_bitmap); | 1691 | "(block %llu)!\n", i, block_bitmap); |
1556 | return 0; | 1692 | return 0; |
1557 | } | 1693 | } |
1558 | inode_bitmap = ext4_inode_bitmap(sb, gdp); | 1694 | inode_bitmap = ext4_inode_bitmap(sb, gdp); |
1559 | if (inode_bitmap < first_block || inode_bitmap > last_block) { | 1695 | if (inode_bitmap < first_block || inode_bitmap > last_block) { |
1560 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1696 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
1561 | "Inode bitmap for group %lu not in group " | 1697 | "Inode bitmap for group %u not in group " |
1562 | "(block %llu)!\n", i, inode_bitmap); | 1698 | "(block %llu)!\n", i, inode_bitmap); |
1563 | return 0; | 1699 | return 0; |
1564 | } | 1700 | } |
@@ -1566,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb) | |||
1566 | if (inode_table < first_block || | 1702 | if (inode_table < first_block || |
1567 | inode_table + sbi->s_itb_per_group - 1 > last_block) { | 1703 | inode_table + sbi->s_itb_per_group - 1 > last_block) { |
1568 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1704 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
1569 | "Inode table for group %lu not in group " | 1705 | "Inode table for group %u not in group " |
1570 | "(block %llu)!\n", i, inode_table); | 1706 | "(block %llu)!\n", i, inode_table); |
1571 | return 0; | 1707 | return 0; |
1572 | } | 1708 | } |
1573 | spin_lock(sb_bgl_lock(sbi, i)); | 1709 | spin_lock(sb_bgl_lock(sbi, i)); |
1574 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { | 1710 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { |
1575 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1711 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
1576 | "Checksum for group %lu failed (%u!=%u)\n", | 1712 | "Checksum for group %u failed (%u!=%u)\n", |
1577 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, | 1713 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, |
1578 | gdp)), le16_to_cpu(gdp->bg_checksum)); | 1714 | gdp)), le16_to_cpu(gdp->bg_checksum)); |
1579 | if (!(sb->s_flags & MS_RDONLY)) { | 1715 | if (!(sb->s_flags & MS_RDONLY)) { |
@@ -1865,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
1865 | ext4_fsblk_t sb_block = get_sb_block(&data); | 2001 | ext4_fsblk_t sb_block = get_sb_block(&data); |
1866 | ext4_fsblk_t logical_sb_block; | 2002 | ext4_fsblk_t logical_sb_block; |
1867 | unsigned long offset = 0; | 2003 | unsigned long offset = 0; |
1868 | unsigned int journal_inum = 0; | ||
1869 | unsigned long journal_devnum = 0; | 2004 | unsigned long journal_devnum = 0; |
1870 | unsigned long def_mount_opts; | 2005 | unsigned long def_mount_opts; |
1871 | struct inode *root; | 2006 | struct inode *root; |
1872 | char *cp; | 2007 | char *cp; |
2008 | const char *descr; | ||
1873 | int ret = -EINVAL; | 2009 | int ret = -EINVAL; |
1874 | int blocksize; | 2010 | int blocksize; |
1875 | int db_count; | 2011 | unsigned int db_count; |
1876 | int i; | 2012 | unsigned int i; |
1877 | int needs_recovery, has_huge_files; | 2013 | int needs_recovery, has_huge_files; |
1878 | __le32 features; | 2014 | int features; |
1879 | __u64 blocks_count; | 2015 | __u64 blocks_count; |
1880 | int err; | 2016 | int err; |
2017 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; | ||
1881 | 2018 | ||
1882 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 2019 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
1883 | if (!sbi) | 2020 | if (!sbi) |
@@ -1958,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
1958 | 2095 | ||
1959 | sbi->s_resuid = le16_to_cpu(es->s_def_resuid); | 2096 | sbi->s_resuid = le16_to_cpu(es->s_def_resuid); |
1960 | sbi->s_resgid = le16_to_cpu(es->s_def_resgid); | 2097 | sbi->s_resgid = le16_to_cpu(es->s_def_resgid); |
2098 | sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; | ||
2099 | sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; | ||
2100 | sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; | ||
1961 | 2101 | ||
1962 | set_opt(sbi->s_mount_opt, RESERVATION); | 2102 | set_opt(sbi->s_mount_opt, RESERVATION); |
1963 | set_opt(sbi->s_mount_opt, BARRIER); | 2103 | set_opt(sbi->s_mount_opt, BARRIER); |
1964 | 2104 | ||
1965 | /* | 2105 | /* |
1966 | * turn on extents feature by default in ext4 filesystem | ||
1967 | * only if feature flag already set by mkfs or tune2fs. | ||
1968 | * Use -o noextents to turn it off | ||
1969 | */ | ||
1970 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) | ||
1971 | set_opt(sbi->s_mount_opt, EXTENTS); | ||
1972 | else | ||
1973 | ext4_warning(sb, __func__, | ||
1974 | "extents feature not enabled on this filesystem, " | ||
1975 | "use tune2fs.\n"); | ||
1976 | |||
1977 | /* | ||
1978 | * enable delayed allocation by default | 2106 | * enable delayed allocation by default |
1979 | * Use -o nodelalloc to turn it off | 2107 | * Use -o nodelalloc to turn it off |
1980 | */ | 2108 | */ |
1981 | set_opt(sbi->s_mount_opt, DELALLOC); | 2109 | set_opt(sbi->s_mount_opt, DELALLOC); |
1982 | 2110 | ||
1983 | 2111 | ||
1984 | if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, | 2112 | if (!parse_options((char *) data, sb, &journal_devnum, |
1985 | NULL, 0)) | 2113 | &journal_ioprio, NULL, 0)) |
1986 | goto failed_mount; | 2114 | goto failed_mount; |
1987 | 2115 | ||
1988 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | 2116 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | |
@@ -2004,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2004 | features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); | 2132 | features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); |
2005 | if (features) { | 2133 | if (features) { |
2006 | printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " | 2134 | printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " |
2007 | "unsupported optional features (%x).\n", | 2135 | "unsupported optional features (%x).\n", sb->s_id, |
2008 | sb->s_id, le32_to_cpu(features)); | 2136 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & |
2137 | ~EXT4_FEATURE_INCOMPAT_SUPP)); | ||
2009 | goto failed_mount; | 2138 | goto failed_mount; |
2010 | } | 2139 | } |
2011 | features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); | 2140 | features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); |
2012 | if (!(sb->s_flags & MS_RDONLY) && features) { | 2141 | if (!(sb->s_flags & MS_RDONLY) && features) { |
2013 | printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " | 2142 | printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " |
2014 | "unsupported optional features (%x).\n", | 2143 | "unsupported optional features (%x).\n", sb->s_id, |
2015 | sb->s_id, le32_to_cpu(features)); | 2144 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & |
2145 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
2016 | goto failed_mount; | 2146 | goto failed_mount; |
2017 | } | 2147 | } |
2018 | has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, | 2148 | has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
@@ -2117,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2117 | for (i = 0; i < 4; i++) | 2247 | for (i = 0; i < 4; i++) |
2118 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | 2248 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); |
2119 | sbi->s_def_hash_version = es->s_def_hash_version; | 2249 | sbi->s_def_hash_version = es->s_def_hash_version; |
2250 | i = le32_to_cpu(es->s_flags); | ||
2251 | if (i & EXT2_FLAGS_UNSIGNED_HASH) | ||
2252 | sbi->s_hash_unsigned = 3; | ||
2253 | else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { | ||
2254 | #ifdef __CHAR_UNSIGNED__ | ||
2255 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); | ||
2256 | sbi->s_hash_unsigned = 3; | ||
2257 | #else | ||
2258 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); | ||
2259 | #endif | ||
2260 | sb->s_dirt = 1; | ||
2261 | } | ||
2120 | 2262 | ||
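The EXT2_FLAGS_{SIGNED,UNSIGNED}_HASH handling above exists because the legacy htree hash folded directory-name bytes through plain char, whose signedness is architecture-defined, so the same directory could hash differently on x86 and, say, ARM or PowerPC. The superblock flag records which variant built the on-disk tree; for unflagged filesystems the native convention is recorded and s_hash_unsigned set accordingly. A toy demonstration of the divergence (not the real hash function):

    #include <stdio.h>

    int main(void)
    {
            char byte = '\xe9';	/* e.g. a Latin-1 byte in a filename */

            /* Folding into an int sign-extends in one case only. */
            printf("signed: %d  unsigned: %d\n",
                   (int)(signed char)byte, (int)(unsigned char)byte);
            return 0;	/* prints "signed: -23  unsigned: 233" */
    }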
2121 | if (sbi->s_blocks_per_group > blocksize * 8) { | 2263 | if (sbi->s_blocks_per_group > blocksize * 8) { |
2122 | printk(KERN_ERR | 2264 | printk(KERN_ERR |
@@ -2144,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2144 | if (EXT4_BLOCKS_PER_GROUP(sb) == 0) | 2286 | if (EXT4_BLOCKS_PER_GROUP(sb) == 0) |
2145 | goto cantfind_ext4; | 2287 | goto cantfind_ext4; |
2146 | 2288 | ||
2147 | /* ensure blocks_count calculation below doesn't sign-extend */ | 2289 | /* |
2148 | if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < | 2290 | * It makes no sense for the first data block to be beyond the end |
2149 | le32_to_cpu(es->s_first_data_block) + 1) { | 2291 | * of the filesystem. |
2150 | printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " | 2292 | */ |
2151 | "first data block %u, blocks per group %lu\n", | 2293 | if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { |
2152 | ext4_blocks_count(es), | 2294 | printk(KERN_WARNING "EXT4-fs: bad geometry: first data " |
2153 | le32_to_cpu(es->s_first_data_block), | 2295 | "block %u is beyond end of filesystem (%llu)\n", |
2154 | EXT4_BLOCKS_PER_GROUP(sb)); | 2296 | le32_to_cpu(es->s_first_data_block), |
2297 | ext4_blocks_count(es)); | ||
2155 | goto failed_mount; | 2298 | goto failed_mount; |
2156 | } | 2299 | } |
2157 | blocks_count = (ext4_blocks_count(es) - | 2300 | blocks_count = (ext4_blocks_count(es) - |
2158 | le32_to_cpu(es->s_first_data_block) + | 2301 | le32_to_cpu(es->s_first_data_block) + |
2159 | EXT4_BLOCKS_PER_GROUP(sb) - 1); | 2302 | EXT4_BLOCKS_PER_GROUP(sb) - 1); |
2160 | do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); | 2303 | do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); |
2304 | if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { | ||
2305 | printk(KERN_WARNING "EXT4-fs: groups count too large: %u " | ||
2306 | "(block count %llu, first data block %u, " | ||
2307 | "blocks per group %lu)\n", sbi->s_groups_count, | ||
2308 | ext4_blocks_count(es), | ||
2309 | le32_to_cpu(es->s_first_data_block), | ||
2310 | EXT4_BLOCKS_PER_GROUP(sb)); | ||
2311 | goto failed_mount; | ||
2312 | } | ||
2161 | sbi->s_groups_count = blocks_count; | 2313 | sbi->s_groups_count = blocks_count; |
2162 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | 2314 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
2163 | EXT4_DESC_PER_BLOCK(sb); | 2315 | EXT4_DESC_PER_BLOCK(sb); |
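The new cap protects s_groups_count, a 32-bit field: the group count is ceil((block_count - first_data_block) / blocks_per_group) and must stay below 2^32, less one descriptor block's worth of slack for the db_count rounding just below. A worked sketch with typical 4 KiB-block geometry (numbers illustrative):

    #include <stdint.h>

    static uint64_t nr_groups(uint64_t block_count, uint32_t first_data_block,
                              uint32_t blocks_per_group)
    {
            return (block_count - first_data_block + blocks_per_group - 1) /
                   blocks_per_group;
    }

    /* 4 KiB blocks give 32768 blocks per group, so ~2^32 groups would
     * mean ~2^47 blocks -- roughly a 512 PiB filesystem. */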
@@ -2269,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2269 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 2421 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
2270 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 2422 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
2271 | ext4_commit_super(sb, es, 1); | 2423 | ext4_commit_super(sb, es, 1); |
2272 | printk(KERN_CRIT | ||
2273 | "EXT4-fs (device %s): mount failed\n", | ||
2274 | sb->s_id); | ||
2275 | goto failed_mount4; | 2424 | goto failed_mount4; |
2276 | } | 2425 | } |
2277 | } | 2426 | } |
2278 | } else if (journal_inum) { | 2427 | } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && |
2279 | if (ext4_create_journal(sb, es, journal_inum)) | 2428 | EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { |
2280 | goto failed_mount3; | 2429 | printk(KERN_ERR "EXT4-fs: required journal recovery " |
2430 | "suppressed and not mounted read-only\n"); | ||
2431 | goto failed_mount4; | ||
2281 | } else { | 2432 | } else { |
2282 | if (!silent) | 2433 | clear_opt(sbi->s_mount_opt, DATA_FLAGS); |
2283 | printk(KERN_ERR | 2434 | set_opt(sbi->s_mount_opt, WRITEBACK_DATA); |
2284 | "ext4: No journal on filesystem on %s\n", | 2435 | sbi->s_journal = NULL; |
2285 | sb->s_id); | 2436 | needs_recovery = 0; |
2286 | goto failed_mount3; | 2437 | goto no_journal; |
2287 | } | 2438 | } |
2288 | 2439 | ||
2289 | if (ext4_blocks_count(es) > 0xffffffffULL && | 2440 | if (ext4_blocks_count(es) > 0xffffffffULL && |
2290 | !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, | 2441 | !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, |
2291 | JBD2_FEATURE_INCOMPAT_64BIT)) { | 2442 | JBD2_FEATURE_INCOMPAT_64BIT)) { |
2292 | printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); | 2443 | printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); |
2293 | goto failed_mount4; | 2444 | goto failed_mount4; |
2294 | } | 2445 | } |
2295 | 2446 | ||
@@ -2334,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2334 | default: | 2485 | default: |
2335 | break; | 2486 | break; |
2336 | } | 2487 | } |
2488 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); | ||
2489 | |||
2490 | no_journal: | ||
2337 | 2491 | ||
2338 | if (test_opt(sb, NOBH)) { | 2492 | if (test_opt(sb, NOBH)) { |
2339 | if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { | 2493 | if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { |
@@ -2419,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2419 | EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; | 2573 | EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; |
2420 | ext4_orphan_cleanup(sb, es); | 2574 | ext4_orphan_cleanup(sb, es); |
2421 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; | 2575 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; |
2422 | if (needs_recovery) | 2576 | if (needs_recovery) { |
2423 | printk(KERN_INFO "EXT4-fs: recovery complete.\n"); | 2577 | printk(KERN_INFO "EXT4-fs: recovery complete.\n"); |
2424 | ext4_mark_recovery_complete(sb, es); | 2578 | ext4_mark_recovery_complete(sb, es); |
2425 | printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", | 2579 | } |
2426 | test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": | 2580 | if (EXT4_SB(sb)->s_journal) { |
2427 | test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": | 2581 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) |
2428 | "writeback"); | 2582 | descr = " journalled data mode"; |
2583 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | ||
2584 | descr = " ordered data mode"; | ||
2585 | else | ||
2586 | descr = " writeback data mode"; | ||
2587 | } else | ||
2588 | descr = "out journal"; | ||
2589 | |||
2590 | printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", | ||
2591 | sb->s_id, descr); | ||
2429 | 2592 | ||
2430 | lock_kernel(); | 2593 | lock_kernel(); |
2431 | return 0; | 2594 | return 0; |
@@ -2437,8 +2600,11 @@ cantfind_ext4: | |||
2437 | goto failed_mount; | 2600 | goto failed_mount; |
2438 | 2601 | ||
2439 | failed_mount4: | 2602 | failed_mount4: |
2440 | jbd2_journal_destroy(sbi->s_journal); | 2603 | printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); |
2441 | sbi->s_journal = NULL; | 2604 | if (sbi->s_journal) { |
2605 | jbd2_journal_destroy(sbi->s_journal); | ||
2606 | sbi->s_journal = NULL; | ||
2607 | } | ||
2442 | failed_mount3: | 2608 | failed_mount3: |
2443 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 2609 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
2444 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 2610 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
@@ -2475,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) | |||
2475 | { | 2641 | { |
2476 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2642 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2477 | 2643 | ||
2478 | if (sbi->s_commit_interval) | 2644 | journal->j_commit_interval = sbi->s_commit_interval; |
2479 | journal->j_commit_interval = sbi->s_commit_interval; | 2645 | journal->j_min_batch_time = sbi->s_min_batch_time; |
2480 | /* We could also set up an ext4-specific default for the commit | 2646 | journal->j_max_batch_time = sbi->s_max_batch_time; |
2481 | * interval here, but for now we'll just fall back to the jbd | ||
2482 | * default. */ | ||
2483 | 2647 | ||
2484 | spin_lock(&journal->j_state_lock); | 2648 | spin_lock(&journal->j_state_lock); |
2485 | if (test_opt(sb, BARRIER)) | 2649 | if (test_opt(sb, BARRIER)) |
@@ -2499,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb, | |||
2499 | struct inode *journal_inode; | 2663 | struct inode *journal_inode; |
2500 | journal_t *journal; | 2664 | journal_t *journal; |
2501 | 2665 | ||
2666 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | ||
2667 | |||
2502 | /* First, test for the existence of a valid inode on disk. Bad | 2668 | /* First, test for the existence of a valid inode on disk. Bad |
2503 | * things happen if we iget() an unused inode, as the subsequent | 2669 | * things happen if we iget() an unused inode, as the subsequent |
2504 | * iput() will try to delete it. */ | 2670 | * iput() will try to delete it. */ |
@@ -2547,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, | |||
2547 | struct ext4_super_block *es; | 2713 | struct ext4_super_block *es; |
2548 | struct block_device *bdev; | 2714 | struct block_device *bdev; |
2549 | 2715 | ||
2716 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | ||
2717 | |||
2550 | bdev = ext4_blkdev_get(j_dev); | 2718 | bdev = ext4_blkdev_get(j_dev); |
2551 | if (bdev == NULL) | 2719 | if (bdev == NULL) |
2552 | return NULL; | 2720 | return NULL; |
2553 | 2721 | ||
2554 | if (bd_claim(bdev, sb)) { | 2722 | if (bd_claim(bdev, sb)) { |
2555 | printk(KERN_ERR | 2723 | printk(KERN_ERR |
2556 | "EXT4: failed to claim external journal device.\n"); | 2724 | "EXT4-fs: failed to claim external journal device.\n"); |
2557 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); | 2725 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); |
2558 | return NULL; | 2726 | return NULL; |
2559 | } | 2727 | } |
@@ -2634,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb, | |||
2634 | int err = 0; | 2802 | int err = 0; |
2635 | int really_read_only; | 2803 | int really_read_only; |
2636 | 2804 | ||
2805 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | ||
2806 | |||
2637 | if (journal_devnum && | 2807 | if (journal_devnum && |
2638 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { | 2808 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { |
2639 | printk(KERN_INFO "EXT4-fs: external journal device major/minor " | 2809 | printk(KERN_INFO "EXT4-fs: external journal device major/minor " |
@@ -2718,55 +2888,14 @@ static int ext4_load_journal(struct super_block *sb, | |||
2718 | return 0; | 2888 | return 0; |
2719 | } | 2889 | } |
2720 | 2890 | ||
2721 | static int ext4_create_journal(struct super_block *sb, | 2891 | static int ext4_commit_super(struct super_block *sb, |
2722 | struct ext4_super_block *es, | ||
2723 | unsigned int journal_inum) | ||
2724 | { | ||
2725 | journal_t *journal; | ||
2726 | int err; | ||
2727 | |||
2728 | if (sb->s_flags & MS_RDONLY) { | ||
2729 | printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " | ||
2730 | "create journal.\n"); | ||
2731 | return -EROFS; | ||
2732 | } | ||
2733 | |||
2734 | journal = ext4_get_journal(sb, journal_inum); | ||
2735 | if (!journal) | ||
2736 | return -EINVAL; | ||
2737 | |||
2738 | printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", | ||
2739 | journal_inum); | ||
2740 | |||
2741 | err = jbd2_journal_create(journal); | ||
2742 | if (err) { | ||
2743 | printk(KERN_ERR "EXT4-fs: error creating journal.\n"); | ||
2744 | jbd2_journal_destroy(journal); | ||
2745 | return -EIO; | ||
2746 | } | ||
2747 | |||
2748 | EXT4_SB(sb)->s_journal = journal; | ||
2749 | |||
2750 | ext4_update_dynamic_rev(sb); | ||
2751 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | ||
2752 | EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); | ||
2753 | |||
2754 | es->s_journal_inum = cpu_to_le32(journal_inum); | ||
2755 | sb->s_dirt = 1; | ||
2756 | |||
2757 | /* Make sure we flush the recovery flag to disk. */ | ||
2758 | ext4_commit_super(sb, es, 1); | ||
2759 | |||
2760 | return 0; | ||
2761 | } | ||
2762 | |||
2763 | static void ext4_commit_super(struct super_block *sb, | ||
2764 | struct ext4_super_block *es, int sync) | 2892 | struct ext4_super_block *es, int sync) |
2765 | { | 2893 | { |
2766 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; | 2894 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; |
2895 | int error = 0; | ||
2767 | 2896 | ||
2768 | if (!sbh) | 2897 | if (!sbh) |
2769 | return; | 2898 | return error; |
2770 | if (buffer_write_io_error(sbh)) { | 2899 | if (buffer_write_io_error(sbh)) { |
2771 | /* | 2900 | /* |
2772 | * Oh, dear. A previous attempt to write the | 2901 | * Oh, dear. A previous attempt to write the |
@@ -2776,25 +2905,33 @@ static void ext4_commit_super(struct super_block *sb, | |||
2776 | * be remapped. Nothing we can do but to retry the | 2905 | * be remapped. Nothing we can do but to retry the |
2777 | * write and hope for the best. | 2906 | * write and hope for the best. |
2778 | */ | 2907 | */ |
2779 | printk(KERN_ERR "ext4: previous I/O error to " | 2908 | printk(KERN_ERR "EXT4-fs: previous I/O error to " |
2780 | "superblock detected for %s.\n", sb->s_id); | 2909 | "superblock detected for %s.\n", sb->s_id); |
2781 | clear_buffer_write_io_error(sbh); | 2910 | clear_buffer_write_io_error(sbh); |
2782 | set_buffer_uptodate(sbh); | 2911 | set_buffer_uptodate(sbh); |
2783 | } | 2912 | } |
2784 | es->s_wtime = cpu_to_le32(get_seconds()); | 2913 | es->s_wtime = cpu_to_le32(get_seconds()); |
2785 | ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); | 2914 | ext4_free_blocks_count_set(es, percpu_counter_sum_positive( |
2786 | es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); | 2915 | &EXT4_SB(sb)->s_freeblocks_counter)); |
2916 | es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( | ||
2917 | &EXT4_SB(sb)->s_freeinodes_counter)); | ||
2918 | |||
2787 | BUFFER_TRACE(sbh, "marking dirty"); | 2919 | BUFFER_TRACE(sbh, "marking dirty"); |
2788 | mark_buffer_dirty(sbh); | 2920 | mark_buffer_dirty(sbh); |
2789 | if (sync) { | 2921 | if (sync) { |
2790 | sync_dirty_buffer(sbh); | 2922 | error = sync_dirty_buffer(sbh); |
2791 | if (buffer_write_io_error(sbh)) { | 2923 | if (error) |
2792 | printk(KERN_ERR "ext4: I/O error while writing " | 2924 | return error; |
2925 | |||
2926 | error = buffer_write_io_error(sbh); | ||
2927 | if (error) { | ||
2928 | printk(KERN_ERR "EXT4-fs: I/O error while writing " | ||
2793 | "superblock for %s.\n", sb->s_id); | 2929 | "superblock for %s.\n", sb->s_id); |
2794 | clear_buffer_write_io_error(sbh); | 2930 | clear_buffer_write_io_error(sbh); |
2795 | set_buffer_uptodate(sbh); | 2931 | set_buffer_uptodate(sbh); |
2796 | } | 2932 | } |
2797 | } | 2933 | } |
2934 | return error; | ||
2798 | } | 2935 | } |
2799 | 2936 | ||
2800 | 2937 | ||
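Two things change in ext4_commit_super(): it now propagates write errors to its callers (the freeze path below needs that), and it fills the free block/inode counts from the percpu counters ext4 already maintains, rather than rescanning every group descriptor on each superblock write. Schematically, the cost difference:

    /* Old: O(groups) walk over every group descriptor. */
    ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));

    /* New: O(nr_cpus) fold of the per-CPU counter deltas. */
    ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                    &EXT4_SB(sb)->s_freeblocks_counter));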
@@ -2808,6 +2945,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb, | |||
2808 | { | 2945 | { |
2809 | journal_t *journal = EXT4_SB(sb)->s_journal; | 2946 | journal_t *journal = EXT4_SB(sb)->s_journal; |
2810 | 2947 | ||
2948 | if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { | ||
2949 | BUG_ON(journal != NULL); | ||
2950 | return; | ||
2951 | } | ||
2811 | jbd2_journal_lock_updates(journal); | 2952 | jbd2_journal_lock_updates(journal); |
2812 | if (jbd2_journal_flush(journal) < 0) | 2953 | if (jbd2_journal_flush(journal) < 0) |
2813 | goto out; | 2954 | goto out; |
@@ -2837,6 +2978,8 @@ static void ext4_clear_journal_err(struct super_block *sb, | |||
2837 | int j_errno; | 2978 | int j_errno; |
2838 | const char *errstr; | 2979 | const char *errstr; |
2839 | 2980 | ||
2981 | BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); | ||
2982 | |||
2840 | journal = EXT4_SB(sb)->s_journal; | 2983 | journal = EXT4_SB(sb)->s_journal; |
2841 | 2984 | ||
2842 | /* | 2985 | /* |
@@ -2869,14 +3012,17 @@ static void ext4_clear_journal_err(struct super_block *sb, | |||
2869 | int ext4_force_commit(struct super_block *sb) | 3012 | int ext4_force_commit(struct super_block *sb) |
2870 | { | 3013 | { |
2871 | journal_t *journal; | 3014 | journal_t *journal; |
2872 | int ret; | 3015 | int ret = 0; |
2873 | 3016 | ||
2874 | if (sb->s_flags & MS_RDONLY) | 3017 | if (sb->s_flags & MS_RDONLY) |
2875 | return 0; | 3018 | return 0; |
2876 | 3019 | ||
2877 | journal = EXT4_SB(sb)->s_journal; | 3020 | journal = EXT4_SB(sb)->s_journal; |
2878 | sb->s_dirt = 0; | 3021 | if (journal) { |
2879 | ret = ext4_journal_force_commit(journal); | 3022 | sb->s_dirt = 0; |
3023 | ret = ext4_journal_force_commit(journal); | ||
3024 | } | ||
3025 | |||
2880 | return ret; | 3026 | return ret; |
2881 | } | 3027 | } |
2882 | 3028 | ||
@@ -2888,9 +3034,13 @@ int ext4_force_commit(struct super_block *sb) | |||
2888 | */ | 3034 | */ |
2889 | static void ext4_write_super(struct super_block *sb) | 3035 | static void ext4_write_super(struct super_block *sb) |
2890 | { | 3036 | { |
2891 | if (mutex_trylock(&sb->s_lock) != 0) | 3037 | if (EXT4_SB(sb)->s_journal) { |
2892 | BUG(); | 3038 | if (mutex_trylock(&sb->s_lock) != 0) |
2893 | sb->s_dirt = 0; | 3039 | BUG(); |
3040 | sb->s_dirt = 0; | ||
3041 | } else { | ||
3042 | ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); | ||
3043 | } | ||
2894 | } | 3044 | } |
2895 | 3045 | ||
2896 | static int ext4_sync_fs(struct super_block *sb, int wait) | 3046 | static int ext4_sync_fs(struct super_block *sb, int wait) |
@@ -2899,10 +3049,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
2899 | 3049 | ||
2900 | trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); | 3050 | trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); |
2901 | sb->s_dirt = 0; | 3051 | sb->s_dirt = 0; |
2902 | if (wait) | 3052 | if (EXT4_SB(sb)->s_journal) { |
2903 | ret = ext4_force_commit(sb); | 3053 | if (wait) |
2904 | else | 3054 | ret = ext4_force_commit(sb); |
2905 | jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); | 3055 | else |
3056 | jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); | ||
3057 | } else { | ||
3058 | ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); | ||
3059 | } | ||
2906 | return ret; | 3060 | return ret; |
2907 | } | 3061 | } |
2908 | 3062 | ||
@@ -2910,36 +3064,48 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
2910 | * LVM calls this function before a (read-only) snapshot is created. This | 3064 | * LVM calls this function before a (read-only) snapshot is created. This |
2911 | * gives us a chance to flush the journal completely and mark the fs clean. | 3065 | * gives us a chance to flush the journal completely and mark the fs clean. |
2912 | */ | 3066 | */ |
2913 | static void ext4_write_super_lockfs(struct super_block *sb) | 3067 | static int ext4_freeze(struct super_block *sb) |
2914 | { | 3068 | { |
3069 | int error = 0; | ||
3070 | journal_t *journal; | ||
2915 | sb->s_dirt = 0; | 3071 | sb->s_dirt = 0; |
2916 | 3072 | ||
2917 | if (!(sb->s_flags & MS_RDONLY)) { | 3073 | if (!(sb->s_flags & MS_RDONLY)) { |
2918 | journal_t *journal = EXT4_SB(sb)->s_journal; | 3074 | journal = EXT4_SB(sb)->s_journal; |
2919 | 3075 | ||
2920 | /* Now we set up the journal barrier. */ | 3076 | if (journal) { |
2921 | jbd2_journal_lock_updates(journal); | 3077 | /* Now we set up the journal barrier. */ |
3078 | jbd2_journal_lock_updates(journal); | ||
2922 | 3079 | ||
2923 | /* | 3080 | /* |
2924 | * We don't want to clear needs_recovery flag when we failed | 3081 | * We don't want to clear needs_recovery flag when we |
2925 | * to flush the journal. | 3082 | * failed to flush the journal. |
2926 | */ | 3083 | */ |
2927 | if (jbd2_journal_flush(journal) < 0) | 3084 | error = jbd2_journal_flush(journal); |
2928 | return; | 3085 | if (error < 0) |
3086 | goto out; | ||
3087 | } | ||
2929 | 3088 | ||
2930 | /* Journal blocked and flushed, clear needs_recovery flag. */ | 3089 | /* Journal blocked and flushed, clear needs_recovery flag. */ |
2931 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 3090 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
2932 | ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); | 3091 | ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); |
3092 | error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); | ||
3093 | if (error) | ||
3094 | goto out; | ||
2933 | } | 3095 | } |
3096 | return 0; | ||
3097 | out: | ||
3098 | jbd2_journal_unlock_updates(journal); | ||
3099 | return error; | ||
2934 | } | 3100 | } |
2935 | 3101 | ||
2936 | /* | 3102 | /* |
2937 | * Called by LVM after the snapshot is done. We need to reset the RECOVER | 3103 | * Called by LVM after the snapshot is done. We need to reset the RECOVER |
2938 | * flag here, even though the filesystem is not technically dirty yet. | 3104 | * flag here, even though the filesystem is not technically dirty yet. |
2939 | */ | 3105 | */ |
2940 | static void ext4_unlockfs(struct super_block *sb) | 3106 | static int ext4_unfreeze(struct super_block *sb) |
2941 | { | 3107 | { |
2942 | if (!(sb->s_flags & MS_RDONLY)) { | 3108 | if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { |
2943 | lock_super(sb); | 3109 | lock_super(sb); |
2944 | /* Reset the needs_recovery flag before the fs is unlocked. */ | 3110 | /* Reset the needs_recovery flag before the fs is unlocked. */ |
2945 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 3111 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
@@ -2947,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb) | |||
2947 | unlock_super(sb); | 3113 | unlock_super(sb); |
2948 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | 3114 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); |
2949 | } | 3115 | } |
3116 | return 0; | ||
2950 | } | 3117 | } |
2951 | 3118 | ||
2952 | static int ext4_remount(struct super_block *sb, int *flags, char *data) | 3119 | static int ext4_remount(struct super_block *sb, int *flags, char *data) |
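Taken together, ext4_freeze()/ext4_unfreeze() implement the snapshot handshake that write_super_lockfs/unlockfs used to: quiesce the journal, flush it, clear the RECOVER incompat flag, and push the superblock out so the snapshot looks cleanly unmounted; unfreeze then restores the flag. In outline, with error handling elided:

    /* Freeze: make the device image look cleanly unmounted. */
    jbd2_journal_lock_updates(journal);	/* stop new handles */
    jbd2_journal_flush(journal);	/* drain everything to disk */
    EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
    ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);

    /* ...LVM takes the snapshot...  Unfreeze: resume journaling. */
    EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
    ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
    jbd2_journal_unlock_updates(journal);

With no journal there is nothing to quiesce, so both paths reduce to at most a superblock commit.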
@@ -2957,6 +3124,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
2957 | unsigned long old_sb_flags; | 3124 | unsigned long old_sb_flags; |
2958 | struct ext4_mount_options old_opts; | 3125 | struct ext4_mount_options old_opts; |
2959 | ext4_group_t g; | 3126 | ext4_group_t g; |
3127 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; | ||
2960 | int err; | 3128 | int err; |
2961 | #ifdef CONFIG_QUOTA | 3129 | #ifdef CONFIG_QUOTA |
2962 | int i; | 3130 | int i; |
@@ -2968,16 +3136,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
2968 | old_opts.s_resuid = sbi->s_resuid; | 3136 | old_opts.s_resuid = sbi->s_resuid; |
2969 | old_opts.s_resgid = sbi->s_resgid; | 3137 | old_opts.s_resgid = sbi->s_resgid; |
2970 | old_opts.s_commit_interval = sbi->s_commit_interval; | 3138 | old_opts.s_commit_interval = sbi->s_commit_interval; |
3139 | old_opts.s_min_batch_time = sbi->s_min_batch_time; | ||
3140 | old_opts.s_max_batch_time = sbi->s_max_batch_time; | ||
2971 | #ifdef CONFIG_QUOTA | 3141 | #ifdef CONFIG_QUOTA |
2972 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; | 3142 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; |
2973 | for (i = 0; i < MAXQUOTAS; i++) | 3143 | for (i = 0; i < MAXQUOTAS; i++) |
2974 | old_opts.s_qf_names[i] = sbi->s_qf_names[i]; | 3144 | old_opts.s_qf_names[i] = sbi->s_qf_names[i]; |
2975 | #endif | 3145 | #endif |
3146 | if (sbi->s_journal && sbi->s_journal->j_task->io_context) | ||
3147 | journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; | ||
2976 | 3148 | ||
2977 | /* | 3149 | /* |
2978 | * Allow the "check" option to be passed as a remount option. | 3150 | * Allow the "check" option to be passed as a remount option. |
2979 | */ | 3151 | */ |
2980 | if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { | 3152 | if (!parse_options(data, sb, NULL, &journal_ioprio, |
3153 | &n_blocks_count, 1)) { | ||
2981 | err = -EINVAL; | 3154 | err = -EINVAL; |
2982 | goto restore_opts; | 3155 | goto restore_opts; |
2983 | } | 3156 | } |
@@ -2990,7 +3163,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
2990 | 3163 | ||
2991 | es = sbi->s_es; | 3164 | es = sbi->s_es; |
2992 | 3165 | ||
2993 | ext4_init_journal_params(sb, sbi->s_journal); | 3166 | if (sbi->s_journal) { |
3167 | ext4_init_journal_params(sb, sbi->s_journal); | ||
3168 | set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); | ||
3169 | } | ||
2994 | 3170 | ||
2995 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || | 3171 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || |
2996 | n_blocks_count > ext4_blocks_count(es)) { | 3172 | n_blocks_count > ext4_blocks_count(es)) { |
@@ -3019,17 +3195,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
3019 | * We have to unlock super so that we can wait for | 3195 | * We have to unlock super so that we can wait for |
3020 | * transactions. | 3196 | * transactions. |
3021 | */ | 3197 | */ |
3022 | unlock_super(sb); | 3198 | if (sbi->s_journal) { |
3023 | ext4_mark_recovery_complete(sb, es); | 3199 | unlock_super(sb); |
3024 | lock_super(sb); | 3200 | ext4_mark_recovery_complete(sb, es); |
3201 | lock_super(sb); | ||
3202 | } | ||
3025 | } else { | 3203 | } else { |
3026 | __le32 ret; | 3204 | int ret; |
3027 | if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3205 | if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
3028 | ~EXT4_FEATURE_RO_COMPAT_SUPP))) { | 3206 | ~EXT4_FEATURE_RO_COMPAT_SUPP))) { |
3029 | printk(KERN_WARNING "EXT4-fs: %s: couldn't " | 3207 | printk(KERN_WARNING "EXT4-fs: %s: couldn't " |
3030 | "remount RDWR because of unsupported " | 3208 | "remount RDWR because of unsupported " |
3031 | "optional features (%x).\n", | 3209 | "optional features (%x).\n", sb->s_id, |
3032 | sb->s_id, le32_to_cpu(ret)); | 3210 | (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & |
3211 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
3033 | err = -EROFS; | 3212 | err = -EROFS; |
3034 | goto restore_opts; | 3213 | goto restore_opts; |
3035 | } | 3214 | } |
@@ -3046,7 +3225,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
3046 | if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { | 3225 | if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { |
3047 | printk(KERN_ERR | 3226 | printk(KERN_ERR |
3048 | "EXT4-fs: ext4_remount: " | 3227 | "EXT4-fs: ext4_remount: " |
3049 | "Checksum for group %lu failed (%u!=%u)\n", | 3228 | "Checksum for group %u failed (%u!=%u)\n", |
3050 | g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), | 3229 | g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), |
3051 | le16_to_cpu(gdp->bg_checksum)); | 3230 | le16_to_cpu(gdp->bg_checksum)); |
3052 | err = -EINVAL; | 3231 | err = -EINVAL; |
@@ -3075,7 +3254,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
3075 | * been changed by e2fsck since we originally mounted | 3254 | * been changed by e2fsck since we originally mounted |
3076 | * the partition.) | 3255 | * the partition.) |
3077 | */ | 3256 | */ |
3078 | ext4_clear_journal_err(sb, es); | 3257 | if (sbi->s_journal) |
3258 | ext4_clear_journal_err(sb, es); | ||
3079 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 3259 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
3080 | if ((err = ext4_group_extend(sb, es, n_blocks_count))) | 3260 | if ((err = ext4_group_extend(sb, es, n_blocks_count))) |
3081 | goto restore_opts; | 3261 | goto restore_opts; |
@@ -3083,6 +3263,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
3083 | sb->s_flags &= ~MS_RDONLY; | 3263 | sb->s_flags &= ~MS_RDONLY; |
3084 | } | 3264 | } |
3085 | } | 3265 | } |
3266 | if (sbi->s_journal == NULL) | ||
3267 | ext4_commit_super(sb, es, 1); | ||
3268 | |||
3086 | #ifdef CONFIG_QUOTA | 3269 | #ifdef CONFIG_QUOTA |
3087 | /* Release old quota file names */ | 3270 | /* Release old quota file names */ |
3088 | for (i = 0; i < MAXQUOTAS; i++) | 3271 | for (i = 0; i < MAXQUOTAS; i++) |
@@ -3097,6 +3280,8 @@ restore_opts: | |||
3097 | sbi->s_resuid = old_opts.s_resuid; | 3280 | sbi->s_resuid = old_opts.s_resuid; |
3098 | sbi->s_resgid = old_opts.s_resgid; | 3281 | sbi->s_resgid = old_opts.s_resgid; |
3099 | sbi->s_commit_interval = old_opts.s_commit_interval; | 3282 | sbi->s_commit_interval = old_opts.s_commit_interval; |
3283 | sbi->s_min_batch_time = old_opts.s_min_batch_time; | ||
3284 | sbi->s_max_batch_time = old_opts.s_max_batch_time; | ||
3100 | #ifdef CONFIG_QUOTA | 3285 | #ifdef CONFIG_QUOTA |
3101 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; | 3286 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; |
3102 | for (i = 0; i < MAXQUOTAS; i++) { | 3287 | for (i = 0; i < MAXQUOTAS; i++) { |
@@ -3359,7 +3544,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, | |||
3359 | * When we journal data on quota file, we have to flush journal to see | 3544 | * When we journal data on quota file, we have to flush journal to see |
3360 | * all updates to the file when we bypass pagecache... | 3545 | * all updates to the file when we bypass pagecache... |
3361 | */ | 3546 | */ |
3362 | if (ext4_should_journal_data(path.dentry->d_inode)) { | 3547 | if (EXT4_SB(sb)->s_journal && |
3548 | ext4_should_journal_data(path.dentry->d_inode)) { | ||
3363 | /* | 3549 | /* |
3364 | * We don't need to lock updates but journal_flush() could | 3550 | * We don't need to lock updates but journal_flush() could |
3365 | * otherwise be livelocked... | 3551 | * otherwise be livelocked... |
@@ -3433,7 +3619,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
3433 | struct buffer_head *bh; | 3619 | struct buffer_head *bh; |
3434 | handle_t *handle = journal_current_handle(); | 3620 | handle_t *handle = journal_current_handle(); |
3435 | 3621 | ||
3436 | if (!handle) { | 3622 | if (EXT4_SB(sb)->s_journal && !handle) { |
3437 | printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" | 3623 | printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" |
3438 | " cancelled because transaction is not started.\n", | 3624 | " cancelled because transaction is not started.\n", |
3439 | (unsigned long long)off, (unsigned long long)len); | 3625 | (unsigned long long)off, (unsigned long long)len); |
@@ -3458,7 +3644,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
3458 | flush_dcache_page(bh->b_page); | 3644 | flush_dcache_page(bh->b_page); |
3459 | unlock_buffer(bh); | 3645 | unlock_buffer(bh); |
3460 | if (journal_quota) | 3646 | if (journal_quota) |
3461 | err = ext4_journal_dirty_metadata(handle, bh); | 3647 | err = ext4_handle_dirty_metadata(handle, NULL, bh); |
3462 | else { | 3648 | else { |
3463 | /* Always do at least ordered writes for quotas */ | 3649 | /* Always do at least ordered writes for quotas */ |
3464 | err = ext4_jbd2_file_inode(handle, inode); | 3650 | err = ext4_jbd2_file_inode(handle, inode); |
@@ -3512,18 +3698,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file) | |||
3512 | static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, | 3698 | static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, |
3513 | size_t cnt, loff_t *ppos) | 3699 | size_t cnt, loff_t *ppos) |
3514 | { | 3700 | { |
3515 | unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; | 3701 | unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; |
3516 | char str[32]; | 3702 | char str[32]; |
3517 | unsigned long value; | ||
3518 | 3703 | ||
3519 | if (cnt >= sizeof(str)) | 3704 | if (cnt >= sizeof(str)) |
3520 | return -EINVAL; | 3705 | return -EINVAL; |
3521 | if (copy_from_user(str, buf, cnt)) | 3706 | if (copy_from_user(str, buf, cnt)) |
3522 | return -EFAULT; | 3707 | return -EFAULT; |
3523 | value = simple_strtol(str, NULL, 0); | 3708 | |
3524 | if (value < 0) | 3709 | *p = simple_strtoul(str, NULL, 0); |
3525 | return -ERANGE; | ||
3526 | *p = value; | ||
3527 | return cnt; | 3710 | return cnt; |
3528 | } | 3711 | } |
3529 | 3712 | ||
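
The hunk above also removes dead code: once the value is parsed into an unsigned long with simple_strtoul(), the old `if (value < 0) return -ERANGE;` test can never fire. A minimal userspace sketch of the same point, with strtoul() standing in for the kernel's simple_strtoul() (names here are illustrative):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long value = strtoul("-1", NULL, 0);

	/* An unsigned long is never negative, so this branch -- the
	 * analogue of the removed -ERANGE check -- is unreachable, and
	 * most compilers warn that the comparison is always false.
	 * "-1" simply wraps around to ULONG_MAX. */
	if (value < 0)
		printf("never reached\n");

	printf("%lu\n", value);	/* 18446744073709551615 on LP64 */
	return 0;
}
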
@@ -3614,7 +3797,7 @@ static void __exit exit_ext4_fs(void) | |||
3614 | } | 3797 | } |
3615 | 3798 | ||
3616 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); | 3799 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); |
3617 | MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); | 3800 | MODULE_DESCRIPTION("Fourth Extended Filesystem"); |
3618 | MODULE_LICENSE("GPL"); | 3801 | MODULE_LICENSE("GPL"); |
3619 | module_init(init_ext4_fs) | 3802 | module_init(init_ext4_fs) |
3620 | module_exit(exit_ext4_fs) | 3803 | module_exit(exit_ext4_fs) |
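
Taken together, the super.c hunks above enforce one rule for ext4's new no-journal mode: `sbi->s_journal` may now be NULL, so every journal touchpoint in the remount path (ioprio setup, recovery completion, clearing journal errors) is gated on it, and the superblock is committed directly when there is no journal to carry the change. A compressed sketch of that control flow; the struct layout and helper bodies are illustrative stand-ins, not ext4's real code:

struct journal { int unused; };
struct sb_info { struct journal *s_journal; };

static void init_journal_params(struct journal *j) { (void)j; }
static void mark_recovery_complete(void) { }
static void clear_journal_err(void) { }
static void commit_super_directly(void) { }

static void remount_sketch(struct sb_info *sbi, int going_readonly)
{
	/* Journal tuning only makes sense when a journal exists. */
	if (sbi->s_journal)
		init_journal_params(sbi->s_journal);

	if (going_readonly) {
		if (sbi->s_journal)
			mark_recovery_complete();
	} else {
		/* r/w remount: journal errors, if any, live in the journal */
		if (sbi->s_journal)
			clear_journal_err();
	}

	/* With no journal to carry the state, write the superblock now. */
	if (sbi->s_journal == NULL)
		commit_super_directly();
}
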
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 80626d516fee..157ce6589c54 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, | |||
457 | if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { | 457 | if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { |
458 | EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); | 458 | EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); |
459 | sb->s_dirt = 1; | 459 | sb->s_dirt = 1; |
460 | ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); | 460 | ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); |
461 | } | 461 | } |
462 | } | 462 | } |
463 | 463 | ||
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, | |||
487 | ext4_forget(handle, 1, inode, bh, bh->b_blocknr); | 487 | ext4_forget(handle, 1, inode, bh, bh->b_blocknr); |
488 | } else { | 488 | } else { |
489 | le32_add_cpu(&BHDR(bh)->h_refcount, -1); | 489 | le32_add_cpu(&BHDR(bh)->h_refcount, -1); |
490 | error = ext4_journal_dirty_metadata(handle, bh); | 490 | error = ext4_handle_dirty_metadata(handle, inode, bh); |
491 | if (IS_SYNC(inode)) | 491 | if (IS_SYNC(inode)) |
492 | handle->h_sync = 1; | 492 | ext4_handle_sync(handle); |
493 | DQUOT_FREE_BLOCK(inode, 1); | 493 | DQUOT_FREE_BLOCK(inode, 1); |
494 | ea_bdebug(bh, "refcount now=%d; releasing", | 494 | ea_bdebug(bh, "refcount now=%d; releasing", |
495 | le32_to_cpu(BHDR(bh)->h_refcount)); | 495 | le32_to_cpu(BHDR(bh)->h_refcount)); |
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, | |||
724 | if (error == -EIO) | 724 | if (error == -EIO) |
725 | goto bad_block; | 725 | goto bad_block; |
726 | if (!error) | 726 | if (!error) |
727 | error = ext4_journal_dirty_metadata(handle, | 727 | error = ext4_handle_dirty_metadata(handle, |
728 | bs->bh); | 728 | inode, |
729 | bs->bh); | ||
729 | if (error) | 730 | if (error) |
730 | goto cleanup; | 731 | goto cleanup; |
731 | goto inserted; | 732 | goto inserted; |
@@ -794,8 +795,9 @@ inserted: | |||
794 | ea_bdebug(new_bh, "reusing; refcount now=%d", | 795 | ea_bdebug(new_bh, "reusing; refcount now=%d", |
795 | le32_to_cpu(BHDR(new_bh)->h_refcount)); | 796 | le32_to_cpu(BHDR(new_bh)->h_refcount)); |
796 | unlock_buffer(new_bh); | 797 | unlock_buffer(new_bh); |
797 | error = ext4_journal_dirty_metadata(handle, | 798 | error = ext4_handle_dirty_metadata(handle, |
798 | new_bh); | 799 | inode, |
800 | new_bh); | ||
799 | if (error) | 801 | if (error) |
800 | goto cleanup_dquot; | 802 | goto cleanup_dquot; |
801 | } | 803 | } |
@@ -810,8 +812,8 @@ inserted: | |||
810 | /* We need to allocate a new block */ | 812 | /* We need to allocate a new block */ |
811 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, | 813 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, |
812 | EXT4_I(inode)->i_block_group); | 814 | EXT4_I(inode)->i_block_group); |
813 | ext4_fsblk_t block = ext4_new_meta_block(handle, inode, | 815 | ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, |
814 | goal, &error); | 816 | goal, NULL, &error); |
815 | if (error) | 817 | if (error) |
816 | goto cleanup; | 818 | goto cleanup; |
817 | ea_idebug(inode, "creating block %d", block); | 819 | ea_idebug(inode, "creating block %d", block); |
@@ -833,7 +835,8 @@ getblk_failed: | |||
833 | set_buffer_uptodate(new_bh); | 835 | set_buffer_uptodate(new_bh); |
834 | unlock_buffer(new_bh); | 836 | unlock_buffer(new_bh); |
835 | ext4_xattr_cache_insert(new_bh); | 837 | ext4_xattr_cache_insert(new_bh); |
836 | error = ext4_journal_dirty_metadata(handle, new_bh); | 838 | error = ext4_handle_dirty_metadata(handle, |
839 | inode, new_bh); | ||
837 | if (error) | 840 | if (error) |
838 | goto cleanup; | 841 | goto cleanup; |
839 | } | 842 | } |
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, | |||
1040 | */ | 1043 | */ |
1041 | is.iloc.bh = NULL; | 1044 | is.iloc.bh = NULL; |
1042 | if (IS_SYNC(inode)) | 1045 | if (IS_SYNC(inode)) |
1043 | handle->h_sync = 1; | 1046 | ext4_handle_sync(handle); |
1044 | } | 1047 | } |
1045 | 1048 | ||
1046 | cleanup: | 1049 | cleanup: |
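
The xattr.c changes are mechanical, but they ride on the same no-journal series: ext4_journal_dirty_metadata() calls become ext4_handle_dirty_metadata(handle, inode, bh), and direct `handle->h_sync = 1` pokes become ext4_handle_sync(handle). The point of the wrappers is that a "handle" need not be backed by a real jbd2 transaction in no-journal mode. A hedged sketch of the wrapper idea; the validity test and fallback below are illustrative, not ext4's exact implementation:

#include <stddef.h>

struct handle { int valid; int h_sync; };
struct buffer_head { int dirty; };

/* In no-journal mode the "handle" is a sentinel, not a transaction. */
static int handle_valid(struct handle *h)
{
	return h && h->valid;
}

static int handle_dirty_metadata(struct handle *h, struct buffer_head *bh)
{
	if (handle_valid(h)) {
		/* journaled: file the buffer on the running transaction */
		bh->dirty = 1;	/* stands in for jbd2's dirty_metadata */
		return 0;
	}
	/* no journal: just mark the buffer dirty for ordinary writeback */
	bh->dirty = 1;
	return 0;
}

static void handle_sync(struct handle *h)
{
	if (handle_valid(h))
		h->h_sync = 1;	/* otherwise there is no transaction to sync */
}
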
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0ff0b8cf309..e5eaa62fd17f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
421 | * If we're a pdflush thread, then implement pdflush collision avoidance | 421 | * If we're a pdflush thread, then implement pdflush collision avoidance |
422 | * against the entire list. | 422 | * against the entire list. |
423 | * | 423 | * |
424 | * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so | ||
425 | * that it can be located for waiting on in __writeback_single_inode(). | ||
426 | * | ||
427 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. | 424 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. |
428 | * This function assumes that the blockdev superblock's inodes are backed by | 425 | * This function assumes that the blockdev superblock's inodes are backed by |
429 | * a variety of queues, so all inodes are searched. For other superblocks, | 426 | * a variety of queues, so all inodes are searched. For other superblocks, |
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
443 | struct writeback_control *wbc) | 440 | struct writeback_control *wbc) |
444 | { | 441 | { |
445 | const unsigned long start = jiffies; /* livelock avoidance */ | 442 | const unsigned long start = jiffies; /* livelock avoidance */ |
443 | int sync = wbc->sync_mode == WB_SYNC_ALL; | ||
446 | 444 | ||
447 | spin_lock(&inode_lock); | 445 | spin_lock(&inode_lock); |
448 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) | 446 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) |
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
499 | __iget(inode); | 497 | __iget(inode); |
500 | pages_skipped = wbc->pages_skipped; | 498 | pages_skipped = wbc->pages_skipped; |
501 | __writeback_single_inode(inode, wbc); | 499 | __writeback_single_inode(inode, wbc); |
502 | if (wbc->sync_mode == WB_SYNC_HOLD) { | ||
503 | inode->dirtied_when = jiffies; | ||
504 | list_move(&inode->i_list, &sb->s_dirty); | ||
505 | } | ||
506 | if (current_is_pdflush()) | 500 | if (current_is_pdflush()) |
507 | writeback_release(bdi); | 501 | writeback_release(bdi); |
508 | if (wbc->pages_skipped != pages_skipped) { | 502 | if (wbc->pages_skipped != pages_skipped) { |
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
523 | if (!list_empty(&sb->s_more_io)) | 517 | if (!list_empty(&sb->s_more_io)) |
524 | wbc->more_io = 1; | 518 | wbc->more_io = 1; |
525 | } | 519 | } |
526 | spin_unlock(&inode_lock); | 520 | |
521 | if (sync) { | ||
522 | struct inode *inode, *old_inode = NULL; | ||
523 | |||
524 | /* | ||
525 | * Data integrity sync. Must wait for all pages under writeback, | ||
526 | * because there may have been pages dirtied before our sync | ||
527 | * call whose writeout was started before we could write them | ||
528 | * out. In that case the inode may not be on the dirty list, | ||
529 | * but we still have to wait for that writeout. | ||
530 | */ | ||
531 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { | ||
532 | struct address_space *mapping; | ||
533 | |||
534 | if (inode->i_state & (I_FREEING|I_WILL_FREE)) | ||
535 | continue; | ||
536 | mapping = inode->i_mapping; | ||
537 | if (mapping->nrpages == 0) | ||
538 | continue; | ||
539 | __iget(inode); | ||
540 | spin_unlock(&inode_lock); | ||
541 | /* | ||
542 | * We hold a reference to 'inode' so it couldn't have | ||
543 | * been removed from s_inodes list while we dropped the | ||
544 | * inode_lock. We cannot iput the inode now as we may | ||
545 | * be holding the last reference and we cannot iput it | ||
546 | * under inode_lock. So we keep the reference and iput | ||
547 | * it later. | ||
548 | */ | ||
549 | iput(old_inode); | ||
550 | old_inode = inode; | ||
551 | |||
552 | filemap_fdatawait(mapping); | ||
553 | |||
554 | cond_resched(); | ||
555 | |||
556 | spin_lock(&inode_lock); | ||
557 | } | ||
558 | spin_unlock(&inode_lock); | ||
559 | iput(old_inode); | ||
560 | } else | ||
561 | spin_unlock(&inode_lock); | ||
562 | |||
527 | return; /* Leave any unwritten inodes on s_io */ | 563 | return; /* Leave any unwritten inodes on s_io */ |
528 | } | 564 | } |
529 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); | 565 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); |
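
The WB_SYNC_ALL branch added above uses a standard pattern for walking a lock-protected list while doing blocking work on each element: pin the element with a reference, drop the lock, block (filemap_fdatawait()), retake the lock, and defer the previous iput() to a point where the lock is not held, since dropping the last reference may sleep. A userspace sketch of the same shape, assuming a refcounted singly linked list with a pthread mutex standing in for inode_lock:

#include <pthread.h>
#include <unistd.h>

struct node {
	struct node *next;
	int refs;			/* protected by list_lock */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void node_put(struct node *n)
{
	pthread_mutex_lock(&list_lock);
	n->refs--;			/* a real version frees at zero */
	pthread_mutex_unlock(&list_lock);
}

static void wait_on_all(void)
{
	struct node *n, *old = NULL;

	pthread_mutex_lock(&list_lock);
	for (n = head; n; n = n->next) {
		n->refs++;		/* pin: n stays linked while we sleep */
		pthread_mutex_unlock(&list_lock);

		if (old)
			node_put(old);	/* previous ref, dropped unlocked */
		old = n;

		usleep(1000);		/* blocking work, e.g. fdatawait */

		pthread_mutex_lock(&list_lock);	/* n pinned => n->next valid */
	}
	pthread_mutex_unlock(&list_lock);
	if (old)
		node_put(old);
}
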
@@ -588,8 +624,7 @@ restart: | |||
588 | 624 | ||
589 | /* | 625 | /* |
590 | * writeback and wait upon the filesystem's dirty inodes. The caller will | 626 | * writeback and wait upon the filesystem's dirty inodes. The caller will |
591 | * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is | 627 | * do this in two passes - one to write, and one to wait. |
592 | * used to park the written inodes on sb->s_dirty for the wait pass. | ||
593 | * | 628 | * |
594 | * A finite limit is set on the number of pages which will be written. | 629 | * A finite limit is set on the number of pages which will be written. |
595 | * To prevent infinite livelock of sys_sync(). | 630 | * To prevent infinite livelock of sys_sync(). |
@@ -600,30 +635,21 @@ restart: | |||
600 | void sync_inodes_sb(struct super_block *sb, int wait) | 635 | void sync_inodes_sb(struct super_block *sb, int wait) |
601 | { | 636 | { |
602 | struct writeback_control wbc = { | 637 | struct writeback_control wbc = { |
603 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, | 638 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, |
604 | .range_start = 0, | 639 | .range_start = 0, |
605 | .range_end = LLONG_MAX, | 640 | .range_end = LLONG_MAX, |
606 | }; | 641 | }; |
607 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | ||
608 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | ||
609 | 642 | ||
610 | wbc.nr_to_write = nr_dirty + nr_unstable + | 643 | if (!wait) { |
611 | (inodes_stat.nr_inodes - inodes_stat.nr_unused) + | 644 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); |
612 | nr_dirty + nr_unstable; | 645 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); |
613 | wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ | ||
614 | sync_sb_inodes(sb, &wbc); | ||
615 | } | ||
616 | 646 | ||
617 | /* | 647 | wbc.nr_to_write = nr_dirty + nr_unstable + |
618 | * Rather lame livelock avoidance. | 648 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
619 | */ | 649 | } else |
620 | static void set_sb_syncing(int val) | 650 | wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ |
621 | { | 651 | |
622 | struct super_block *sb; | 652 | sync_sb_inodes(sb, &wbc); |
623 | spin_lock(&sb_lock); | ||
624 | list_for_each_entry_reverse(sb, &super_blocks, s_list) | ||
625 | sb->s_syncing = val; | ||
626 | spin_unlock(&sb_lock); | ||
627 | } | 653 | } |
628 | 654 | ||
629 | /** | 655 | /** |
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait) | |||
652 | spin_lock(&sb_lock); | 678 | spin_lock(&sb_lock); |
653 | restart: | 679 | restart: |
654 | list_for_each_entry(sb, &super_blocks, s_list) { | 680 | list_for_each_entry(sb, &super_blocks, s_list) { |
655 | if (sb->s_syncing) | ||
656 | continue; | ||
657 | sb->s_syncing = 1; | ||
658 | sb->s_count++; | 681 | sb->s_count++; |
659 | spin_unlock(&sb_lock); | 682 | spin_unlock(&sb_lock); |
660 | down_read(&sb->s_umount); | 683 | down_read(&sb->s_umount); |
@@ -672,13 +695,10 @@ restart: | |||
672 | 695 | ||
673 | void sync_inodes(int wait) | 696 | void sync_inodes(int wait) |
674 | { | 697 | { |
675 | set_sb_syncing(0); | ||
676 | __sync_inodes(0); | 698 | __sync_inodes(0); |
677 | 699 | ||
678 | if (wait) { | 700 | if (wait) |
679 | set_sb_syncing(0); | ||
680 | __sync_inodes(1); | 701 | __sync_inodes(1); |
681 | } | ||
682 | } | 702 | } |
683 | 703 | ||
684 | /** | 704 | /** |
diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4f3cab321415..99c99dfb0373 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, | |||
48 | size_t size; | 48 | size_t size; |
49 | 49 | ||
50 | if (!*ppos) { | 50 | if (!*ppos) { |
51 | long value; | ||
51 | struct fuse_conn *fc = fuse_ctl_file_conn_get(file); | 52 | struct fuse_conn *fc = fuse_ctl_file_conn_get(file); |
52 | if (!fc) | 53 | if (!fc) |
53 | return 0; | 54 | return 0; |
54 | 55 | ||
55 | file->private_data=(void *)(long)atomic_read(&fc->num_waiting); | 56 | value = atomic_read(&fc->num_waiting); |
57 | file->private_data = (void *)value; | ||
56 | fuse_conn_put(fc); | 58 | fuse_conn_put(fc); |
57 | } | 59 | } |
58 | size = sprintf(tmp, "%ld\n", (long)file->private_data); | 60 | size = sprintf(tmp, "%ld\n", (long)file->private_data); |
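
The control.c change is a warning fix: casting the int result of atomic_read() straight to `void *` trips "cast to pointer from integer of different size" on 64-bit builds, so the value is widened to long first. A small userspace illustration of the idiom (stashing a small integer in a pointer-sized slot instead of allocating):

#include <stdio.h>

int main(void)
{
	int num_waiting = 42;		/* e.g. an atomic_read() result */
	void *private_data;

	/* Widen to long before the pointer cast so the conversion is
	 * int -> long -> pointer, with long and pointer the same width
	 * on LP64. */
	long value = num_waiting;
	private_data = (void *)value;

	printf("%ld\n", (long)private_data);	/* prints 42 */
	return 0;
}
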
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fba571648a8e..e0c7ada08a1f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc) | |||
269 | * Called with fc->lock, unlocks it | 269 | * Called with fc->lock, unlocks it |
270 | */ | 270 | */ |
271 | static void request_end(struct fuse_conn *fc, struct fuse_req *req) | 271 | static void request_end(struct fuse_conn *fc, struct fuse_req *req) |
272 | __releases(fc->lock) | 272 | __releases(&fc->lock) |
273 | { | 273 | { |
274 | void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; | 274 | void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; |
275 | req->end = NULL; | 275 | req->end = NULL; |
@@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) | |||
293 | wake_up(&req->waitq); | 293 | wake_up(&req->waitq); |
294 | if (end) | 294 | if (end) |
295 | end(fc, req); | 295 | end(fc, req); |
296 | else | 296 | fuse_put_request(fc, req); |
297 | fuse_put_request(fc, req); | ||
298 | } | 297 | } |
299 | 298 | ||
300 | static void wait_answer_interruptible(struct fuse_conn *fc, | 299 | static void wait_answer_interruptible(struct fuse_conn *fc, |
301 | struct fuse_req *req) | 300 | struct fuse_req *req) |
302 | __releases(fc->lock) __acquires(fc->lock) | 301 | __releases(&fc->lock) |
302 | __acquires(&fc->lock) | ||
303 | { | 303 | { |
304 | if (signal_pending(current)) | 304 | if (signal_pending(current)) |
305 | return; | 305 | return; |
@@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) | |||
317 | } | 317 | } |
318 | 318 | ||
319 | static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) | 319 | static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) |
320 | __releases(fc->lock) __acquires(fc->lock) | 320 | __releases(&fc->lock) |
321 | __acquires(&fc->lock) | ||
321 | { | 322 | { |
322 | if (!fc->no_interrupt) { | 323 | if (!fc->no_interrupt) { |
323 | /* Any signal may interrupt this */ | 324 | /* Any signal may interrupt this */ |
@@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) | |||
380 | } | 381 | } |
381 | } | 382 | } |
382 | 383 | ||
383 | void request_send(struct fuse_conn *fc, struct fuse_req *req) | 384 | void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) |
384 | { | 385 | { |
385 | req->isreply = 1; | 386 | req->isreply = 1; |
386 | spin_lock(&fc->lock); | 387 | spin_lock(&fc->lock); |
@@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) | |||
399 | spin_unlock(&fc->lock); | 400 | spin_unlock(&fc->lock); |
400 | } | 401 | } |
401 | 402 | ||
402 | static void request_send_nowait_locked(struct fuse_conn *fc, | 403 | static void fuse_request_send_nowait_locked(struct fuse_conn *fc, |
403 | struct fuse_req *req) | 404 | struct fuse_req *req) |
404 | { | 405 | { |
405 | req->background = 1; | 406 | req->background = 1; |
406 | fc->num_background++; | 407 | fc->num_background++; |
@@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc, | |||
414 | flush_bg_queue(fc); | 415 | flush_bg_queue(fc); |
415 | } | 416 | } |
416 | 417 | ||
417 | static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) | 418 | static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) |
418 | { | 419 | { |
419 | spin_lock(&fc->lock); | 420 | spin_lock(&fc->lock); |
420 | if (fc->connected) { | 421 | if (fc->connected) { |
421 | request_send_nowait_locked(fc, req); | 422 | fuse_request_send_nowait_locked(fc, req); |
422 | spin_unlock(&fc->lock); | 423 | spin_unlock(&fc->lock); |
423 | } else { | 424 | } else { |
424 | req->out.h.error = -ENOTCONN; | 425 | req->out.h.error = -ENOTCONN; |
@@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) | |||
426 | } | 427 | } |
427 | } | 428 | } |
428 | 429 | ||
429 | void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) | 430 | void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) |
430 | { | 431 | { |
431 | req->isreply = 0; | 432 | req->isreply = 0; |
432 | request_send_nowait(fc, req); | 433 | fuse_request_send_nowait(fc, req); |
433 | } | 434 | } |
434 | 435 | ||
435 | void request_send_background(struct fuse_conn *fc, struct fuse_req *req) | 436 | void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) |
436 | { | 437 | { |
437 | req->isreply = 1; | 438 | req->isreply = 1; |
438 | request_send_nowait(fc, req); | 439 | fuse_request_send_nowait(fc, req); |
439 | } | 440 | } |
440 | 441 | ||
441 | /* | 442 | /* |
@@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) | |||
443 | * | 444 | * |
444 | * fc->connected must have been checked previously | 445 | * fc->connected must have been checked previously |
445 | */ | 446 | */ |
446 | void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) | 447 | void fuse_request_send_background_locked(struct fuse_conn *fc, |
448 | struct fuse_req *req) | ||
447 | { | 449 | { |
448 | req->isreply = 1; | 450 | req->isreply = 1; |
449 | request_send_nowait_locked(fc, req); | 451 | fuse_request_send_nowait_locked(fc, req); |
450 | } | 452 | } |
451 | 453 | ||
452 | /* | 454 | /* |
@@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) | |||
539 | BUG_ON(!cs->nr_segs); | 541 | BUG_ON(!cs->nr_segs); |
540 | cs->seglen = cs->iov[0].iov_len; | 542 | cs->seglen = cs->iov[0].iov_len; |
541 | cs->addr = (unsigned long) cs->iov[0].iov_base; | 543 | cs->addr = (unsigned long) cs->iov[0].iov_base; |
542 | cs->iov ++; | 544 | cs->iov++; |
543 | cs->nr_segs --; | 545 | cs->nr_segs--; |
544 | } | 546 | } |
545 | down_read(¤t->mm->mmap_sem); | 547 | down_read(¤t->mm->mmap_sem); |
546 | err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, | 548 | err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, |
@@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, | |||
589 | kunmap_atomic(mapaddr, KM_USER1); | 591 | kunmap_atomic(mapaddr, KM_USER1); |
590 | } | 592 | } |
591 | while (count) { | 593 | while (count) { |
592 | int err; | 594 | if (!cs->len) { |
593 | if (!cs->len && (err = fuse_copy_fill(cs))) | 595 | int err = fuse_copy_fill(cs); |
594 | return err; | 596 | if (err) |
597 | return err; | ||
598 | } | ||
595 | if (page) { | 599 | if (page) { |
596 | void *mapaddr = kmap_atomic(page, KM_USER1); | 600 | void *mapaddr = kmap_atomic(page, KM_USER1); |
597 | void *buf = mapaddr + offset; | 601 | void *buf = mapaddr + offset; |
@@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, | |||
631 | static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) | 635 | static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) |
632 | { | 636 | { |
633 | while (size) { | 637 | while (size) { |
634 | int err; | 638 | if (!cs->len) { |
635 | if (!cs->len && (err = fuse_copy_fill(cs))) | 639 | int err = fuse_copy_fill(cs); |
636 | return err; | 640 | if (err) |
641 | return err; | ||
642 | } | ||
637 | fuse_copy_do(cs, &val, &size); | 643 | fuse_copy_do(cs, &val, &size); |
638 | } | 644 | } |
639 | return 0; | 645 | return 0; |
@@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc) | |||
664 | 670 | ||
665 | /* Wait until a request is available on the pending list */ | 671 | /* Wait until a request is available on the pending list */ |
666 | static void request_wait(struct fuse_conn *fc) | 672 | static void request_wait(struct fuse_conn *fc) |
673 | __releases(&fc->lock) | ||
674 | __acquires(&fc->lock) | ||
667 | { | 675 | { |
668 | DECLARE_WAITQUEUE(wait, current); | 676 | DECLARE_WAITQUEUE(wait, current); |
669 | 677 | ||
@@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc) | |||
691 | */ | 699 | */ |
692 | static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, | 700 | static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, |
693 | const struct iovec *iov, unsigned long nr_segs) | 701 | const struct iovec *iov, unsigned long nr_segs) |
694 | __releases(fc->lock) | 702 | __releases(&fc->lock) |
695 | { | 703 | { |
696 | struct fuse_copy_state cs; | 704 | struct fuse_copy_state cs; |
697 | struct fuse_in_header ih; | 705 | struct fuse_in_header ih; |
@@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, | |||
813 | return err; | 821 | return err; |
814 | } | 822 | } |
815 | 823 | ||
824 | static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, | ||
825 | struct fuse_copy_state *cs) | ||
826 | { | ||
827 | struct fuse_notify_poll_wakeup_out outarg; | ||
828 | int err; | ||
829 | |||
830 | if (size != sizeof(outarg)) | ||
831 | return -EINVAL; | ||
832 | |||
833 | err = fuse_copy_one(cs, &outarg, sizeof(outarg)); | ||
834 | if (err) | ||
835 | return err; | ||
836 | |||
837 | return fuse_notify_poll_wakeup(fc, &outarg); | ||
838 | } | ||
839 | |||
840 | static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, | ||
841 | unsigned int size, struct fuse_copy_state *cs) | ||
842 | { | ||
843 | switch (code) { | ||
844 | case FUSE_NOTIFY_POLL: | ||
845 | return fuse_notify_poll(fc, size, cs); | ||
846 | |||
847 | default: | ||
848 | return -EINVAL; | ||
849 | } | ||
850 | } | ||
851 | |||
816 | /* Look up request on processing list by unique ID */ | 852 | /* Look up request on processing list by unique ID */ |
817 | static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) | 853 | static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) |
818 | { | 854 | { |
@@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, | |||
876 | err = fuse_copy_one(&cs, &oh, sizeof(oh)); | 912 | err = fuse_copy_one(&cs, &oh, sizeof(oh)); |
877 | if (err) | 913 | if (err) |
878 | goto err_finish; | 914 | goto err_finish; |
915 | |||
916 | err = -EINVAL; | ||
917 | if (oh.len != nbytes) | ||
918 | goto err_finish; | ||
919 | |||
920 | /* | ||
921 | * Zero oh.unique indicates unsolicited notification message | ||
922 | * and error contains notification code. | ||
923 | */ | ||
924 | if (!oh.unique) { | ||
925 | err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); | ||
926 | fuse_copy_finish(&cs); | ||
927 | return err ? err : nbytes; | ||
928 | } | ||
929 | |||
879 | err = -EINVAL; | 930 | err = -EINVAL; |
880 | if (!oh.unique || oh.error <= -1000 || oh.error > 0 || | 931 | if (oh.error <= -1000 || oh.error > 0) |
881 | oh.len != nbytes) | ||
882 | goto err_finish; | 932 | goto err_finish; |
883 | 933 | ||
884 | spin_lock(&fc->lock); | 934 | spin_lock(&fc->lock); |
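
With this hunk, a write on /dev/fuse whose header has `unique == 0` is treated as an unsolicited notification rather than a reply, and the notification code travels in the field that otherwise holds an errno. A hedged sketch of what a userspace server would write for a poll wakeup; the wire structs are declared inline here with the layout of this era's include/linux/fuse.h, so treat it as a sketch, not a drop-in server:

#include <stdint.h>
#include <string.h>
#include <unistd.h>

struct fuse_out_header {
	uint32_t len;
	int32_t  error;		/* notification code when unique == 0 */
	uint64_t unique;
};
struct fuse_notify_poll_wakeup_out {
	uint64_t kh;		/* the file's poll handle */
};
#define FUSE_NOTIFY_POLL 1

static int notify_poll(int devfd, uint64_t kh)
{
	struct {
		struct fuse_out_header h;
		struct fuse_notify_poll_wakeup_out arg;
	} msg;
	ssize_t ret;

	memset(&msg, 0, sizeof(msg));
	msg.h.len    = sizeof(msg);	/* kernel checks oh.len == nbytes */
	msg.h.error  = FUSE_NOTIFY_POLL;	/* code, not an errno */
	msg.h.unique = 0;			/* 0 == notification */
	msg.arg.kh   = kh;

	ret = write(devfd, &msg, sizeof(msg));
	return ret == (ssize_t)sizeof(msg) ? 0 : -1;
}
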
@@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) | |||
966 | * This function releases and reacquires fc->lock | 1016 | * This function releases and reacquires fc->lock |
967 | */ | 1017 | */ |
968 | static void end_requests(struct fuse_conn *fc, struct list_head *head) | 1018 | static void end_requests(struct fuse_conn *fc, struct list_head *head) |
1019 | __releases(&fc->lock) | ||
1020 | __acquires(&fc->lock) | ||
969 | { | 1021 | { |
970 | while (!list_empty(head)) { | 1022 | while (!list_empty(head)) { |
971 | struct fuse_req *req; | 1023 | struct fuse_req *req; |
@@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) | |||
988 | * locked). | 1040 | * locked). |
989 | */ | 1041 | */ |
990 | static void end_io_requests(struct fuse_conn *fc) | 1042 | static void end_io_requests(struct fuse_conn *fc) |
991 | __releases(fc->lock) __acquires(fc->lock) | 1043 | __releases(&fc->lock) |
1044 | __acquires(&fc->lock) | ||
992 | { | 1045 | { |
993 | while (!list_empty(&fc->io)) { | 1046 | while (!list_empty(&fc->io)) { |
994 | struct fuse_req *req = | 1047 | struct fuse_req *req = |
@@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc) | |||
1002 | wake_up(&req->waitq); | 1055 | wake_up(&req->waitq); |
1003 | if (end) { | 1056 | if (end) { |
1004 | req->end = NULL; | 1057 | req->end = NULL; |
1005 | /* The end function will consume this reference */ | ||
1006 | __fuse_get_request(req); | 1058 | __fuse_get_request(req); |
1007 | spin_unlock(&fc->lock); | 1059 | spin_unlock(&fc->lock); |
1008 | wait_event(req->waitq, !req->locked); | 1060 | wait_event(req->waitq, !req->locked); |
1009 | end(fc, req); | 1061 | end(fc, req); |
1062 | fuse_put_request(fc, req); | ||
1010 | spin_lock(&fc->lock); | 1063 | spin_lock(&fc->lock); |
1011 | } | 1064 | } |
1012 | } | 1065 | } |
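
A theme across the dev.c hunks: request reference counting is made symmetric. Previously request_end() put the request only when there was no ->end callback; the callback "consumed" the reference and had to call fuse_put_request() itself, as fuse_release_end() and fuse_readpages_end() did. Now request_end() and end_io_requests() always pair their own get with a put, and callbacks never own the reference. A sketch of the before/after ownership rule (req and its fields are stand-ins for fuse_req):

struct req {
	int refs;
	void (*end)(struct req *);
};

static void req_put(struct req *r) { r->refs--; /* free at zero */ }

/* Old rule: the callback consumed the caller's reference. */
static void request_end_old(struct req *r)
{
	if (r->end)
		r->end(r);	/* callback must call req_put() itself */
	else
		req_put(r);
}

/* New rule: whoever takes a reference drops it; callbacks never do. */
static void request_end_new(struct req *r)
{
	if (r->end)
		r->end(r);	/* no ownership transfer */
	req_put(r);
}
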
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 95bc22bdd060..fdff346e96fd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) | |||
189 | parent = dget_parent(entry); | 189 | parent = dget_parent(entry); |
190 | fuse_lookup_init(fc, req, get_node_id(parent->d_inode), | 190 | fuse_lookup_init(fc, req, get_node_id(parent->d_inode), |
191 | &entry->d_name, &outarg); | 191 | &entry->d_name, &outarg); |
192 | request_send(fc, req); | 192 | fuse_request_send(fc, req); |
193 | dput(parent); | 193 | dput(parent); |
194 | err = req->out.h.error; | 194 | err = req->out.h.error; |
195 | fuse_put_request(fc, req); | 195 | fuse_put_request(fc, req); |
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) | |||
204 | return 0; | 204 | return 0; |
205 | } | 205 | } |
206 | spin_lock(&fc->lock); | 206 | spin_lock(&fc->lock); |
207 | fi->nlookup ++; | 207 | fi->nlookup++; |
208 | spin_unlock(&fc->lock); | 208 | spin_unlock(&fc->lock); |
209 | } | 209 | } |
210 | fuse_put_request(fc, forget_req); | 210 | fuse_put_request(fc, forget_req); |
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, | |||
283 | attr_version = fuse_get_attr_version(fc); | 283 | attr_version = fuse_get_attr_version(fc); |
284 | 284 | ||
285 | fuse_lookup_init(fc, req, nodeid, name, outarg); | 285 | fuse_lookup_init(fc, req, nodeid, name, outarg); |
286 | request_send(fc, req); | 286 | fuse_request_send(fc, req); |
287 | err = req->out.h.error; | 287 | err = req->out.h.error; |
288 | fuse_put_request(fc, req); | 288 | fuse_put_request(fc, req); |
289 | /* Zero nodeid is same as -ENOENT, but with valid timeout */ | 289 | /* Zero nodeid is same as -ENOENT, but with valid timeout */ |
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff, | |||
369 | { | 369 | { |
370 | fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); | 370 | fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); |
371 | ff->reserved_req->force = 1; | 371 | ff->reserved_req->force = 1; |
372 | request_send(fc, ff->reserved_req); | 372 | fuse_request_send(fc, ff->reserved_req); |
373 | fuse_put_request(fc, ff->reserved_req); | 373 | fuse_put_request(fc, ff->reserved_req); |
374 | kfree(ff); | 374 | kfree(ff); |
375 | } | 375 | } |
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, | |||
408 | goto out_put_forget_req; | 408 | goto out_put_forget_req; |
409 | 409 | ||
410 | err = -ENOMEM; | 410 | err = -ENOMEM; |
411 | ff = fuse_file_alloc(); | 411 | ff = fuse_file_alloc(fc); |
412 | if (!ff) | 412 | if (!ff) |
413 | goto out_put_request; | 413 | goto out_put_request; |
414 | 414 | ||
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, | |||
432 | req->out.args[0].value = &outentry; | 432 | req->out.args[0].value = &outentry; |
433 | req->out.args[1].size = sizeof(outopen); | 433 | req->out.args[1].size = sizeof(outopen); |
434 | req->out.args[1].value = &outopen; | 434 | req->out.args[1].value = &outopen; |
435 | request_send(fc, req); | 435 | fuse_request_send(fc, req); |
436 | err = req->out.h.error; | 436 | err = req->out.h.error; |
437 | if (err) { | 437 | if (err) { |
438 | if (err == -ENOSYS) | 438 | if (err == -ENOSYS) |
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, | |||
502 | else | 502 | else |
503 | req->out.args[0].size = sizeof(outarg); | 503 | req->out.args[0].size = sizeof(outarg); |
504 | req->out.args[0].value = &outarg; | 504 | req->out.args[0].value = &outarg; |
505 | request_send(fc, req); | 505 | fuse_request_send(fc, req); |
506 | err = req->out.h.error; | 506 | err = req->out.h.error; |
507 | fuse_put_request(fc, req); | 507 | fuse_put_request(fc, req); |
508 | if (err) | 508 | if (err) |
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) | |||
631 | req->in.numargs = 1; | 631 | req->in.numargs = 1; |
632 | req->in.args[0].size = entry->d_name.len + 1; | 632 | req->in.args[0].size = entry->d_name.len + 1; |
633 | req->in.args[0].value = entry->d_name.name; | 633 | req->in.args[0].value = entry->d_name.name; |
634 | request_send(fc, req); | 634 | fuse_request_send(fc, req); |
635 | err = req->out.h.error; | 635 | err = req->out.h.error; |
636 | fuse_put_request(fc, req); | 636 | fuse_put_request(fc, req); |
637 | if (!err) { | 637 | if (!err) { |
638 | struct inode *inode = entry->d_inode; | 638 | struct inode *inode = entry->d_inode; |
639 | 639 | ||
640 | /* Set nlink to zero so the inode can be cleared, if | 640 | /* |
641 | the inode does have more links this will be | 641 | * Set nlink to zero so the inode can be cleared, if the inode |
642 | discovered at the next lookup/getattr */ | 642 | * does have more links this will be discovered at the next |
643 | * lookup/getattr. | ||
644 | */ | ||
643 | clear_nlink(inode); | 645 | clear_nlink(inode); |
644 | fuse_invalidate_attr(inode); | 646 | fuse_invalidate_attr(inode); |
645 | fuse_invalidate_attr(dir); | 647 | fuse_invalidate_attr(dir); |
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) | |||
662 | req->in.numargs = 1; | 664 | req->in.numargs = 1; |
663 | req->in.args[0].size = entry->d_name.len + 1; | 665 | req->in.args[0].size = entry->d_name.len + 1; |
664 | req->in.args[0].value = entry->d_name.name; | 666 | req->in.args[0].value = entry->d_name.name; |
665 | request_send(fc, req); | 667 | fuse_request_send(fc, req); |
666 | err = req->out.h.error; | 668 | err = req->out.h.error; |
667 | fuse_put_request(fc, req); | 669 | fuse_put_request(fc, req); |
668 | if (!err) { | 670 | if (!err) { |
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, | |||
695 | req->in.args[1].value = oldent->d_name.name; | 697 | req->in.args[1].value = oldent->d_name.name; |
696 | req->in.args[2].size = newent->d_name.len + 1; | 698 | req->in.args[2].size = newent->d_name.len + 1; |
697 | req->in.args[2].value = newent->d_name.name; | 699 | req->in.args[2].value = newent->d_name.name; |
698 | request_send(fc, req); | 700 | fuse_request_send(fc, req); |
699 | err = req->out.h.error; | 701 | err = req->out.h.error; |
700 | fuse_put_request(fc, req); | 702 | fuse_put_request(fc, req); |
701 | if (!err) { | 703 | if (!err) { |
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, | |||
811 | else | 813 | else |
812 | req->out.args[0].size = sizeof(outarg); | 814 | req->out.args[0].size = sizeof(outarg); |
813 | req->out.args[0].value = &outarg; | 815 | req->out.args[0].value = &outarg; |
814 | request_send(fc, req); | 816 | fuse_request_send(fc, req); |
815 | err = req->out.h.error; | 817 | err = req->out.h.error; |
816 | fuse_put_request(fc, req); | 818 | fuse_put_request(fc, req); |
817 | if (!err) { | 819 | if (!err) { |
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask) | |||
911 | req->in.numargs = 1; | 913 | req->in.numargs = 1; |
912 | req->in.args[0].size = sizeof(inarg); | 914 | req->in.args[0].size = sizeof(inarg); |
913 | req->in.args[0].value = &inarg; | 915 | req->in.args[0].value = &inarg; |
914 | request_send(fc, req); | 916 | fuse_request_send(fc, req); |
915 | err = req->out.h.error; | 917 | err = req->out.h.error; |
916 | fuse_put_request(fc, req); | 918 | fuse_put_request(fc, req); |
917 | if (err == -ENOSYS) { | 919 | if (err == -ENOSYS) { |
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | |||
1033 | req->num_pages = 1; | 1035 | req->num_pages = 1; |
1034 | req->pages[0] = page; | 1036 | req->pages[0] = page; |
1035 | fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); | 1037 | fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); |
1036 | request_send(fc, req); | 1038 | fuse_request_send(fc, req); |
1037 | nbytes = req->out.args[0].size; | 1039 | nbytes = req->out.args[0].size; |
1038 | err = req->out.h.error; | 1040 | err = req->out.h.error; |
1039 | fuse_put_request(fc, req); | 1041 | fuse_put_request(fc, req); |
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry) | |||
1067 | req->out.numargs = 1; | 1069 | req->out.numargs = 1; |
1068 | req->out.args[0].size = PAGE_SIZE - 1; | 1070 | req->out.args[0].size = PAGE_SIZE - 1; |
1069 | req->out.args[0].value = link; | 1071 | req->out.args[0].value = link; |
1070 | request_send(fc, req); | 1072 | fuse_request_send(fc, req); |
1071 | if (req->out.h.error) { | 1073 | if (req->out.h.error) { |
1072 | free_page((unsigned long) link); | 1074 | free_page((unsigned long) link); |
1073 | link = ERR_PTR(req->out.h.error); | 1075 | link = ERR_PTR(req->out.h.error); |
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, | |||
1273 | else | 1275 | else |
1274 | req->out.args[0].size = sizeof(outarg); | 1276 | req->out.args[0].size = sizeof(outarg); |
1275 | req->out.args[0].value = &outarg; | 1277 | req->out.args[0].value = &outarg; |
1276 | request_send(fc, req); | 1278 | fuse_request_send(fc, req); |
1277 | err = req->out.h.error; | 1279 | err = req->out.h.error; |
1278 | fuse_put_request(fc, req); | 1280 | fuse_put_request(fc, req); |
1279 | if (err) { | 1281 | if (err) { |
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, | |||
1367 | req->in.args[1].value = name; | 1369 | req->in.args[1].value = name; |
1368 | req->in.args[2].size = size; | 1370 | req->in.args[2].size = size; |
1369 | req->in.args[2].value = value; | 1371 | req->in.args[2].value = value; |
1370 | request_send(fc, req); | 1372 | fuse_request_send(fc, req); |
1371 | err = req->out.h.error; | 1373 | err = req->out.h.error; |
1372 | fuse_put_request(fc, req); | 1374 | fuse_put_request(fc, req); |
1373 | if (err == -ENOSYS) { | 1375 | if (err == -ENOSYS) { |
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, | |||
1413 | req->out.args[0].size = sizeof(outarg); | 1415 | req->out.args[0].size = sizeof(outarg); |
1414 | req->out.args[0].value = &outarg; | 1416 | req->out.args[0].value = &outarg; |
1415 | } | 1417 | } |
1416 | request_send(fc, req); | 1418 | fuse_request_send(fc, req); |
1417 | ret = req->out.h.error; | 1419 | ret = req->out.h.error; |
1418 | if (!ret) | 1420 | if (!ret) |
1419 | ret = size ? req->out.args[0].size : outarg.size; | 1421 | ret = size ? req->out.args[0].size : outarg.size; |
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) | |||
1463 | req->out.args[0].size = sizeof(outarg); | 1465 | req->out.args[0].size = sizeof(outarg); |
1464 | req->out.args[0].value = &outarg; | 1466 | req->out.args[0].value = &outarg; |
1465 | } | 1467 | } |
1466 | request_send(fc, req); | 1468 | fuse_request_send(fc, req); |
1467 | ret = req->out.h.error; | 1469 | ret = req->out.h.error; |
1468 | if (!ret) | 1470 | if (!ret) |
1469 | ret = size ? req->out.args[0].size : outarg.size; | 1471 | ret = size ? req->out.args[0].size : outarg.size; |
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) | |||
1496 | req->in.numargs = 1; | 1498 | req->in.numargs = 1; |
1497 | req->in.args[0].size = strlen(name) + 1; | 1499 | req->in.args[0].size = strlen(name) + 1; |
1498 | req->in.args[0].value = name; | 1500 | req->in.args[0].value = name; |
1499 | request_send(fc, req); | 1501 | fuse_request_send(fc, req); |
1500 | err = req->out.h.error; | 1502 | err = req->out.h.error; |
1501 | fuse_put_request(fc, req); | 1503 | fuse_put_request(fc, req); |
1502 | if (err == -ENOSYS) { | 1504 | if (err == -ENOSYS) { |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4c9ee7011265..e8162646a9b5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, | |||
39 | req->out.numargs = 1; | 39 | req->out.numargs = 1; |
40 | req->out.args[0].size = sizeof(*outargp); | 40 | req->out.args[0].size = sizeof(*outargp); |
41 | req->out.args[0].value = outargp; | 41 | req->out.args[0].value = outargp; |
42 | request_send(fc, req); | 42 | fuse_request_send(fc, req); |
43 | err = req->out.h.error; | 43 | err = req->out.h.error; |
44 | fuse_put_request(fc, req); | 44 | fuse_put_request(fc, req); |
45 | 45 | ||
46 | return err; | 46 | return err; |
47 | } | 47 | } |
48 | 48 | ||
49 | struct fuse_file *fuse_file_alloc(void) | 49 | struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) |
50 | { | 50 | { |
51 | struct fuse_file *ff; | 51 | struct fuse_file *ff; |
52 | ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); | 52 | ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); |
@@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void) | |||
58 | } else { | 58 | } else { |
59 | INIT_LIST_HEAD(&ff->write_entry); | 59 | INIT_LIST_HEAD(&ff->write_entry); |
60 | atomic_set(&ff->count, 0); | 60 | atomic_set(&ff->count, 0); |
61 | spin_lock(&fc->lock); | ||
62 | ff->kh = ++fc->khctr; | ||
63 | spin_unlock(&fc->lock); | ||
61 | } | 64 | } |
65 | RB_CLEAR_NODE(&ff->polled_node); | ||
66 | init_waitqueue_head(&ff->poll_wait); | ||
62 | } | 67 | } |
63 | return ff; | 68 | return ff; |
64 | } | 69 | } |
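
fuse_file_alloc() now takes the connection so each file can be stamped with a connection-unique poll handle, `ff->kh = ++fc->khctr`, under fc->lock; the handle later keys the file in the fc->polled_files rbtree and is what a FUSE_NOTIFY_POLL message's `kh` refers back to. The allocation idiom itself is just a locked monotonic counter, for example:

#include <pthread.h>
#include <stdint.h>

struct conn {
	pthread_mutex_t lock;
	uint64_t khctr;		/* last handle issued */
};

/* Hand out a connection-unique, never-reused handle. */
static uint64_t new_handle(struct conn *fc)
{
	uint64_t kh;

	pthread_mutex_lock(&fc->lock);
	kh = ++fc->khctr;	/* pre-increment: 0 stays "no handle" */
	pthread_mutex_unlock(&fc->lock);
	return kh;
}
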
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) | |||
79 | { | 84 | { |
80 | dput(req->misc.release.dentry); | 85 | dput(req->misc.release.dentry); |
81 | mntput(req->misc.release.vfsmount); | 86 | mntput(req->misc.release.vfsmount); |
82 | fuse_put_request(fc, req); | ||
83 | } | 87 | } |
84 | 88 | ||
85 | static void fuse_file_put(struct fuse_file *ff) | 89 | static void fuse_file_put(struct fuse_file *ff) |
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff) | |||
89 | struct inode *inode = req->misc.release.dentry->d_inode; | 93 | struct inode *inode = req->misc.release.dentry->d_inode; |
90 | struct fuse_conn *fc = get_fuse_conn(inode); | 94 | struct fuse_conn *fc = get_fuse_conn(inode); |
91 | req->end = fuse_release_end; | 95 | req->end = fuse_release_end; |
92 | request_send_background(fc, req); | 96 | fuse_request_send_background(fc, req); |
93 | kfree(ff); | 97 | kfree(ff); |
94 | } | 98 | } |
95 | } | 99 | } |
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file, | |||
109 | 113 | ||
110 | int fuse_open_common(struct inode *inode, struct file *file, int isdir) | 114 | int fuse_open_common(struct inode *inode, struct file *file, int isdir) |
111 | { | 115 | { |
116 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
112 | struct fuse_open_out outarg; | 117 | struct fuse_open_out outarg; |
113 | struct fuse_file *ff; | 118 | struct fuse_file *ff; |
114 | int err; | 119 | int err; |
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) | |||
121 | if (err) | 126 | if (err) |
122 | return err; | 127 | return err; |
123 | 128 | ||
124 | ff = fuse_file_alloc(); | 129 | ff = fuse_file_alloc(fc); |
125 | if (!ff) | 130 | if (!ff) |
126 | return -ENOMEM; | 131 | return -ENOMEM; |
127 | 132 | ||
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir) | |||
167 | 172 | ||
168 | spin_lock(&fc->lock); | 173 | spin_lock(&fc->lock); |
169 | list_del(&ff->write_entry); | 174 | list_del(&ff->write_entry); |
175 | if (!RB_EMPTY_NODE(&ff->polled_node)) | ||
176 | rb_erase(&ff->polled_node, &fc->polled_files); | ||
170 | spin_unlock(&fc->lock); | 177 | spin_unlock(&fc->lock); |
178 | |||
179 | wake_up_interruptible_sync(&ff->poll_wait); | ||
171 | /* | 180 | /* |
172 | * Normally this will send the RELEASE request, | 181 | * Normally this will send the RELEASE request, |
173 | * however if some asynchronous READ or WRITE requests | 182 | * however if some asynchronous READ or WRITE requests |
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) | |||
280 | req->in.args[0].size = sizeof(inarg); | 289 | req->in.args[0].size = sizeof(inarg); |
281 | req->in.args[0].value = &inarg; | 290 | req->in.args[0].value = &inarg; |
282 | req->force = 1; | 291 | req->force = 1; |
283 | request_send(fc, req); | 292 | fuse_request_send(fc, req); |
284 | err = req->out.h.error; | 293 | err = req->out.h.error; |
285 | fuse_put_request(fc, req); | 294 | fuse_put_request(fc, req); |
286 | if (err == -ENOSYS) { | 295 | if (err == -ENOSYS) { |
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, | |||
344 | req->in.numargs = 1; | 353 | req->in.numargs = 1; |
345 | req->in.args[0].size = sizeof(inarg); | 354 | req->in.args[0].size = sizeof(inarg); |
346 | req->in.args[0].value = &inarg; | 355 | req->in.args[0].value = &inarg; |
347 | request_send(fc, req); | 356 | fuse_request_send(fc, req); |
348 | err = req->out.h.error; | 357 | err = req->out.h.error; |
349 | fuse_put_request(fc, req); | 358 | fuse_put_request(fc, req); |
350 | if (err == -ENOSYS) { | 359 | if (err == -ENOSYS) { |
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, | |||
396 | inarg->read_flags |= FUSE_READ_LOCKOWNER; | 405 | inarg->read_flags |= FUSE_READ_LOCKOWNER; |
397 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); | 406 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); |
398 | } | 407 | } |
399 | request_send(fc, req); | 408 | fuse_request_send(fc, req); |
400 | return req->out.args[0].size; | 409 | return req->out.args[0].size; |
401 | } | 410 | } |
402 | 411 | ||
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) | |||
493 | } | 502 | } |
494 | if (req->ff) | 503 | if (req->ff) |
495 | fuse_file_put(req->ff); | 504 | fuse_file_put(req->ff); |
496 | fuse_put_request(fc, req); | ||
497 | } | 505 | } |
498 | 506 | ||
499 | static void fuse_send_readpages(struct fuse_req *req, struct file *file, | 507 | static void fuse_send_readpages(struct fuse_req *req, struct file *file, |
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, | |||
509 | struct fuse_file *ff = file->private_data; | 517 | struct fuse_file *ff = file->private_data; |
510 | req->ff = fuse_file_get(ff); | 518 | req->ff = fuse_file_get(ff); |
511 | req->end = fuse_readpages_end; | 519 | req->end = fuse_readpages_end; |
512 | request_send_background(fc, req); | 520 | fuse_request_send_background(fc, req); |
513 | } else { | 521 | } else { |
514 | request_send(fc, req); | 522 | fuse_request_send(fc, req); |
515 | fuse_readpages_end(fc, req); | 523 | fuse_readpages_end(fc, req); |
524 | fuse_put_request(fc, req); | ||
516 | } | 525 | } |
517 | } | 526 | } |
518 | 527 | ||
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) | |||
543 | } | 552 | } |
544 | } | 553 | } |
545 | req->pages[req->num_pages] = page; | 554 | req->pages[req->num_pages] = page; |
546 | req->num_pages ++; | 555 | req->num_pages++; |
547 | return 0; | 556 | return 0; |
548 | } | 557 | } |
549 | 558 | ||
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, | |||
636 | inarg->write_flags |= FUSE_WRITE_LOCKOWNER; | 645 | inarg->write_flags |= FUSE_WRITE_LOCKOWNER; |
637 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); | 646 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); |
638 | } | 647 | } |
639 | request_send(fc, req); | 648 | fuse_request_send(fc, req); |
640 | return req->misc.write.out.size; | 649 | return req->misc.write.out.size; |
641 | } | 650 | } |
642 | 651 | ||
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) | |||
1042 | { | 1051 | { |
1043 | __free_page(req->pages[0]); | 1052 | __free_page(req->pages[0]); |
1044 | fuse_file_put(req->ff); | 1053 | fuse_file_put(req->ff); |
1045 | fuse_put_request(fc, req); | ||
1046 | } | 1054 | } |
1047 | 1055 | ||
1048 | static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) | 1056 | static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) |
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) | |||
1060 | 1068 | ||
1061 | /* Called under fc->lock, may release and reacquire it */ | 1069 | /* Called under fc->lock, may release and reacquire it */ |
1062 | static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) | 1070 | static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) |
1071 | __releases(&fc->lock) | ||
1072 | __acquires(&fc->lock) | ||
1063 | { | 1073 | { |
1064 | struct fuse_inode *fi = get_fuse_inode(req->inode); | 1074 | struct fuse_inode *fi = get_fuse_inode(req->inode); |
1065 | loff_t size = i_size_read(req->inode); | 1075 | loff_t size = i_size_read(req->inode); |
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) | |||
1079 | 1089 | ||
1080 | req->in.args[1].size = inarg->size; | 1090 | req->in.args[1].size = inarg->size; |
1081 | fi->writectr++; | 1091 | fi->writectr++; |
1082 | request_send_background_locked(fc, req); | 1092 | fuse_request_send_background_locked(fc, req); |
1083 | return; | 1093 | return; |
1084 | 1094 | ||
1085 | out_free: | 1095 | out_free: |
1086 | fuse_writepage_finish(fc, req); | 1096 | fuse_writepage_finish(fc, req); |
1087 | spin_unlock(&fc->lock); | 1097 | spin_unlock(&fc->lock); |
1088 | fuse_writepage_free(fc, req); | 1098 | fuse_writepage_free(fc, req); |
1099 | fuse_put_request(fc, req); | ||
1089 | spin_lock(&fc->lock); | 1100 | spin_lock(&fc->lock); |
1090 | } | 1101 | } |
1091 | 1102 | ||
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) | |||
1096 | * Called with fc->lock | 1107 | * Called with fc->lock |
1097 | */ | 1108 | */ |
1098 | void fuse_flush_writepages(struct inode *inode) | 1109 | void fuse_flush_writepages(struct inode *inode) |
1110 | __releases(&fc->lock) | ||
1111 | __acquires(&fc->lock) | ||
1099 | { | 1112 | { |
1100 | struct fuse_conn *fc = get_fuse_conn(inode); | 1113 | struct fuse_conn *fc = get_fuse_conn(inode); |
1101 | struct fuse_inode *fi = get_fuse_inode(inode); | 1114 | struct fuse_inode *fi = get_fuse_inode(inode); |
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) | |||
1325 | req->out.numargs = 1; | 1338 | req->out.numargs = 1; |
1326 | req->out.args[0].size = sizeof(outarg); | 1339 | req->out.args[0].size = sizeof(outarg); |
1327 | req->out.args[0].value = &outarg; | 1340 | req->out.args[0].value = &outarg; |
1328 | request_send(fc, req); | 1341 | fuse_request_send(fc, req); |
1329 | err = req->out.h.error; | 1342 | err = req->out.h.error; |
1330 | fuse_put_request(fc, req); | 1343 | fuse_put_request(fc, req); |
1331 | if (!err) | 1344 | if (!err) |
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) | |||
1357 | return PTR_ERR(req); | 1370 | return PTR_ERR(req); |
1358 | 1371 | ||
1359 | fuse_lk_fill(req, file, fl, opcode, pid, flock); | 1372 | fuse_lk_fill(req, file, fl, opcode, pid, flock); |
1360 | request_send(fc, req); | 1373 | fuse_request_send(fc, req); |
1361 | err = req->out.h.error; | 1374 | err = req->out.h.error; |
1362 | /* locking is restartable */ | 1375 | /* locking is restartable */ |
1363 | if (err == -EINTR) | 1376 | if (err == -EINTR) |
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) | |||
1433 | req->out.numargs = 1; | 1446 | req->out.numargs = 1; |
1434 | req->out.args[0].size = sizeof(outarg); | 1447 | req->out.args[0].size = sizeof(outarg); |
1435 | req->out.args[0].value = &outarg; | 1448 | req->out.args[0].value = &outarg; |
1436 | request_send(fc, req); | 1449 | fuse_request_send(fc, req); |
1437 | err = req->out.h.error; | 1450 | err = req->out.h.error; |
1438 | fuse_put_request(fc, req); | 1451 | fuse_put_request(fc, req); |
1439 | if (err == -ENOSYS) | 1452 | if (err == -ENOSYS) |
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) | |||
1470 | return retval; | 1483 | return retval; |
1471 | } | 1484 | } |
1472 | 1485 | ||
1486 | static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, | ||
1487 | unsigned int nr_segs, size_t bytes, bool to_user) | ||
1488 | { | ||
1489 | struct iov_iter ii; | ||
1490 | int page_idx = 0; | ||
1491 | |||
1492 | if (!bytes) | ||
1493 | return 0; | ||
1494 | |||
1495 | iov_iter_init(&ii, iov, nr_segs, bytes, 0); | ||
1496 | |||
1497 | while (iov_iter_count(&ii)) { | ||
1498 | struct page *page = pages[page_idx++]; | ||
1499 | size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); | ||
1500 | void *kaddr, *map; | ||
1501 | |||
1502 | kaddr = map = kmap(page); | ||
1503 | |||
1504 | while (todo) { | ||
1505 | char __user *uaddr = ii.iov->iov_base + ii.iov_offset; | ||
1506 | size_t iov_len = ii.iov->iov_len - ii.iov_offset; | ||
1507 | size_t copy = min(todo, iov_len); | ||
1508 | size_t left; | ||
1509 | |||
1510 | if (!to_user) | ||
1511 | left = copy_from_user(kaddr, uaddr, copy); | ||
1512 | else | ||
1513 | left = copy_to_user(uaddr, kaddr, copy); | ||
1514 | |||
1515 | if (unlikely(left)) | ||
1516 | return -EFAULT; | ||
1517 | |||
1518 | iov_iter_advance(&ii, copy); | ||
1519 | todo -= copy; | ||
1520 | kaddr += copy; | ||
1521 | } | ||
1522 | |||
1523 | kunmap(map); | ||
1524 | } | ||
1525 | |||
1526 | return 0; | ||
1527 | } | ||
1528 | |||
1529 | /* | ||
1530 | * For ioctls, there is no generic way to determine how much memory | ||
1531 | * needs to be read and/or written. Furthermore, ioctls are allowed | ||
1532 | * to dereference the passed pointer, so the parameter requires deep | ||
1533 | * copying but FUSE has no idea whatsoever about what to copy in or | ||
1534 | * out. | ||
1535 | * | ||
1536 | * This is solved by allowing the FUSE server to retry the ioctl with | ||
1537 | * necessary in/out iovecs. Let's assume the ioctl implementation | ||
1538 | * needs to read in the following structure. | ||
1539 | * | ||
1540 | * struct a { | ||
1541 | * char *buf; | ||
1542 | * size_t buflen; | ||
1543 | * } | ||
1544 | * | ||
1545 | * On the first callout to the FUSE server, inarg->in_size and | ||
1546 | * inarg->out_size will be zero; then, the server completes the ioctl | ||
1547 | * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and | ||
1548 | * the actual iov array to | ||
1549 | * | ||
1550 | * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } | ||
1551 | * | ||
1552 | * which tells FUSE to copy in the requested area and retry the ioctl. | ||
1553 | * On the second round, the server has access to the structure and | ||
1554 | * from that it can tell what to look for next, so on this second | ||
1555 | * invocation it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and the iov array to | ||
1556 | * | ||
1557 | * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, | ||
1558 | * { .iov_base = a.buf, .iov_len = a.buflen } } | ||
1559 | * | ||
1560 | * FUSE will copy both struct a and the pointed buffer from the | ||
1561 | * process doing the ioctl and retry ioctl with both struct a and the | ||
1562 | * buffer. | ||
1563 | * | ||
1564 | * This time, FUSE server has everything it needs and completes ioctl | ||
1565 | * without FUSE_IOCTL_RETRY which finishes the ioctl call. | ||
1566 | * | ||
1567 | * Copying data out works the same way. | ||
1568 | * | ||
1569 | * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel | ||
1570 | * automatically initializes in and out iovs by decoding @cmd with | ||
1571 | * _IOC_* macros and the server is not allowed to request RETRY. This | ||
1572 | * limits ioctl data transfers to well-formed ioctls and is the forced | ||
1573 | * behavior for all FUSE servers. | ||
1574 | */ | ||
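For illustration, the server side of the retry exchange for the struct a example above could look roughly like this (a minimal sketch: fuse_ioctl_out and FUSE_IOCTL_RETRY come from <linux/fuse.h>, while reply_ioctl(), first_round and in_data stand in for whatever the server framework provides and are hypothetical):

	struct iovec iov[2];
	struct fuse_ioctl_out out = { .flags = FUSE_IOCTL_RETRY };

	if (first_round) {
		/* round 1: ask the kernel to copy in struct a itself */
		iov[0].iov_base = (void *)(uintptr_t)inarg->arg;
		iov[0].iov_len = sizeof(struct a);
		out.in_iovs = 1;
		reply_ioctl(req, &out, iov, 1);		/* hypothetical reply helper */
	} else {
		/* round 2: struct a has arrived; also ask for the buffer it points to */
		const struct a *a = in_data;
		iov[0].iov_base = (void *)(uintptr_t)inarg->arg;
		iov[0].iov_len = sizeof(*a);
		iov[1].iov_base = a->buf;
		iov[1].iov_len = a->buflen;
		out.in_iovs = 2;
		reply_ioctl(req, &out, iov, 2);
	}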
1575 | static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, | ||
1576 | unsigned long arg, unsigned int flags) | ||
1577 | { | ||
1578 | struct inode *inode = file->f_dentry->d_inode; | ||
1579 | struct fuse_file *ff = file->private_data; | ||
1580 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
1581 | struct fuse_ioctl_in inarg = { | ||
1582 | .fh = ff->fh, | ||
1583 | .cmd = cmd, | ||
1584 | .arg = arg, | ||
1585 | .flags = flags | ||
1586 | }; | ||
1587 | struct fuse_ioctl_out outarg; | ||
1588 | struct fuse_req *req = NULL; | ||
1589 | struct page **pages = NULL; | ||
1590 | struct page *iov_page = NULL; | ||
1591 | struct iovec *in_iov = NULL, *out_iov = NULL; | ||
1592 | unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; | ||
1593 | size_t in_size, out_size, transferred; | ||
1594 | int err; | ||
1595 | |||
1596 | /* assume all the iovs returned by the client always fit in a page */ | ||
1597 | BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); | ||
1598 | |||
1599 | if (!fuse_allow_task(fc, current)) | ||
1600 | return -EACCES; | ||
1601 | |||
1602 | err = -EIO; | ||
1603 | if (is_bad_inode(inode)) | ||
1604 | goto out; | ||
1605 | |||
1606 | err = -ENOMEM; | ||
1607 | pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); | ||
1608 | iov_page = alloc_page(GFP_KERNEL); | ||
1609 | if (!pages || !iov_page) | ||
1610 | goto out; | ||
1611 | |||
1612 | /* | ||
1613 | * If restricted, initialize IO parameters as encoded in @cmd. | ||
1614 | * RETRY from server is not allowed. | ||
1615 | */ | ||
1616 | if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { | ||
1617 | struct iovec *iov = page_address(iov_page); | ||
1618 | |||
1619 | iov->iov_base = (void __user *)arg; | ||
1620 | iov->iov_len = _IOC_SIZE(cmd); | ||
1621 | |||
1622 | if (_IOC_DIR(cmd) & _IOC_WRITE) { | ||
1623 | in_iov = iov; | ||
1624 | in_iovs = 1; | ||
1625 | } | ||
1626 | |||
1627 | if (_IOC_DIR(cmd) & _IOC_READ) { | ||
1628 | out_iov = iov; | ||
1629 | out_iovs = 1; | ||
1630 | } | ||
1631 | } | ||
1632 | |||
1633 | retry: | ||
1634 | inarg.in_size = in_size = iov_length(in_iov, in_iovs); | ||
1635 | inarg.out_size = out_size = iov_length(out_iov, out_iovs); | ||
1636 | |||
1637 | /* | ||
1638 | * Out data can be used either for actual out data or for iovs; | ||
1639 | * make sure there is always at least one page. | ||
1640 | */ | ||
1641 | out_size = max_t(size_t, out_size, PAGE_SIZE); | ||
1642 | max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); | ||
1643 | |||
1644 | /* make sure there are enough buffer pages and init request with them */ | ||
1645 | err = -ENOMEM; | ||
1646 | if (max_pages > FUSE_MAX_PAGES_PER_REQ) | ||
1647 | goto out; | ||
1648 | while (num_pages < max_pages) { | ||
1649 | pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
1650 | if (!pages[num_pages]) | ||
1651 | goto out; | ||
1652 | num_pages++; | ||
1653 | } | ||
1654 | |||
1655 | req = fuse_get_req(fc); | ||
1656 | if (IS_ERR(req)) { | ||
1657 | err = PTR_ERR(req); | ||
1658 | req = NULL; | ||
1659 | goto out; | ||
1660 | } | ||
1661 | memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); | ||
1662 | req->num_pages = num_pages; | ||
1663 | |||
1664 | /* okay, let's send it to the client */ | ||
1665 | req->in.h.opcode = FUSE_IOCTL; | ||
1666 | req->in.h.nodeid = get_node_id(inode); | ||
1667 | req->in.numargs = 1; | ||
1668 | req->in.args[0].size = sizeof(inarg); | ||
1669 | req->in.args[0].value = &inarg; | ||
1670 | if (in_size) { | ||
1671 | req->in.numargs++; | ||
1672 | req->in.args[1].size = in_size; | ||
1673 | req->in.argpages = 1; | ||
1674 | |||
1675 | err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, | ||
1676 | false); | ||
1677 | if (err) | ||
1678 | goto out; | ||
1679 | } | ||
1680 | |||
1681 | req->out.numargs = 2; | ||
1682 | req->out.args[0].size = sizeof(outarg); | ||
1683 | req->out.args[0].value = &outarg; | ||
1684 | req->out.args[1].size = out_size; | ||
1685 | req->out.argpages = 1; | ||
1686 | req->out.argvar = 1; | ||
1687 | |||
1688 | fuse_request_send(fc, req); | ||
1689 | err = req->out.h.error; | ||
1690 | transferred = req->out.args[1].size; | ||
1691 | fuse_put_request(fc, req); | ||
1692 | req = NULL; | ||
1693 | if (err) | ||
1694 | goto out; | ||
1695 | |||
1696 | /* did it ask for retry? */ | ||
1697 | if (outarg.flags & FUSE_IOCTL_RETRY) { | ||
1698 | char *vaddr; | ||
1699 | |||
1700 | /* no retry if in restricted mode */ | ||
1701 | err = -EIO; | ||
1702 | if (!(flags & FUSE_IOCTL_UNRESTRICTED)) | ||
1703 | goto out; | ||
1704 | |||
1705 | in_iovs = outarg.in_iovs; | ||
1706 | out_iovs = outarg.out_iovs; | ||
1707 | |||
1708 | /* | ||
1709 | * Make sure the counts are within bounds; the separate checks | ||
1710 | * protect against overflow. | ||
1711 | */ | ||
1712 | err = -ENOMEM; | ||
1713 | if (in_iovs > FUSE_IOCTL_MAX_IOV || | ||
1714 | out_iovs > FUSE_IOCTL_MAX_IOV || | ||
1715 | in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) | ||
1716 | goto out; | ||
1717 | |||
1718 | err = -EIO; | ||
1719 | if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) | ||
1720 | goto out; | ||
1721 | |||
1722 | /* okay, copy in iovs and retry */ | ||
1723 | vaddr = kmap_atomic(pages[0], KM_USER0); | ||
1724 | memcpy(page_address(iov_page), vaddr, transferred); | ||
1725 | kunmap_atomic(vaddr, KM_USER0); | ||
1726 | |||
1727 | in_iov = page_address(iov_page); | ||
1728 | out_iov = in_iov + in_iovs; | ||
1729 | |||
1730 | goto retry; | ||
1731 | } | ||
1732 | |||
1733 | err = -EIO; | ||
1734 | if (transferred > inarg.out_size) | ||
1735 | goto out; | ||
1736 | |||
1737 | err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); | ||
1738 | out: | ||
1739 | if (req) | ||
1740 | fuse_put_request(fc, req); | ||
1741 | if (iov_page) | ||
1742 | __free_page(iov_page); | ||
1743 | while (num_pages) | ||
1744 | __free_page(pages[--num_pages]); | ||
1745 | kfree(pages); | ||
1746 | |||
1747 | return err ? err : outarg.result; | ||
1748 | } | ||
1749 | |||
1750 | static long fuse_file_ioctl(struct file *file, unsigned int cmd, | ||
1751 | unsigned long arg) | ||
1752 | { | ||
1753 | return fuse_file_do_ioctl(file, cmd, arg, 0); | ||
1754 | } | ||
1755 | |||
1756 | static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, | ||
1757 | unsigned long arg) | ||
1758 | { | ||
1759 | return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); | ||
1760 | } | ||
1761 | |||
1762 | /* | ||
1763 | * All files which have been polled are linked to RB tree | ||
1764 | * fuse_conn->polled_files which is indexed by kh. Walk the tree and | ||
1765 | * find the matching one. | ||
1766 | */ | ||
1767 | static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, | ||
1768 | struct rb_node **parent_out) | ||
1769 | { | ||
1770 | struct rb_node **link = &fc->polled_files.rb_node; | ||
1771 | struct rb_node *last = NULL; | ||
1772 | |||
1773 | while (*link) { | ||
1774 | struct fuse_file *ff; | ||
1775 | |||
1776 | last = *link; | ||
1777 | ff = rb_entry(last, struct fuse_file, polled_node); | ||
1778 | |||
1779 | if (kh < ff->kh) | ||
1780 | link = &last->rb_left; | ||
1781 | else if (kh > ff->kh) | ||
1782 | link = &last->rb_right; | ||
1783 | else | ||
1784 | return link; | ||
1785 | } | ||
1786 | |||
1787 | if (parent_out) | ||
1788 | *parent_out = last; | ||
1789 | return link; | ||
1790 | } | ||
1791 | |||
1792 | /* | ||
1793 | * The file is about to be polled. Make sure it's on the polled_files | ||
1794 | * RB tree. Note that files once added to the polled_files tree are | ||
1795 | * not removed before the file is released. This is because a file | ||
1796 | * polled once is likely to be polled again. | ||
1797 | */ | ||
1798 | static void fuse_register_polled_file(struct fuse_conn *fc, | ||
1799 | struct fuse_file *ff) | ||
1800 | { | ||
1801 | spin_lock(&fc->lock); | ||
1802 | if (RB_EMPTY_NODE(&ff->polled_node)) { | ||
1803 | struct rb_node **link, *parent; | ||
1804 | |||
1805 | link = fuse_find_polled_node(fc, ff->kh, &parent); | ||
1806 | BUG_ON(*link); | ||
1807 | rb_link_node(&ff->polled_node, parent, link); | ||
1808 | rb_insert_color(&ff->polled_node, &fc->polled_files); | ||
1809 | } | ||
1810 | spin_unlock(&fc->lock); | ||
1811 | } | ||
1812 | |||
1813 | static unsigned fuse_file_poll(struct file *file, poll_table *wait) | ||
1814 | { | ||
1815 | struct inode *inode = file->f_dentry->d_inode; | ||
1816 | struct fuse_file *ff = file->private_data; | ||
1817 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
1818 | struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; | ||
1819 | struct fuse_poll_out outarg; | ||
1820 | struct fuse_req *req; | ||
1821 | int err; | ||
1822 | |||
1823 | if (fc->no_poll) | ||
1824 | return DEFAULT_POLLMASK; | ||
1825 | |||
1826 | poll_wait(file, &ff->poll_wait, wait); | ||
1827 | |||
1828 | /* | ||
1829 | * Ask for notification iff there's someone waiting for it. | ||
1830 | * The client may ignore the flag and always notify. | ||
1831 | */ | ||
1832 | if (waitqueue_active(&ff->poll_wait)) { | ||
1833 | inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; | ||
1834 | fuse_register_polled_file(fc, ff); | ||
1835 | } | ||
1836 | |||
1837 | req = fuse_get_req(fc); | ||
1838 | if (IS_ERR(req)) | ||
1839 | return PTR_ERR(req); | ||
1840 | |||
1841 | req->in.h.opcode = FUSE_POLL; | ||
1842 | req->in.h.nodeid = get_node_id(inode); | ||
1843 | req->in.numargs = 1; | ||
1844 | req->in.args[0].size = sizeof(inarg); | ||
1845 | req->in.args[0].value = &inarg; | ||
1846 | req->out.numargs = 1; | ||
1847 | req->out.args[0].size = sizeof(outarg); | ||
1848 | req->out.args[0].value = &outarg; | ||
1849 | fuse_request_send(fc, req); | ||
1850 | err = req->out.h.error; | ||
1851 | fuse_put_request(fc, req); | ||
1852 | |||
1853 | if (!err) | ||
1854 | return outarg.revents; | ||
1855 | if (err == -ENOSYS) { | ||
1856 | fc->no_poll = 1; | ||
1857 | return DEFAULT_POLLMASK; | ||
1858 | } | ||
1859 | return POLLERR; | ||
1860 | } | ||
1861 | |||
1862 | /* | ||
1863 | * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and | ||
1864 | * wakes up the poll waiters. | ||
1865 | */ | ||
1866 | int fuse_notify_poll_wakeup(struct fuse_conn *fc, | ||
1867 | struct fuse_notify_poll_wakeup_out *outarg) | ||
1868 | { | ||
1869 | u64 kh = outarg->kh; | ||
1870 | struct rb_node **link; | ||
1871 | |||
1872 | spin_lock(&fc->lock); | ||
1873 | |||
1874 | link = fuse_find_polled_node(fc, kh, NULL); | ||
1875 | if (*link) { | ||
1876 | struct fuse_file *ff; | ||
1877 | |||
1878 | ff = rb_entry(*link, struct fuse_file, polled_node); | ||
1879 | wake_up_interruptible_sync(&ff->poll_wait); | ||
1880 | } | ||
1881 | |||
1882 | spin_unlock(&fc->lock); | ||
1883 | return 0; | ||
1884 | } | ||
1885 | |||
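The userspace half of this poll protocol, sketched against the libfuse low-level API that accompanied this feature (fuse_reply_poll() and fuse_lowlevel_notify_poll() are real libfuse entry points; save_poll_handle(), take_poll_handle() and compute_revents() are hypothetical helpers):

	static void xmp_poll(fuse_req_t req, fuse_ino_t ino,
			     struct fuse_file_info *fi,
			     struct fuse_pollhandle *ph)
	{
		/* ph is only passed when FUSE_POLL_SCHEDULE_NOTIFY is set */
		if (ph)
			save_poll_handle(ino, ph);
		fuse_reply_poll(req, compute_revents(ino));
	}

	/* later, when the file becomes ready: */
	static void file_became_ready(fuse_ino_t ino)
	{
		struct fuse_pollhandle *ph = take_poll_handle(ino);
		if (ph)
			fuse_lowlevel_notify_poll(ph);	/* -> FUSE_NOTIFY_POLL */
	}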
1473 | static const struct file_operations fuse_file_operations = { | 1886 | static const struct file_operations fuse_file_operations = { |
1474 | .llseek = fuse_file_llseek, | 1887 | .llseek = fuse_file_llseek, |
1475 | .read = do_sync_read, | 1888 | .read = do_sync_read, |
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = { | |||
1484 | .lock = fuse_file_lock, | 1897 | .lock = fuse_file_lock, |
1485 | .flock = fuse_file_flock, | 1898 | .flock = fuse_file_flock, |
1486 | .splice_read = generic_file_splice_read, | 1899 | .splice_read = generic_file_splice_read, |
1900 | .unlocked_ioctl = fuse_file_ioctl, | ||
1901 | .compat_ioctl = fuse_file_compat_ioctl, | ||
1902 | .poll = fuse_file_poll, | ||
1487 | }; | 1903 | }; |
1488 | 1904 | ||
1489 | static const struct file_operations fuse_direct_io_file_operations = { | 1905 | static const struct file_operations fuse_direct_io_file_operations = { |
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = { | |||
1496 | .fsync = fuse_fsync, | 1912 | .fsync = fuse_fsync, |
1497 | .lock = fuse_file_lock, | 1913 | .lock = fuse_file_lock, |
1498 | .flock = fuse_file_flock, | 1914 | .flock = fuse_file_flock, |
1915 | .unlocked_ioctl = fuse_file_ioctl, | ||
1916 | .compat_ioctl = fuse_file_compat_ioctl, | ||
1917 | .poll = fuse_file_poll, | ||
1499 | /* no mmap and splice_read */ | 1918 | /* no mmap and splice_read */ |
1500 | }; | 1919 | }; |
1501 | 1920 | ||
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 35accfdd747f..5e64b815a5a1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -19,6 +19,8 @@ | |||
19 | #include <linux/backing-dev.h> | 19 | #include <linux/backing-dev.h> |
20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
21 | #include <linux/rwsem.h> | 21 | #include <linux/rwsem.h> |
22 | #include <linux/rbtree.h> | ||
23 | #include <linux/poll.h> | ||
22 | 24 | ||
23 | /** Max number of pages that can be used in a single read request */ | 25 | /** Max number of pages that can be used in a single read request */ |
24 | #define FUSE_MAX_PAGES_PER_REQ 32 | 26 | #define FUSE_MAX_PAGES_PER_REQ 32 |
@@ -100,6 +102,9 @@ struct fuse_file { | |||
100 | /** Request reserved for flush and release */ | 102 | /** Request reserved for flush and release */ |
101 | struct fuse_req *reserved_req; | 103 | struct fuse_req *reserved_req; |
102 | 104 | ||
105 | /** Kernel file handle guaranteed to be unique */ | ||
106 | u64 kh; | ||
107 | |||
103 | /** File handle used by userspace */ | 108 | /** File handle used by userspace */ |
104 | u64 fh; | 109 | u64 fh; |
105 | 110 | ||
@@ -108,6 +113,12 @@ struct fuse_file { | |||
108 | 113 | ||
109 | /** Entry on inode's write_files list */ | 114 | /** Entry on inode's write_files list */ |
110 | struct list_head write_entry; | 115 | struct list_head write_entry; |
116 | |||
117 | /** RB node to be linked on fuse_conn->polled_files */ | ||
118 | struct rb_node polled_node; | ||
119 | |||
120 | /** Wait queue head for poll */ | ||
121 | wait_queue_head_t poll_wait; | ||
111 | }; | 122 | }; |
112 | 123 | ||
113 | /** One input argument of a request */ | 124 | /** One input argument of a request */ |
@@ -322,6 +333,12 @@ struct fuse_conn { | |||
322 | /** The list of requests under I/O */ | 333 | /** The list of requests under I/O */ |
323 | struct list_head io; | 334 | struct list_head io; |
324 | 335 | ||
336 | /** The next unique kernel file handle */ | ||
337 | u64 khctr; | ||
338 | |||
339 | /** rbtree of fuse_files waiting for poll events indexed by kh */ | ||
340 | struct rb_root polled_files; | ||
341 | |||
325 | /** Number of requests currently in the background */ | 342 | /** Number of requests currently in the background */ |
326 | unsigned num_background; | 343 | unsigned num_background; |
327 | 344 | ||
@@ -355,19 +372,19 @@ struct fuse_conn { | |||
355 | /** Connection failed (version mismatch). Cannot race with | 372 | /** Connection failed (version mismatch). Cannot race with |
356 | setting other bitfields since it is only set once in INIT | 373 | setting other bitfields since it is only set once in INIT |
357 | reply, before any other request, and never cleared */ | 374 | reply, before any other request, and never cleared */ |
358 | unsigned conn_error : 1; | 375 | unsigned conn_error:1; |
359 | 376 | ||
360 | /** Connection successful. Only set in INIT */ | 377 | /** Connection successful. Only set in INIT */ |
361 | unsigned conn_init : 1; | 378 | unsigned conn_init:1; |
362 | 379 | ||
363 | /** Do readpages asynchronously? Only set in INIT */ | 380 | /** Do readpages asynchronously? Only set in INIT */ |
364 | unsigned async_read : 1; | 381 | unsigned async_read:1; |
365 | 382 | ||
366 | /** Do not send separate SETATTR request before open(O_TRUNC) */ | 383 | /** Do not send separate SETATTR request before open(O_TRUNC) */ |
367 | unsigned atomic_o_trunc : 1; | 384 | unsigned atomic_o_trunc:1; |
368 | 385 | ||
369 | /** Filesystem supports NFS exporting. Only set in INIT */ | 386 | /** Filesystem supports NFS exporting. Only set in INIT */ |
370 | unsigned export_support : 1; | 387 | unsigned export_support:1; |
371 | 388 | ||
372 | /* | 389 | /* |
373 | * The following bitfields are only for optimization purposes | 390 | * The following bitfields are only for optimization purposes |
@@ -375,43 +392,46 @@ struct fuse_conn { | |||
375 | */ | 392 | */ |
376 | 393 | ||
377 | /** Is fsync not implemented by fs? */ | 394 | /** Is fsync not implemented by fs? */ |
378 | unsigned no_fsync : 1; | 395 | unsigned no_fsync:1; |
379 | 396 | ||
380 | /** Is fsyncdir not implemented by fs? */ | 397 | /** Is fsyncdir not implemented by fs? */ |
381 | unsigned no_fsyncdir : 1; | 398 | unsigned no_fsyncdir:1; |
382 | 399 | ||
383 | /** Is flush not implemented by fs? */ | 400 | /** Is flush not implemented by fs? */ |
384 | unsigned no_flush : 1; | 401 | unsigned no_flush:1; |
385 | 402 | ||
386 | /** Is setxattr not implemented by fs? */ | 403 | /** Is setxattr not implemented by fs? */ |
387 | unsigned no_setxattr : 1; | 404 | unsigned no_setxattr:1; |
388 | 405 | ||
389 | /** Is getxattr not implemented by fs? */ | 406 | /** Is getxattr not implemented by fs? */ |
390 | unsigned no_getxattr : 1; | 407 | unsigned no_getxattr:1; |
391 | 408 | ||
392 | /** Is listxattr not implemented by fs? */ | 409 | /** Is listxattr not implemented by fs? */ |
393 | unsigned no_listxattr : 1; | 410 | unsigned no_listxattr:1; |
394 | 411 | ||
395 | /** Is removexattr not implemented by fs? */ | 412 | /** Is removexattr not implemented by fs? */ |
396 | unsigned no_removexattr : 1; | 413 | unsigned no_removexattr:1; |
397 | 414 | ||
398 | /** Are file locking primitives not implemented by fs? */ | 415 | /** Are file locking primitives not implemented by fs? */ |
399 | unsigned no_lock : 1; | 416 | unsigned no_lock:1; |
400 | 417 | ||
401 | /** Is access not implemented by fs? */ | 418 | /** Is access not implemented by fs? */ |
402 | unsigned no_access : 1; | 419 | unsigned no_access:1; |
403 | 420 | ||
404 | /** Is create not implemented by fs? */ | 421 | /** Is create not implemented by fs? */ |
405 | unsigned no_create : 1; | 422 | unsigned no_create:1; |
406 | 423 | ||
407 | /** Is interrupt not implemented by fs? */ | 424 | /** Is interrupt not implemented by fs? */ |
408 | unsigned no_interrupt : 1; | 425 | unsigned no_interrupt:1; |
409 | 426 | ||
410 | /** Is bmap not implemented by fs? */ | 427 | /** Is bmap not implemented by fs? */ |
411 | unsigned no_bmap : 1; | 428 | unsigned no_bmap:1; |
429 | |||
430 | /** Is poll not implemented by fs? */ | ||
431 | unsigned no_poll:1; | ||
412 | 432 | ||
413 | /** Do multi-page cached writes */ | 433 | /** Do multi-page cached writes */ |
414 | unsigned big_writes : 1; | 434 | unsigned big_writes:1; |
415 | 435 | ||
416 | /** The number of requests waiting for completion */ | 436 | /** The number of requests waiting for completion */ |
417 | atomic_t num_waiting; | 437 | atomic_t num_waiting; |
@@ -445,6 +465,9 @@ struct fuse_conn { | |||
445 | 465 | ||
446 | /** Version counter for attribute changes */ | 466 | /** Version counter for attribute changes */ |
447 | u64 attr_version; | 467 | u64 attr_version; |
468 | |||
469 | /** Called on final put */ | ||
470 | void (*release)(struct fuse_conn *); | ||
448 | }; | 471 | }; |
449 | 472 | ||
450 | static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) | 473 | static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) |
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, | |||
499 | */ | 522 | */ |
500 | int fuse_open_common(struct inode *inode, struct file *file, int isdir); | 523 | int fuse_open_common(struct inode *inode, struct file *file, int isdir); |
501 | 524 | ||
502 | struct fuse_file *fuse_file_alloc(void); | 525 | struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); |
503 | void fuse_file_free(struct fuse_file *ff); | 526 | void fuse_file_free(struct fuse_file *ff); |
504 | void fuse_finish_open(struct inode *inode, struct file *file, | 527 | void fuse_finish_open(struct inode *inode, struct file *file, |
505 | struct fuse_file *ff, struct fuse_open_out *outarg); | 528 | struct fuse_file *ff, struct fuse_open_out *outarg); |
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, | |||
519 | int isdir); | 542 | int isdir); |
520 | 543 | ||
521 | /** | 544 | /** |
545 | * Notify poll wakeup | ||
546 | */ | ||
547 | int fuse_notify_poll_wakeup(struct fuse_conn *fc, | ||
548 | struct fuse_notify_poll_wakeup_out *outarg); | ||
549 | |||
550 | /** | ||
522 | * Initialize file operations on a regular file | 551 | * Initialize file operations on a regular file |
523 | */ | 552 | */ |
524 | void fuse_init_file_inode(struct inode *inode); | 553 | void fuse_init_file_inode(struct inode *inode); |
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); | |||
593 | /** | 622 | /** |
594 | * Send a request (synchronous) | 623 | * Send a request (synchronous) |
595 | */ | 624 | */ |
596 | void request_send(struct fuse_conn *fc, struct fuse_req *req); | 625 | void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); |
597 | 626 | ||
598 | /** | 627 | /** |
599 | * Send a request with no reply | 628 | * Send a request with no reply |
600 | */ | 629 | */ |
601 | void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); | 630 | void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); |
602 | 631 | ||
603 | /** | 632 | /** |
604 | * Send a request in the background | 633 | * Send a request in the background |
605 | */ | 634 | */ |
606 | void request_send_background(struct fuse_conn *fc, struct fuse_req *req); | 635 | void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); |
607 | 636 | ||
608 | void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); | 637 | void fuse_request_send_background_locked(struct fuse_conn *fc, |
638 | struct fuse_req *req); | ||
609 | 639 | ||
610 | /* Abort all requests */ | 640 | /* Abort all requests */ |
611 | void fuse_abort_conn(struct fuse_conn *fc); | 641 | void fuse_abort_conn(struct fuse_conn *fc); |
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry); | |||
623 | struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); | 653 | struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); |
624 | 654 | ||
625 | /** | 655 | /** |
656 | * Initialize fuse_conn | ||
657 | */ | ||
658 | int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); | ||
659 | |||
660 | /** | ||
626 | * Release reference to fuse_conn | 661 | * Release reference to fuse_conn |
627 | */ | 662 | */ |
628 | void fuse_conn_put(struct fuse_conn *fc); | 663 | void fuse_conn_put(struct fuse_conn *fc); |
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2e99f34b4435..47c96fdca1ac 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -37,10 +37,10 @@ struct fuse_mount_data { | |||
37 | unsigned rootmode; | 37 | unsigned rootmode; |
38 | unsigned user_id; | 38 | unsigned user_id; |
39 | unsigned group_id; | 39 | unsigned group_id; |
40 | unsigned fd_present : 1; | 40 | unsigned fd_present:1; |
41 | unsigned rootmode_present : 1; | 41 | unsigned rootmode_present:1; |
42 | unsigned user_id_present : 1; | 42 | unsigned user_id_present:1; |
43 | unsigned group_id_present : 1; | 43 | unsigned group_id_present:1; |
44 | unsigned flags; | 44 | unsigned flags; |
45 | unsigned max_read; | 45 | unsigned max_read; |
46 | unsigned blksize; | 46 | unsigned blksize; |
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, | |||
94 | req->in.numargs = 1; | 94 | req->in.numargs = 1; |
95 | req->in.args[0].size = sizeof(struct fuse_forget_in); | 95 | req->in.args[0].size = sizeof(struct fuse_forget_in); |
96 | req->in.args[0].value = inarg; | 96 | req->in.args[0].value = inarg; |
97 | request_send_noreply(fc, req); | 97 | fuse_request_send_noreply(fc, req); |
98 | } | 98 | } |
99 | 99 | ||
100 | static void fuse_clear_inode(struct inode *inode) | 100 | static void fuse_clear_inode(struct inode *inode) |
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, | |||
250 | 250 | ||
251 | fi = get_fuse_inode(inode); | 251 | fi = get_fuse_inode(inode); |
252 | spin_lock(&fc->lock); | 252 | spin_lock(&fc->lock); |
253 | fi->nlookup ++; | 253 | fi->nlookup++; |
254 | spin_unlock(&fc->lock); | 254 | spin_unlock(&fc->lock); |
255 | fuse_change_attributes(inode, attr, attr_valid, attr_version); | 255 | fuse_change_attributes(inode, attr, attr_valid, attr_version); |
256 | 256 | ||
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc) | |||
269 | fc->destroy_req = NULL; | 269 | fc->destroy_req = NULL; |
270 | req->in.h.opcode = FUSE_DESTROY; | 270 | req->in.h.opcode = FUSE_DESTROY; |
271 | req->force = 1; | 271 | req->force = 1; |
272 | request_send(fc, req); | 272 | fuse_request_send(fc, req); |
273 | fuse_put_request(fc, req); | 273 | fuse_put_request(fc, req); |
274 | } | 274 | } |
275 | } | 275 | } |
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
334 | req->out.args[0].size = | 334 | req->out.args[0].size = |
335 | fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); | 335 | fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); |
336 | req->out.args[0].value = &outarg; | 336 | req->out.args[0].value = &outarg; |
337 | request_send(fc, req); | 337 | fuse_request_send(fc, req); |
338 | err = req->out.h.error; | 338 | err = req->out.h.error; |
339 | if (!err) | 339 | if (!err) |
340 | convert_fuse_statfs(buf, &outarg.st); | 340 | convert_fuse_statfs(buf, &outarg.st); |
@@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
462 | return 0; | 462 | return 0; |
463 | } | 463 | } |
464 | 464 | ||
465 | static struct fuse_conn *new_conn(struct super_block *sb) | 465 | int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) |
466 | { | 466 | { |
467 | struct fuse_conn *fc; | ||
468 | int err; | 467 | int err; |
469 | 468 | ||
470 | fc = kzalloc(sizeof(*fc), GFP_KERNEL); | 469 | memset(fc, 0, sizeof(*fc)); |
471 | if (fc) { | 470 | spin_lock_init(&fc->lock); |
472 | spin_lock_init(&fc->lock); | 471 | mutex_init(&fc->inst_mutex); |
473 | mutex_init(&fc->inst_mutex); | 472 | atomic_set(&fc->count, 1); |
474 | atomic_set(&fc->count, 1); | 473 | init_waitqueue_head(&fc->waitq); |
475 | init_waitqueue_head(&fc->waitq); | 474 | init_waitqueue_head(&fc->blocked_waitq); |
476 | init_waitqueue_head(&fc->blocked_waitq); | 475 | init_waitqueue_head(&fc->reserved_req_waitq); |
477 | init_waitqueue_head(&fc->reserved_req_waitq); | 476 | INIT_LIST_HEAD(&fc->pending); |
478 | INIT_LIST_HEAD(&fc->pending); | 477 | INIT_LIST_HEAD(&fc->processing); |
479 | INIT_LIST_HEAD(&fc->processing); | 478 | INIT_LIST_HEAD(&fc->io); |
480 | INIT_LIST_HEAD(&fc->io); | 479 | INIT_LIST_HEAD(&fc->interrupts); |
481 | INIT_LIST_HEAD(&fc->interrupts); | 480 | INIT_LIST_HEAD(&fc->bg_queue); |
482 | INIT_LIST_HEAD(&fc->bg_queue); | 481 | INIT_LIST_HEAD(&fc->entry); |
483 | atomic_set(&fc->num_waiting, 0); | 482 | atomic_set(&fc->num_waiting, 0); |
484 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 483 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
485 | fc->bdi.unplug_io_fn = default_unplug_io_fn; | 484 | fc->bdi.unplug_io_fn = default_unplug_io_fn; |
486 | /* fuse does its own writeback accounting */ | 485 | /* fuse does its own writeback accounting */ |
487 | fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; | 486 | fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; |
488 | fc->dev = sb->s_dev; | 487 | fc->khctr = 0; |
489 | err = bdi_init(&fc->bdi); | 488 | fc->polled_files = RB_ROOT; |
490 | if (err) | 489 | fc->dev = sb->s_dev; |
491 | goto error_kfree; | 490 | err = bdi_init(&fc->bdi); |
492 | if (sb->s_bdev) { | 491 | if (err) |
493 | err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", | 492 | goto error_mutex_destroy; |
494 | MAJOR(fc->dev), MINOR(fc->dev)); | 493 | if (sb->s_bdev) { |
495 | } else { | 494 | err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", |
496 | err = bdi_register_dev(&fc->bdi, fc->dev); | 495 | MAJOR(fc->dev), MINOR(fc->dev)); |
497 | } | 496 | } else { |
498 | if (err) | 497 | err = bdi_register_dev(&fc->bdi, fc->dev); |
499 | goto error_bdi_destroy; | ||
500 | /* | ||
501 | * For a single fuse filesystem use max 1% of dirty + | ||
502 | * writeback threshold. | ||
503 | * | ||
504 | * This gives about 1M of write buffer for memory maps on a | ||
505 | * machine with 1G and 10% dirty_ratio, which should be more | ||
506 | * than enough. | ||
507 | * | ||
508 | * Privileged users can raise it by writing to | ||
509 | * | ||
510 | * /sys/class/bdi/<bdi>/max_ratio | ||
511 | */ | ||
512 | bdi_set_max_ratio(&fc->bdi, 1); | ||
513 | fc->reqctr = 0; | ||
514 | fc->blocked = 1; | ||
515 | fc->attr_version = 1; | ||
516 | get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); | ||
517 | } | 498 | } |
518 | return fc; | 499 | if (err) |
500 | goto error_bdi_destroy; | ||
501 | /* | ||
502 | * For a single fuse filesystem use max 1% of dirty + | ||
503 | * writeback threshold. | ||
504 | * | ||
505 | * This gives about 1M of write buffer for memory maps on a | ||
506 | * machine with 1G and 10% dirty_ratio, which should be more | ||
507 | * than enough. | ||
508 | * | ||
509 | * Privileged users can raise it by writing to | ||
510 | * | ||
511 | * /sys/class/bdi/<bdi>/max_ratio | ||
512 | */ | ||
513 | bdi_set_max_ratio(&fc->bdi, 1); | ||
514 | fc->reqctr = 0; | ||
515 | fc->blocked = 1; | ||
516 | fc->attr_version = 1; | ||
517 | get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); | ||
519 | 518 | ||
520 | error_bdi_destroy: | 519 | return 0; |
520 | |||
521 | error_bdi_destroy: | ||
521 | bdi_destroy(&fc->bdi); | 522 | bdi_destroy(&fc->bdi); |
522 | error_kfree: | 523 | error_mutex_destroy: |
523 | mutex_destroy(&fc->inst_mutex); | 524 | mutex_destroy(&fc->inst_mutex); |
524 | kfree(fc); | 525 | return err; |
525 | return NULL; | ||
526 | } | 526 | } |
527 | EXPORT_SYMBOL_GPL(fuse_conn_init); | ||
527 | 528 | ||
528 | void fuse_conn_put(struct fuse_conn *fc) | 529 | void fuse_conn_put(struct fuse_conn *fc) |
529 | { | 530 | { |
@@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc) | |||
532 | fuse_request_free(fc->destroy_req); | 533 | fuse_request_free(fc->destroy_req); |
533 | mutex_destroy(&fc->inst_mutex); | 534 | mutex_destroy(&fc->inst_mutex); |
534 | bdi_destroy(&fc->bdi); | 535 | bdi_destroy(&fc->bdi); |
535 | kfree(fc); | 536 | fc->release(fc); |
536 | } | 537 | } |
537 | } | 538 | } |
538 | 539 | ||
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) | |||
542 | return fc; | 543 | return fc; |
543 | } | 544 | } |
544 | 545 | ||
545 | static struct inode *get_root_inode(struct super_block *sb, unsigned mode) | 546 | static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) |
546 | { | 547 | { |
547 | struct fuse_attr attr; | 548 | struct fuse_attr attr; |
548 | memset(&attr, 0, sizeof(attr)); | 549 | memset(&attr, 0, sizeof(attr)); |
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) | |||
553 | return fuse_iget(sb, 1, 0, &attr, 0, 0); | 554 | return fuse_iget(sb, 1, 0, &attr, 0, 0); |
554 | } | 555 | } |
555 | 556 | ||
556 | struct fuse_inode_handle | 557 | struct fuse_inode_handle { |
557 | { | ||
558 | u64 nodeid; | 558 | u64 nodeid; |
559 | u32 generation; | 559 | u32 generation; |
560 | }; | 560 | }; |
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
761 | fc->max_write = max_t(unsigned, 4096, fc->max_write); | 761 | fc->max_write = max_t(unsigned, 4096, fc->max_write); |
762 | fc->conn_init = 1; | 762 | fc->conn_init = 1; |
763 | } | 763 | } |
764 | fuse_put_request(fc, req); | ||
765 | fc->blocked = 0; | 764 | fc->blocked = 0; |
766 | wake_up_all(&fc->blocked_waitq); | 765 | wake_up_all(&fc->blocked_waitq); |
767 | } | 766 | } |
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) | |||
787 | req->out.args[0].size = sizeof(struct fuse_init_out); | 786 | req->out.args[0].size = sizeof(struct fuse_init_out); |
788 | req->out.args[0].value = &req->misc.init_out; | 787 | req->out.args[0].value = &req->misc.init_out; |
789 | req->end = process_init_reply; | 788 | req->end = process_init_reply; |
790 | request_send_background(fc, req); | 789 | fuse_request_send_background(fc, req); |
790 | } | ||
791 | |||
792 | static void fuse_free_conn(struct fuse_conn *fc) | ||
793 | { | ||
794 | kfree(fc); | ||
791 | } | 795 | } |
792 | 796 | ||
793 | static int fuse_fill_super(struct super_block *sb, void *data, int silent) | 797 | static int fuse_fill_super(struct super_block *sb, void *data, int silent) |
@@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
828 | if (file->f_op != &fuse_dev_operations) | 832 | if (file->f_op != &fuse_dev_operations) |
829 | return -EINVAL; | 833 | return -EINVAL; |
830 | 834 | ||
831 | fc = new_conn(sb); | 835 | fc = kmalloc(sizeof(*fc), GFP_KERNEL); |
832 | if (!fc) | 836 | if (!fc) |
833 | return -ENOMEM; | 837 | return -ENOMEM; |
834 | 838 | ||
839 | err = fuse_conn_init(fc, sb); | ||
840 | if (err) { | ||
841 | kfree(fc); | ||
842 | return err; | ||
843 | } | ||
844 | |||
845 | fc->release = fuse_free_conn; | ||
835 | fc->flags = d.flags; | 846 | fc->flags = d.flags; |
836 | fc->user_id = d.user_id; | 847 | fc->user_id = d.user_id; |
837 | fc->group_id = d.group_id; | 848 | fc->group_id = d.group_id; |
@@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
841 | sb->s_fs_info = fc; | 852 | sb->s_fs_info = fc; |
842 | 853 | ||
843 | err = -ENOMEM; | 854 | err = -ENOMEM; |
844 | root = get_root_inode(sb, d.rootmode); | 855 | root = fuse_get_root_inode(sb, d.rootmode); |
845 | if (!root) | 856 | if (!root) |
846 | goto err; | 857 | goto err; |
847 | 858 | ||
@@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void) | |||
952 | 963 | ||
953 | static void fuse_inode_init_once(void *foo) | 964 | static void fuse_inode_init_once(void *foo) |
954 | { | 965 | { |
955 | struct inode * inode = foo; | 966 | struct inode *inode = foo; |
956 | 967 | ||
957 | inode_init_once(inode); | 968 | inode_init_once(inode); |
958 | } | 969 | } |
@@ -1031,7 +1042,7 @@ static int __init fuse_init(void) | |||
1031 | { | 1042 | { |
1032 | int res; | 1043 | int res; |
1033 | 1044 | ||
1034 | printk("fuse init (API version %i.%i)\n", | 1045 | printk(KERN_INFO "fuse init (API version %i.%i)\n", |
1035 | FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); | 1046 | FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); |
1036 | 1047 | ||
1037 | INIT_LIST_HEAD(&fuse_conn_list); | 1048 | INIT_LIST_HEAD(&fuse_conn_list); |
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index ab2f57e3fb87..e563a6449811 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | config GFS2_FS | 1 | config GFS2_FS |
2 | tristate "GFS2 file system support" | 2 | tristate "GFS2 file system support" |
3 | depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) | 3 | depends on EXPERIMENTAL && (64BIT || LBD) |
4 | select FS_POSIX_ACL | 4 | select FS_POSIX_ACL |
5 | select CRC32 | 5 | select CRC32 |
6 | help | 6 | help |
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 6e4ea36c6605..4ddab67867eb 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c | |||
@@ -675,6 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
675 | goto out_trans_fail; | 675 | goto out_trans_fail; |
676 | 676 | ||
677 | error = -ENOMEM; | 677 | error = -ENOMEM; |
678 | flags |= AOP_FLAG_NOFS; | ||
678 | page = grab_cache_page_write_begin(mapping, index, flags); | 679 | page = grab_cache_page_write_begin(mapping, index, flags); |
679 | *pagep = page; | 680 | *pagep = page; |
680 | if (unlikely(!page)) | 681 | if (unlikely(!page)) |
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 289c5f54ba53..93fe41b67f97 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c | |||
@@ -342,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | |||
342 | struct gfs2_inode *ip = GFS2_I(inode); | 342 | struct gfs2_inode *ip = GFS2_I(inode); |
343 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 343 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
344 | unsigned long last_index; | 344 | unsigned long last_index; |
345 | u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); | 345 | u64 pos = page->index << PAGE_CACHE_SHIFT; |
346 | unsigned int data_blocks, ind_blocks, rblocks; | 346 | unsigned int data_blocks, ind_blocks, rblocks; |
347 | int alloc_required = 0; | 347 | int alloc_required = 0; |
348 | struct gfs2_holder gh; | 348 | struct gfs2_holder gh; |
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 777783deddcb..320323d03479 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c | |||
@@ -211,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) | |||
211 | } | 211 | } |
212 | 212 | ||
213 | /** | 213 | /** |
214 | * gfs2_write_super_lockfs - prevent further writes to the filesystem | 214 | * gfs2_freeze - prevent further writes to the filesystem |
215 | * @sb: the VFS structure for the filesystem | 215 | * @sb: the VFS structure for the filesystem |
216 | * | 216 | * |
217 | */ | 217 | */ |
218 | 218 | ||
219 | static void gfs2_write_super_lockfs(struct super_block *sb) | 219 | static int gfs2_freeze(struct super_block *sb) |
220 | { | 220 | { |
221 | struct gfs2_sbd *sdp = sb->s_fs_info; | 221 | struct gfs2_sbd *sdp = sb->s_fs_info; |
222 | int error; | 222 | int error; |
223 | 223 | ||
224 | if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | 224 | if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) |
225 | return; | 225 | return -EINVAL; |
226 | 226 | ||
227 | for (;;) { | 227 | for (;;) { |
228 | error = gfs2_freeze_fs(sdp); | 228 | error = gfs2_freeze_fs(sdp); |
@@ -242,17 +242,19 @@ static void gfs2_write_super_lockfs(struct super_block *sb) | |||
242 | fs_err(sdp, "retrying...\n"); | 242 | fs_err(sdp, "retrying...\n"); |
243 | msleep(1000); | 243 | msleep(1000); |
244 | } | 244 | } |
245 | return 0; | ||
245 | } | 246 | } |
246 | 247 | ||
247 | /** | 248 | /** |
248 | * gfs2_unlockfs - reallow writes to the filesystem | 249 | * gfs2_unfreeze - reallow writes to the filesystem |
249 | * @sb: the VFS structure for the filesystem | 250 | * @sb: the VFS structure for the filesystem |
250 | * | 251 | * |
251 | */ | 252 | */ |
252 | 253 | ||
253 | static void gfs2_unlockfs(struct super_block *sb) | 254 | static int gfs2_unfreeze(struct super_block *sb) |
254 | { | 255 | { |
255 | gfs2_unfreeze_fs(sb->s_fs_info); | 256 | gfs2_unfreeze_fs(sb->s_fs_info); |
257 | return 0; | ||
256 | } | 258 | } |
257 | 259 | ||
258 | /** | 260 | /** |
@@ -688,8 +690,8 @@ const struct super_operations gfs2_super_ops = { | |||
688 | .put_super = gfs2_put_super, | 690 | .put_super = gfs2_put_super, |
689 | .write_super = gfs2_write_super, | 691 | .write_super = gfs2_write_super, |
690 | .sync_fs = gfs2_sync_fs, | 692 | .sync_fs = gfs2_sync_fs, |
691 | .write_super_lockfs = gfs2_write_super_lockfs, | 693 | .freeze_fs = gfs2_freeze, |
692 | .unlockfs = gfs2_unlockfs, | 694 | .unfreeze_fs = gfs2_unfreeze, |
693 | .statfs = gfs2_statfs, | 695 | .statfs = gfs2_statfs, |
694 | .remount_fs = gfs2_remount_fs, | 696 | .remount_fs = gfs2_remount_fs, |
695 | .clear_inode = gfs2_clear_inode, | 697 | .clear_inode = gfs2_clear_inode, |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0ab0c6f5f438..6903d37af037 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
252 | for (;;) { | 252 | for (;;) { |
253 | struct page *page; | 253 | struct page *page; |
254 | unsigned long nr, ret; | 254 | unsigned long nr, ret; |
255 | int ra; | ||
255 | 256 | ||
256 | /* nr is the maximum number of bytes to copy from this page */ | 257 | /* nr is the maximum number of bytes to copy from this page */ |
257 | nr = huge_page_size(h); | 258 | nr = huge_page_size(h); |
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
274 | */ | 275 | */ |
275 | ret = len < nr ? len : nr; | 276 | ret = len < nr ? len : nr; |
276 | if (clear_user(buf, ret)) | 277 | if (clear_user(buf, ret)) |
277 | ret = -EFAULT; | 278 | ra = -EFAULT; |
279 | else | ||
280 | ra = 0; | ||
278 | } else { | 281 | } else { |
279 | /* | 282 | /* |
280 | * We have the page, copy it to user space buffer. | 283 | * We have the page, copy it to user space buffer. |
281 | */ | 284 | */ |
282 | ret = hugetlbfs_read_actor(page, offset, buf, len, nr); | 285 | ra = hugetlbfs_read_actor(page, offset, buf, len, nr); |
286 | ret = ra; | ||
283 | } | 287 | } |
284 | if (ret < 0) { | 288 | if (ra < 0) { |
285 | if (retval == 0) | 289 | if (retval == 0) |
286 | retval = ret; | 290 | retval = ra; |
287 | if (page) | 291 | if (page) |
288 | page_cache_release(page); | 292 | page_cache_release(page); |
289 | goto out; | 293 | goto out; |
diff --git a/fs/inode.c b/fs/inode.c index bd48e5e6d3e8..913ab2d9a5d1 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/inotify.h> | 23 | #include <linux/inotify.h> |
24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
25 | #include <linux/async.h> | ||
25 | 26 | ||
26 | /* | 27 | /* |
27 | * This is needed for the following functions: | 28 | * This is needed for the following functions: |
@@ -110,8 +111,8 @@ static void wake_up_inode(struct inode *inode) | |||
110 | 111 | ||
111 | /** | 112 | /** |
112 | * inode_init_always - perform inode structure initialisation | 113 | * inode_init_always - perform inode structure initialisation |
113 | * @sb - superblock inode belongs to. | 114 | * @sb: superblock inode belongs to |
114 | * @inode - inode to initialise | 115 | * @inode: inode to initialise |
115 | * | 116 | * |
116 | * These are initializations that need to be done on every inode | 117 | * These are initializations that need to be done on every inode |
117 | * allocation as the fields are not initialised by slab allocation. | 118 | * allocation as the fields are not initialised by slab allocation. |
@@ -166,7 +167,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
166 | mapping->a_ops = &empty_aops; | 167 | mapping->a_ops = &empty_aops; |
167 | mapping->host = inode; | 168 | mapping->host = inode; |
168 | mapping->flags = 0; | 169 | mapping->flags = 0; |
169 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); | 170 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); |
170 | mapping->assoc_mapping = NULL; | 171 | mapping->assoc_mapping = NULL; |
171 | mapping->backing_dev_info = &default_backing_dev_info; | 172 | mapping->backing_dev_info = &default_backing_dev_info; |
172 | mapping->writeback_index = 0; | 173 | mapping->writeback_index = 0; |
@@ -576,8 +577,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, | |||
576 | 577 | ||
577 | /** | 578 | /** |
578 | * inode_add_to_lists - add a new inode to relevant lists | 579 | * inode_add_to_lists - add a new inode to relevant lists |
579 | * @sb - superblock inode belongs to. | 580 | * @sb: superblock inode belongs to |
580 | * @inode - inode to mark in use | 581 | * @inode: inode to mark in use |
581 | * | 582 | * |
582 | * When an inode is allocated it needs to be accounted for, added to the in use | 583 | * When an inode is allocated it needs to be accounted for, added to the in use |
583 | * list, the owning superblock and the inode hash. This needs to be done under | 584 | * list, the owning superblock and the inode hash. This needs to be done under |
@@ -601,7 +602,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists); | |||
601 | * @sb: superblock | 602 | * @sb: superblock |
602 | * | 603 | * |
603 | * Allocates a new inode for given superblock. The default gfp_mask | 604 | * Allocates a new inode for given superblock. The default gfp_mask |
604 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. | 605 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. |
605 | * If HIGHMEM pages are unsuitable or it is known that pages allocated | 606 | * If HIGHMEM pages are unsuitable or it is known that pages allocated |
606 | * for the page cache are not reclaimable or migratable, | 607 | * for the page cache are not reclaimable or migratable, |
607 | * mapping_set_gfp_mask() must be called with suitable flags on the | 608 | * mapping_set_gfp_mask() must be called with suitable flags on the |
diff --git a/fs/ioctl.c b/fs/ioctl.c index cc3f1aa1cf7b..20b0a8a24c6b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c | |||
@@ -439,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, | |||
439 | return error; | 439 | return error; |
440 | } | 440 | } |
441 | 441 | ||
442 | static int ioctl_fsfreeze(struct file *filp) | ||
443 | { | ||
444 | struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; | ||
445 | |||
446 | if (!capable(CAP_SYS_ADMIN)) | ||
447 | return -EPERM; | ||
448 | |||
449 | /* If the filesystem doesn't support the freeze feature, return EOPNOTSUPP. */ | ||
450 | if (sb->s_op->freeze_fs == NULL) | ||
451 | return -EOPNOTSUPP; | ||
452 | |||
453 | /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ | ||
454 | if (sb->s_bdev == NULL) | ||
455 | return -EINVAL; | ||
456 | |||
457 | /* Freeze */ | ||
458 | sb = freeze_bdev(sb->s_bdev); | ||
459 | if (IS_ERR(sb)) | ||
460 | return PTR_ERR(sb); | ||
461 | return 0; | ||
462 | } | ||
463 | |||
464 | static int ioctl_fsthaw(struct file *filp) | ||
465 | { | ||
466 | struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; | ||
467 | |||
468 | if (!capable(CAP_SYS_ADMIN)) | ||
469 | return -EPERM; | ||
470 | |||
471 | /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ | ||
472 | if (sb->s_bdev == NULL) | ||
473 | return -EINVAL; | ||
474 | |||
475 | /* Thaw */ | ||
476 | return thaw_bdev(sb->s_bdev, sb); | ||
477 | } | ||
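From userspace, the two new ioctls are exercised roughly like this (a sketch; FIFREEZE and FITHAW come from <linux/fs.h>, the mount point path is only an example, and error handling is kept minimal):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int snapshot_window(const char *mntpt)	/* e.g. "/mnt/data" */
	{
		int fd = open(mntpt, O_RDONLY);
		if (fd < 0)
			return -1;
		if (ioctl(fd, FIFREEZE, 0) < 0) {	/* needs CAP_SYS_ADMIN */
			close(fd);
			return -1;
		}
		/* ... take the block-level snapshot/backup here ... */
		ioctl(fd, FITHAW, 0);			/* re-allow writes */
		close(fd);
		return 0;
	}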
478 | |||
442 | /* | 479 | /* |
443 | * When you add any new common ioctls to the switches above and below | 480 | * When you add any new common ioctls to the switches above and below |
444 | * please update compat_sys_ioctl() too. | 481 | * please update compat_sys_ioctl() too. |
@@ -486,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, | |||
486 | } else | 523 | } else |
487 | error = -ENOTTY; | 524 | error = -ENOTTY; |
488 | break; | 525 | break; |
526 | |||
527 | case FIFREEZE: | ||
528 | error = ioctl_fsfreeze(filp); | ||
529 | break; | ||
530 | |||
531 | case FITHAW: | ||
532 | error = ioctl_fsthaw(filp); | ||
533 | break; | ||
534 | |||
489 | default: | 535 | default: |
490 | if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) | 536 | if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) |
491 | error = file_ioctl(filp, cmd, arg); | 537 | error = file_ioctl(filp, cmd, arg); |
diff --git a/fs/ioprio.c b/fs/ioprio.c index 3569e0ad86a2..1a39ac370942 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #include <linux/security.h> | 27 | #include <linux/security.h> |
28 | #include <linux/pid_namespace.h> | 28 | #include <linux/pid_namespace.h> |
29 | 29 | ||
30 | static int set_task_ioprio(struct task_struct *task, int ioprio) | 30 | int set_task_ioprio(struct task_struct *task, int ioprio) |
31 | { | 31 | { |
32 | int err; | 32 | int err; |
33 | struct io_context *ioc; | 33 | struct io_context *ioc; |
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) | |||
70 | task_unlock(task); | 70 | task_unlock(task); |
71 | return err; | 71 | return err; |
72 | } | 72 | } |
73 | EXPORT_SYMBOL_GPL(set_task_ioprio); | ||
73 | 74 | ||
74 | asmlinkage long sys_ioprio_set(int which, int who, int ioprio) | 75 | asmlinkage long sys_ioprio_set(int which, int who, int ioprio) |
75 | { | 76 | { |
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 25719d902c51..3fbffb1ea714 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c | |||
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal) | |||
306 | int flags; | 306 | int flags; |
307 | int err; | 307 | int err; |
308 | unsigned long blocknr; | 308 | unsigned long blocknr; |
309 | ktime_t start_time; | ||
310 | u64 commit_time; | ||
309 | char *tagp = NULL; | 311 | char *tagp = NULL; |
310 | journal_header_t *header; | 312 | journal_header_t *header; |
311 | journal_block_tag_t *tag = NULL; | 313 | journal_block_tag_t *tag = NULL; |
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal) | |||
418 | commit_transaction->t_state = T_FLUSH; | 420 | commit_transaction->t_state = T_FLUSH; |
419 | journal->j_committing_transaction = commit_transaction; | 421 | journal->j_committing_transaction = commit_transaction; |
420 | journal->j_running_transaction = NULL; | 422 | journal->j_running_transaction = NULL; |
423 | start_time = ktime_get(); | ||
421 | commit_transaction->t_log_start = journal->j_head; | 424 | commit_transaction->t_log_start = journal->j_head; |
422 | wake_up(&journal->j_wait_transaction_locked); | 425 | wake_up(&journal->j_wait_transaction_locked); |
423 | spin_unlock(&journal->j_state_lock); | 426 | spin_unlock(&journal->j_state_lock); |
@@ -913,6 +916,18 @@ restart_loop: | |||
913 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 916 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
914 | journal->j_commit_sequence = commit_transaction->t_tid; | 917 | journal->j_commit_sequence = commit_transaction->t_tid; |
915 | journal->j_committing_transaction = NULL; | 918 | journal->j_committing_transaction = NULL; |
919 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | ||
920 | |||
921 | /* | ||
922 | * weight the latest commit time higher than the running average, but | ||
923 | * keep some history so we don't react too strongly to one-off swings | ||
924 | */ | ||
925 | if (likely(journal->j_average_commit_time)) | ||
926 | journal->j_average_commit_time = (commit_time*3 + | ||
927 | journal->j_average_commit_time) / 4; | ||
928 | else | ||
929 | journal->j_average_commit_time = commit_time; | ||
930 | |||
916 | spin_unlock(&journal->j_state_lock); | 931 | spin_unlock(&journal->j_state_lock); |
917 | 932 | ||
918 | if (commit_transaction->t_checkpoint_list == NULL && | 933 | if (commit_transaction->t_checkpoint_list == NULL && |
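A worked illustration of the 3:1 weighting above (a standalone sketch; the numbers are invented, NSEC_PER_MSEC is the usual kernel constant):

	u64 avg = 8 * NSEC_PER_MSEC;		/* running average: 8ms */
	u64 commit_time = 40 * NSEC_PER_MSEC;	/* one slow commit: 40ms */

	avg = (commit_time * 3 + avg) / 4;	/* -> 32ms: tracks the change fast */
	/* a following 8ms commit pulls it back: (8*3 + 32)/4 = 14ms */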
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d4c32c8808..e6a117431277 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/timer.h> | 25 | #include <linux/timer.h> |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/hrtimer.h> | ||
28 | 29 | ||
29 | static void __journal_temp_unlink_buffer(struct journal_head *jh); | 30 | static void __journal_temp_unlink_buffer(struct journal_head *jh); |
30 | 31 | ||
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) | |||
49 | { | 50 | { |
50 | transaction->t_journal = journal; | 51 | transaction->t_journal = journal; |
51 | transaction->t_state = T_RUNNING; | 52 | transaction->t_state = T_RUNNING; |
53 | transaction->t_start_time = ktime_get(); | ||
52 | transaction->t_tid = journal->j_transaction_sequence++; | 54 | transaction->t_tid = journal->j_transaction_sequence++; |
53 | transaction->t_expires = jiffies + journal->j_commit_interval; | 55 | transaction->t_expires = jiffies + journal->j_commit_interval; |
54 | spin_lock_init(&transaction->t_handle_lock); | 56 | spin_lock_init(&transaction->t_handle_lock); |
@@ -752,7 +754,6 @@ out: | |||
752 | * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. | 754 | * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. |
753 | * @handle: transaction to add buffer modifications to | 755 | * @handle: transaction to add buffer modifications to |
754 | * @bh: bh to be used for metadata writes | 756 | * @bh: bh to be used for metadata writes |
755 | * @credits: variable that will receive credits for the buffer | ||
756 | * | 757 | * |
757 | * Returns an error code or 0 on success. | 758 | * Returns an error code or 0 on success. |
758 | * | 759 | * |
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle) | |||
1370 | { | 1371 | { |
1371 | transaction_t *transaction = handle->h_transaction; | 1372 | transaction_t *transaction = handle->h_transaction; |
1372 | journal_t *journal = transaction->t_journal; | 1373 | journal_t *journal = transaction->t_journal; |
1373 | int old_handle_count, err; | 1374 | int err; |
1374 | pid_t pid; | 1375 | pid_t pid; |
1375 | 1376 | ||
1376 | J_ASSERT(journal_current_handle() == handle); | 1377 | J_ASSERT(journal_current_handle() == handle); |
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle) | |||
1399 | * on IO anyway. Speeds up many-threaded, many-dir operations | 1400 | * on IO anyway. Speeds up many-threaded, many-dir operations |
1400 | * by 30x or more... | 1401 | * by 30x or more... |
1401 | * | 1402 | * |
1403 | * We try and optimize the sleep time against what the underlying disk | ||
1404 | * can do, instead of having a static sleep time. This is useful for | ||
1405 | * the case where our storage is so fast that it is more optimal to go | ||
1406 | * ahead and force a flush and wait for the transaction to be committed | ||
1407 | * than it is to wait for an arbitrary amount of time for new writers to | ||
1408 | * join the transaction. We achieve this by measuring how long it takes | ||
1409 | * to commit a transaction, and compare it with how long this | ||
1410 | * transaction has been running, and if run time < commit time then we | ||
1411 | * sleep for the delta and commit. This greatly helps super fast disks | ||
1412 | * that would see slowdowns as more threads started doing fsyncs. | ||
1413 | * | ||
1402 | * But don't do this if this process was the most recent one to | 1414 | * But don't do this if this process was the most recent one to |
1403 | * perform a synchronous write. We do this to detect the case where a | 1415 | * perform a synchronous write. We do this to detect the case where a |
1404 | * single process is doing a stream of sync writes. No point in waiting | 1416 | * single process is doing a stream of sync writes. No point in waiting |
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle) | |||
1406 | */ | 1418 | */ |
1407 | pid = current->pid; | 1419 | pid = current->pid; |
1408 | if (handle->h_sync && journal->j_last_sync_writer != pid) { | 1420 | if (handle->h_sync && journal->j_last_sync_writer != pid) { |
1421 | u64 commit_time, trans_time; | ||
1422 | |||
1409 | journal->j_last_sync_writer = pid; | 1423 | journal->j_last_sync_writer = pid; |
1410 | do { | 1424 | |
1411 | old_handle_count = transaction->t_handle_count; | 1425 | spin_lock(&journal->j_state_lock); |
1412 | schedule_timeout_uninterruptible(1); | 1426 | commit_time = journal->j_average_commit_time; |
1413 | } while (old_handle_count != transaction->t_handle_count); | 1427 | spin_unlock(&journal->j_state_lock); |
1428 | |||
1429 | trans_time = ktime_to_ns(ktime_sub(ktime_get(), | ||
1430 | transaction->t_start_time)); | ||
1431 | |||
1432 | commit_time = min_t(u64, commit_time, | ||
1433 | 1000*jiffies_to_usecs(1)); | ||
1434 | |||
1435 | if (trans_time < commit_time) { | ||
1436 | ktime_t expires = ktime_add_ns(ktime_get(), | ||
1437 | commit_time); | ||
1438 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1439 | schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); | ||
1440 | } | ||
1414 | } | 1441 | } |
1415 | 1442 | ||
1416 | current->journal_info = NULL; | 1443 | current->journal_info = NULL; |
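Stripped of the j_state_lock bookkeeping, the new wait in journal_stop() reduces to the sketch below (kernel APIs as used in the hunk; the wrapper name is illustrative). Where the old loop polled in one-jiffy steps until no new handles arrived, the new code sleeps at most once, for no longer than the measured average commit time capped at one jiffy, and only while the transaction is younger than that:

/* Sketch only: sleep briefly so other writers can join this transaction. */
static void wait_for_joiners(journal_t *journal, transaction_t *transaction)
{
        u64 commit_time = journal->j_average_commit_time;       /* ns */
        u64 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                               transaction->t_start_time));

        /* jbd never waits longer than one jiffy for joiners */
        commit_time = min_t(u64, commit_time, 1000 * jiffies_to_usecs(1));

        if (trans_time < commit_time) {
                ktime_t expires = ktime_add_ns(ktime_get(), commit_time);
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
        }
}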
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9497718fe920..17159cacbd9e 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -249,16 +249,14 @@ restart: | |||
249 | return ret; | 249 | return ret; |
250 | } | 250 | } |
251 | 251 | ||
252 | #define NR_BATCH 64 | ||
253 | |||
254 | static void | 252 | static void |
255 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | 253 | __flush_batch(journal_t *journal, int *batch_count) |
256 | { | 254 | { |
257 | int i; | 255 | int i; |
258 | 256 | ||
259 | ll_rw_block(SWRITE, *batch_count, bhs); | 257 | ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); |
260 | for (i = 0; i < *batch_count; i++) { | 258 | for (i = 0; i < *batch_count; i++) { |
261 | struct buffer_head *bh = bhs[i]; | 259 | struct buffer_head *bh = journal->j_chkpt_bhs[i]; |
262 | clear_buffer_jwrite(bh); | 260 | clear_buffer_jwrite(bh); |
263 | BUFFER_TRACE(bh, "brelse"); | 261 | BUFFER_TRACE(bh, "brelse"); |
264 | __brelse(bh); | 262 | __brelse(bh); |
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | |||
277 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | 275 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it |
278 | */ | 276 | */ |
279 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | 277 | static int __process_buffer(journal_t *journal, struct journal_head *jh, |
280 | struct buffer_head **bhs, int *batch_count, | 278 | int *batch_count, transaction_t *transaction) |
281 | transaction_t *transaction) | ||
282 | { | 279 | { |
283 | struct buffer_head *bh = jh2bh(jh); | 280 | struct buffer_head *bh = jh2bh(jh); |
284 | int ret = 0; | 281 | int ret = 0; |
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
325 | get_bh(bh); | 322 | get_bh(bh); |
326 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | 323 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); |
327 | set_buffer_jwrite(bh); | 324 | set_buffer_jwrite(bh); |
328 | bhs[*batch_count] = bh; | 325 | journal->j_chkpt_bhs[*batch_count] = bh; |
329 | __buffer_relink_io(jh); | 326 | __buffer_relink_io(jh); |
330 | jbd_unlock_bh_state(bh); | 327 | jbd_unlock_bh_state(bh); |
331 | transaction->t_chp_stats.cs_written++; | 328 | transaction->t_chp_stats.cs_written++; |
332 | (*batch_count)++; | 329 | (*batch_count)++; |
333 | if (*batch_count == NR_BATCH) { | 330 | if (*batch_count == JBD2_NR_BATCH) { |
334 | spin_unlock(&journal->j_list_lock); | 331 | spin_unlock(&journal->j_list_lock); |
335 | __flush_batch(journal, bhs, batch_count); | 332 | __flush_batch(journal, batch_count); |
336 | ret = 1; | 333 | ret = 1; |
337 | } | 334 | } |
338 | } | 335 | } |
@@ -388,7 +385,6 @@ restart: | |||
388 | if (journal->j_checkpoint_transactions == transaction && | 385 | if (journal->j_checkpoint_transactions == transaction && |
389 | transaction->t_tid == this_tid) { | 386 | transaction->t_tid == this_tid) { |
390 | int batch_count = 0; | 387 | int batch_count = 0; |
391 | struct buffer_head *bhs[NR_BATCH]; | ||
392 | struct journal_head *jh; | 388 | struct journal_head *jh; |
393 | int retry = 0, err; | 389 | int retry = 0, err; |
394 | 390 | ||
@@ -402,7 +398,7 @@ restart: | |||
402 | retry = 1; | 398 | retry = 1; |
403 | break; | 399 | break; |
404 | } | 400 | } |
405 | retry = __process_buffer(journal, jh, bhs, &batch_count, | 401 | retry = __process_buffer(journal, jh, &batch_count, |
406 | transaction); | 402 | transaction); |
407 | if (retry < 0 && !result) | 403 | if (retry < 0 && !result) |
408 | result = retry; | 404 | result = retry; |
@@ -419,7 +415,7 @@ restart: | |||
419 | spin_unlock(&journal->j_list_lock); | 415 | spin_unlock(&journal->j_list_lock); |
420 | retry = 1; | 416 | retry = 1; |
421 | } | 417 | } |
422 | __flush_batch(journal, bhs, &batch_count); | 418 | __flush_batch(journal, &batch_count); |
423 | } | 419 | } |
424 | 420 | ||
425 | if (retry) { | 421 | if (retry) { |
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) | |||
686 | safely remove this transaction from the log */ | 682 | safely remove this transaction from the log */ |
687 | 683 | ||
688 | __jbd2_journal_drop_transaction(journal, transaction); | 684 | __jbd2_journal_drop_transaction(journal, transaction); |
685 | kfree(transaction); | ||
689 | 686 | ||
690 | /* Just in case anybody was waiting for more transactions to be | 687 | /* Just in case anybody was waiting for more transactions to be |
691 | checkpointed... */ | 688 | checkpointed... */ |
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |||
760 | J_ASSERT(journal->j_running_transaction != transaction); | 757 | J_ASSERT(journal->j_running_transaction != transaction); |
761 | 758 | ||
762 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); | 759 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); |
763 | kfree(transaction); | ||
764 | } | 760 | } |
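The checkpoint refactoring above moves the 64-entry staging array off the caller's stack (512 bytes on 64-bit) into the long-lived journal_t as j_chkpt_bhs. That is only safe if checkpointing is serialized, which appears to be why the journal.c hunk further down wraps the checkpoint call in jbd2_journal_destroy() with j_checkpoint_mutex. The staging pattern itself, condensed into a sketch (locking elided; __flush_batch() resets the count after submitting):

/* Sketch only: stage a buffer in the journal-resident array, flushing
 * the batch to disk whenever it fills up. */
static void stage_checkpoint_buffer(journal_t *journal,
                                    struct buffer_head *bh, int *batch_count)
{
        journal->j_chkpt_bhs[(*batch_count)++] = bh;
        if (*batch_count == JBD2_NR_BATCH)
                __flush_batch(journal, batch_count);
}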
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index c8a1bace685a..62804e57a44c 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/crc32.h> | 25 | #include <linux/crc32.h> |
26 | #include <linux/writeback.h> | 26 | #include <linux/writeback.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/bio.h> | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * Default IO end handler for temporary BJ_IO buffer_heads. | 31 | * Default IO end handler for temporary BJ_IO buffer_heads. |
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal, | |||
137 | set_buffer_ordered(bh); | 138 | set_buffer_ordered(bh); |
138 | barrier_done = 1; | 139 | barrier_done = 1; |
139 | } | 140 | } |
140 | ret = submit_bh(WRITE, bh); | 141 | ret = submit_bh(WRITE_SYNC, bh); |
141 | if (barrier_done) | 142 | if (barrier_done) |
142 | clear_buffer_ordered(bh); | 143 | clear_buffer_ordered(bh); |
143 | 144 | ||
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal, | |||
158 | lock_buffer(bh); | 159 | lock_buffer(bh); |
159 | set_buffer_uptodate(bh); | 160 | set_buffer_uptodate(bh); |
160 | clear_buffer_dirty(bh); | 161 | clear_buffer_dirty(bh); |
161 | ret = submit_bh(WRITE, bh); | 162 | ret = submit_bh(WRITE_SYNC, bh); |
162 | } | 163 | } |
163 | *cbh = bh; | 164 | *cbh = bh; |
164 | return ret; | 165 | return ret; |
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal, | |||
168 | * This function along with journal_submit_commit_record | 169 | * This function along with journal_submit_commit_record |
169 | * allows to write the commit record asynchronously. | 170 | * allows to write the commit record asynchronously. |
170 | */ | 171 | */ |
171 | static int journal_wait_on_commit_record(struct buffer_head *bh) | 172 | static int journal_wait_on_commit_record(journal_t *journal, |
173 | struct buffer_head *bh) | ||
172 | { | 174 | { |
173 | int ret = 0; | 175 | int ret = 0; |
174 | 176 | ||
177 | retry: | ||
175 | clear_buffer_dirty(bh); | 178 | clear_buffer_dirty(bh); |
176 | wait_on_buffer(bh); | 179 | wait_on_buffer(bh); |
180 | if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { | ||
181 | printk(KERN_WARNING | ||
182 | "JBD2: wait_on_commit_record: sync failed on %s - " | ||
183 | "disabling barriers\n", journal->j_devname); | ||
184 | spin_lock(&journal->j_state_lock); | ||
185 | journal->j_flags &= ~JBD2_BARRIER; | ||
186 | spin_unlock(&journal->j_state_lock); | ||
187 | |||
188 | lock_buffer(bh); | ||
189 | clear_buffer_dirty(bh); | ||
190 | set_buffer_uptodate(bh); | ||
191 | bh->b_end_io = journal_end_buffer_io_sync; | ||
192 | |||
193 | ret = submit_bh(WRITE_SYNC, bh); | ||
194 | if (ret) { | ||
195 | unlock_buffer(bh); | ||
196 | return ret; | ||
197 | } | ||
198 | goto retry; | ||
199 | } | ||
177 | 200 | ||
178 | if (unlikely(!buffer_uptodate(bh))) | 201 | if (unlikely(!buffer_uptodate(bh))) |
179 | ret = -EIO; | 202 | ret = -EIO; |
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
332 | int flags; | 355 | int flags; |
333 | int err; | 356 | int err; |
334 | unsigned long long blocknr; | 357 | unsigned long long blocknr; |
358 | ktime_t start_time; | ||
359 | u64 commit_time; | ||
335 | char *tagp = NULL; | 360 | char *tagp = NULL; |
336 | journal_header_t *header; | 361 | journal_header_t *header; |
337 | journal_block_tag_t *tag = NULL; | 362 | journal_block_tag_t *tag = NULL; |
338 | int space_left = 0; | 363 | int space_left = 0; |
339 | int first_tag = 0; | 364 | int first_tag = 0; |
340 | int tag_flag; | 365 | int tag_flag; |
341 | int i; | 366 | int i, to_free = 0; |
342 | int tag_bytes = journal_tag_bytes(journal); | 367 | int tag_bytes = journal_tag_bytes(journal); |
343 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | 368 | struct buffer_head *cbh = NULL; /* For transactional checksums */ |
344 | __u32 crc32_sum = ~0; | 369 | __u32 crc32_sum = ~0; |
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
458 | commit_transaction->t_state = T_FLUSH; | 483 | commit_transaction->t_state = T_FLUSH; |
459 | journal->j_committing_transaction = commit_transaction; | 484 | journal->j_committing_transaction = commit_transaction; |
460 | journal->j_running_transaction = NULL; | 485 | journal->j_running_transaction = NULL; |
486 | start_time = ktime_get(); | ||
461 | commit_transaction->t_log_start = journal->j_head; | 487 | commit_transaction->t_log_start = journal->j_head; |
462 | wake_up(&journal->j_wait_transaction_locked); | 488 | wake_up(&journal->j_wait_transaction_locked); |
463 | spin_unlock(&journal->j_state_lock); | 489 | spin_unlock(&journal->j_state_lock); |
@@ -803,7 +829,7 @@ wait_for_iobuf: | |||
803 | __jbd2_journal_abort_hard(journal); | 829 | __jbd2_journal_abort_hard(journal); |
804 | } | 830 | } |
805 | if (!err && !is_journal_aborted(journal)) | 831 | if (!err && !is_journal_aborted(journal)) |
806 | err = journal_wait_on_commit_record(cbh); | 832 | err = journal_wait_on_commit_record(journal, cbh); |
807 | 833 | ||
808 | if (err) | 834 | if (err) |
809 | jbd2_journal_abort(journal, err); | 835 | jbd2_journal_abort(journal, err); |
@@ -981,14 +1007,23 @@ restart_loop: | |||
981 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 1007 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
982 | journal->j_commit_sequence = commit_transaction->t_tid; | 1008 | journal->j_commit_sequence = commit_transaction->t_tid; |
983 | journal->j_committing_transaction = NULL; | 1009 | journal->j_committing_transaction = NULL; |
984 | spin_unlock(&journal->j_state_lock); | 1010 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); |
985 | 1011 | ||
986 | if (journal->j_commit_callback) | 1012 | /* |
987 | journal->j_commit_callback(journal, commit_transaction); | 1013 | * weight the average time higher than the latest commit time so |
1014 | * we don't react too strongly to vast changes in the commit time | ||
1015 | */ | ||
1016 | if (likely(journal->j_average_commit_time)) | ||
1017 | journal->j_average_commit_time = (commit_time + | ||
1018 | journal->j_average_commit_time*3) / 4; | ||
1019 | else | ||
1020 | journal->j_average_commit_time = commit_time; | ||
1021 | spin_unlock(&journal->j_state_lock); | ||
988 | 1022 | ||
989 | if (commit_transaction->t_checkpoint_list == NULL && | 1023 | if (commit_transaction->t_checkpoint_list == NULL && |
990 | commit_transaction->t_checkpoint_io_list == NULL) { | 1024 | commit_transaction->t_checkpoint_io_list == NULL) { |
991 | __jbd2_journal_drop_transaction(journal, commit_transaction); | 1025 | __jbd2_journal_drop_transaction(journal, commit_transaction); |
1026 | to_free = 1; | ||
992 | } else { | 1027 | } else { |
993 | if (journal->j_checkpoint_transactions == NULL) { | 1028 | if (journal->j_checkpoint_transactions == NULL) { |
994 | journal->j_checkpoint_transactions = commit_transaction; | 1029 | journal->j_checkpoint_transactions = commit_transaction; |
@@ -1007,11 +1042,16 @@ restart_loop: | |||
1007 | } | 1042 | } |
1008 | spin_unlock(&journal->j_list_lock); | 1043 | spin_unlock(&journal->j_list_lock); |
1009 | 1044 | ||
1045 | if (journal->j_commit_callback) | ||
1046 | journal->j_commit_callback(journal, commit_transaction); | ||
1047 | |||
1010 | trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", | 1048 | trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", |
1011 | journal->j_devname, journal->j_commit_sequence, | 1049 | journal->j_devname, commit_transaction->t_tid, |
1012 | journal->j_tail_sequence); | 1050 | journal->j_tail_sequence); |
1013 | jbd_debug(1, "JBD: commit %d complete, head %d\n", | 1051 | jbd_debug(1, "JBD: commit %d complete, head %d\n", |
1014 | journal->j_commit_sequence, journal->j_tail_sequence); | 1052 | journal->j_commit_sequence, journal->j_tail_sequence); |
1053 | if (to_free) | ||
1054 | kfree(commit_transaction); | ||
1015 | 1055 | ||
1016 | wake_up(&journal->j_wait_done_commit); | 1056 | wake_up(&journal->j_wait_done_commit); |
1017 | } | 1057 | } |
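The retry added to journal_wait_on_commit_record() covers devices that reject barriers only at I/O completion: the commit block comes back with the eopnotsupp bit set, the journal permanently drops JBD2_BARRIER, and the same block is reissued as a plain synchronous write. A condensed sketch of that shape (wrapper name illustrative; the j_state_lock around the flag change and some error handling are trimmed):

/* Sketch only: wait for a barrier write; on EOPNOTSUPP, disable
 * barriers for this journal and reissue the block without one. */
static int wait_commit_with_fallback(journal_t *journal,
                                     struct buffer_head *bh)
{
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);

        if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
                journal->j_flags &= ~JBD2_BARRIER;  /* never try barriers again */

                lock_buffer(bh);
                clear_buffer_dirty(bh);
                set_buffer_uptodate(bh);
                bh->b_end_io = journal_end_buffer_io_sync;
                if (submit_bh(WRITE_SYNC, bh) == 0)
                        wait_on_buffer(bh);
        }
        return buffer_uptodate(bh) ? 0 : -EIO;
}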
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f6bff9d6f8df..56675306ed81 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -40,6 +40,7 @@ | |||
40 | 40 | ||
41 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
42 | #include <asm/page.h> | 42 | #include <asm/page.h> |
43 | #include <linux/math64.h> | ||
43 | 44 | ||
44 | EXPORT_SYMBOL(jbd2_journal_start); | 45 | EXPORT_SYMBOL(jbd2_journal_start); |
45 | EXPORT_SYMBOL(jbd2_journal_restart); | 46 | EXPORT_SYMBOL(jbd2_journal_restart); |
@@ -66,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format); | |||
66 | EXPORT_SYMBOL(jbd2_journal_check_used_features); | 67 | EXPORT_SYMBOL(jbd2_journal_check_used_features); |
67 | EXPORT_SYMBOL(jbd2_journal_check_available_features); | 68 | EXPORT_SYMBOL(jbd2_journal_check_available_features); |
68 | EXPORT_SYMBOL(jbd2_journal_set_features); | 69 | EXPORT_SYMBOL(jbd2_journal_set_features); |
69 | EXPORT_SYMBOL(jbd2_journal_create); | ||
70 | EXPORT_SYMBOL(jbd2_journal_load); | 70 | EXPORT_SYMBOL(jbd2_journal_load); |
71 | EXPORT_SYMBOL(jbd2_journal_destroy); | 71 | EXPORT_SYMBOL(jbd2_journal_destroy); |
72 | EXPORT_SYMBOL(jbd2_journal_abort); | 72 | EXPORT_SYMBOL(jbd2_journal_abort); |
@@ -132,8 +132,9 @@ static int kjournald2(void *arg) | |||
132 | journal->j_task = current; | 132 | journal->j_task = current; |
133 | wake_up(&journal->j_wait_done_commit); | 133 | wake_up(&journal->j_wait_done_commit); |
134 | 134 | ||
135 | printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", | 135 | printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, " |
136 | journal->j_commit_interval / HZ); | 136 | "commit interval %ld seconds\n", current->pid, |
137 | journal->j_devname, journal->j_commit_interval / HZ); | ||
137 | 138 | ||
138 | /* | 139 | /* |
139 | * And now, wait forever for commit wakeup events. | 140 | * And now, wait forever for commit wakeup events. |
@@ -650,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | |||
650 | return NULL; | 651 | return NULL; |
651 | 652 | ||
652 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | 653 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); |
654 | if (!bh) | ||
655 | return NULL; | ||
653 | lock_buffer(bh); | 656 | lock_buffer(bh); |
654 | memset(bh->b_data, 0, journal->j_blocksize); | 657 | memset(bh->b_data, 0, journal->j_blocksize); |
655 | set_buffer_uptodate(bh); | 658 | set_buffer_uptodate(bh); |
@@ -843,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) | |||
843 | jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); | 846 | jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); |
844 | seq_printf(seq, " %ums logging transaction\n", | 847 | seq_printf(seq, " %ums logging transaction\n", |
845 | jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); | 848 | jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); |
849 | seq_printf(seq, " %luus average transaction commit time\n", | ||
850 | div_u64(s->journal->j_average_commit_time, 1000)); | ||
846 | seq_printf(seq, " %lu handles per transaction\n", | 851 | seq_printf(seq, " %lu handles per transaction\n", |
847 | s->stats->u.run.rs_handle_count / s->stats->ts_tid); | 852 | s->stats->u.run.rs_handle_count / s->stats->ts_tid); |
848 | seq_printf(seq, " %lu blocks per transaction\n", | 853 | seq_printf(seq, " %lu blocks per transaction\n", |
@@ -980,6 +985,8 @@ static journal_t * journal_init_common (void) | |||
980 | spin_lock_init(&journal->j_state_lock); | 985 | spin_lock_init(&journal->j_state_lock); |
981 | 986 | ||
982 | journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); | 987 | journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); |
988 | journal->j_min_batch_time = 0; | ||
989 | journal->j_max_batch_time = 15000; /* 15ms */ | ||
983 | 990 | ||
984 | /* The journal is marked for error until we succeed with recovery! */ | 991 | /* The journal is marked for error until we succeed with recovery! */ |
985 | journal->j_flags = JBD2_ABORT; | 992 | journal->j_flags = JBD2_ABORT; |
@@ -1035,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, | |||
1035 | 1042 | ||
1036 | /* journal descriptor can store up to n blocks -bzzz */ | 1043 | /* journal descriptor can store up to n blocks -bzzz */ |
1037 | journal->j_blocksize = blocksize; | 1044 | journal->j_blocksize = blocksize; |
1045 | jbd2_stats_proc_init(journal); | ||
1038 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | 1046 | n = journal->j_blocksize / sizeof(journal_block_tag_t); |
1039 | journal->j_wbufsize = n; | 1047 | journal->j_wbufsize = n; |
1040 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); | 1048 | journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); |
1041 | if (!journal->j_wbuf) { | 1049 | if (!journal->j_wbuf) { |
1042 | printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", | 1050 | printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", |
1043 | __func__); | 1051 | __func__); |
1044 | kfree(journal); | 1052 | goto out_err; |
1045 | journal = NULL; | ||
1046 | goto out; | ||
1047 | } | 1053 | } |
1048 | journal->j_dev = bdev; | 1054 | journal->j_dev = bdev; |
1049 | journal->j_fs_dev = fs_dev; | 1055 | journal->j_fs_dev = fs_dev; |
@@ -1053,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, | |||
1053 | p = journal->j_devname; | 1059 | p = journal->j_devname; |
1054 | while ((p = strchr(p, '/'))) | 1060 | while ((p = strchr(p, '/'))) |
1055 | *p = '!'; | 1061 | *p = '!'; |
1056 | jbd2_stats_proc_init(journal); | ||
1057 | 1062 | ||
1058 | bh = __getblk(journal->j_dev, start, journal->j_blocksize); | 1063 | bh = __getblk(journal->j_dev, start, journal->j_blocksize); |
1059 | J_ASSERT(bh != NULL); | 1064 | if (!bh) { |
1065 | printk(KERN_ERR | ||
1066 | "%s: Cannot get buffer for journal superblock\n", | ||
1067 | __func__); | ||
1068 | goto out_err; | ||
1069 | } | ||
1060 | journal->j_sb_buffer = bh; | 1070 | journal->j_sb_buffer = bh; |
1061 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | 1071 | journal->j_superblock = (journal_superblock_t *)bh->b_data; |
1062 | out: | 1072 | |
1063 | return journal; | 1073 | return journal; |
1074 | out_err: | ||
1075 | jbd2_stats_proc_exit(journal); | ||
1076 | kfree(journal); | ||
1077 | return NULL; | ||
1064 | } | 1078 | } |
1065 | 1079 | ||
1066 | /** | 1080 | /** |
@@ -1108,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) | |||
1108 | if (!journal->j_wbuf) { | 1122 | if (!journal->j_wbuf) { |
1109 | printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", | 1123 | printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", |
1110 | __func__); | 1124 | __func__); |
1111 | jbd2_stats_proc_exit(journal); | 1125 | goto out_err; |
1112 | kfree(journal); | ||
1113 | return NULL; | ||
1114 | } | 1126 | } |
1115 | 1127 | ||
1116 | err = jbd2_journal_bmap(journal, 0, &blocknr); | 1128 | err = jbd2_journal_bmap(journal, 0, &blocknr); |
@@ -1118,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) | |||
1118 | if (err) { | 1130 | if (err) { |
1119 | printk(KERN_ERR "%s: Cannot locate journal superblock\n", | 1131 | printk(KERN_ERR "%s: Cannot locate journal superblock\n", |
1120 | __func__); | 1132 | __func__); |
1121 | jbd2_stats_proc_exit(journal); | 1133 | goto out_err; |
1122 | kfree(journal); | ||
1123 | return NULL; | ||
1124 | } | 1134 | } |
1125 | 1135 | ||
1126 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | 1136 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); |
1127 | J_ASSERT(bh != NULL); | 1137 | if (!bh) { |
1138 | printk(KERN_ERR | ||
1139 | "%s: Cannot get buffer for journal superblock\n", | ||
1140 | __func__); | ||
1141 | goto out_err; | ||
1142 | } | ||
1128 | journal->j_sb_buffer = bh; | 1143 | journal->j_sb_buffer = bh; |
1129 | journal->j_superblock = (journal_superblock_t *)bh->b_data; | 1144 | journal->j_superblock = (journal_superblock_t *)bh->b_data; |
1130 | 1145 | ||
1131 | return journal; | 1146 | return journal; |
1147 | out_err: | ||
1148 | jbd2_stats_proc_exit(journal); | ||
1149 | kfree(journal); | ||
1150 | return NULL; | ||
1132 | } | 1151 | } |
1133 | 1152 | ||
1134 | /* | 1153 | /* |
@@ -1177,77 +1196,6 @@ static int journal_reset(journal_t *journal) | |||
1177 | } | 1196 | } |
1178 | 1197 | ||
1179 | /** | 1198 | /** |
1180 | * int jbd2_journal_create() - Initialise the new journal file | ||
1181 | * @journal: Journal to create. This structure must have been initialised | ||
1182 | * | ||
1183 | * Given a journal_t structure which tells us which disk blocks we can | ||
1184 | * use, create a new journal superblock and initialise all of the | ||
1185 | * journal fields from scratch. | ||
1186 | **/ | ||
1187 | int jbd2_journal_create(journal_t *journal) | ||
1188 | { | ||
1189 | unsigned long long blocknr; | ||
1190 | struct buffer_head *bh; | ||
1191 | journal_superblock_t *sb; | ||
1192 | int i, err; | ||
1193 | |||
1194 | if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) { | ||
1195 | printk (KERN_ERR "Journal length (%d blocks) too short.\n", | ||
1196 | journal->j_maxlen); | ||
1197 | journal_fail_superblock(journal); | ||
1198 | return -EINVAL; | ||
1199 | } | ||
1200 | |||
1201 | if (journal->j_inode == NULL) { | ||
1202 | /* | ||
1203 | * We don't know what block to start at! | ||
1204 | */ | ||
1205 | printk(KERN_EMERG | ||
1206 | "%s: creation of journal on external device!\n", | ||
1207 | __func__); | ||
1208 | BUG(); | ||
1209 | } | ||
1210 | |||
1211 | /* Zero out the entire journal on disk. We cannot afford to | ||
1212 | have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */ | ||
1213 | jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); | ||
1214 | for (i = 0; i < journal->j_maxlen; i++) { | ||
1215 | err = jbd2_journal_bmap(journal, i, &blocknr); | ||
1216 | if (err) | ||
1217 | return err; | ||
1218 | bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); | ||
1219 | lock_buffer(bh); | ||
1220 | memset (bh->b_data, 0, journal->j_blocksize); | ||
1221 | BUFFER_TRACE(bh, "marking dirty"); | ||
1222 | mark_buffer_dirty(bh); | ||
1223 | BUFFER_TRACE(bh, "marking uptodate"); | ||
1224 | set_buffer_uptodate(bh); | ||
1225 | unlock_buffer(bh); | ||
1226 | __brelse(bh); | ||
1227 | } | ||
1228 | |||
1229 | sync_blockdev(journal->j_dev); | ||
1230 | jbd_debug(1, "JBD: journal cleared.\n"); | ||
1231 | |||
1232 | /* OK, fill in the initial static fields in the new superblock */ | ||
1233 | sb = journal->j_superblock; | ||
1234 | |||
1235 | sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | ||
1236 | sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); | ||
1237 | |||
1238 | sb->s_blocksize = cpu_to_be32(journal->j_blocksize); | ||
1239 | sb->s_maxlen = cpu_to_be32(journal->j_maxlen); | ||
1240 | sb->s_first = cpu_to_be32(1); | ||
1241 | |||
1242 | journal->j_transaction_sequence = 1; | ||
1243 | |||
1244 | journal->j_flags &= ~JBD2_ABORT; | ||
1245 | journal->j_format_version = 2; | ||
1246 | |||
1247 | return journal_reset(journal); | ||
1248 | } | ||
1249 | |||
1250 | /** | ||
1251 | * void jbd2_journal_update_superblock() - Update journal sb on disk. | 1199 | * void jbd2_journal_update_superblock() - Update journal sb on disk. |
1252 | * @journal: The journal to update. | 1200 | * @journal: The journal to update. |
1253 | * @wait: Set to '0' if you don't want to wait for IO completion. | 1201 | * @wait: Set to '0' if you don't want to wait for IO completion. |
@@ -1491,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal) | |||
1491 | spin_lock(&journal->j_list_lock); | 1439 | spin_lock(&journal->j_list_lock); |
1492 | while (journal->j_checkpoint_transactions != NULL) { | 1440 | while (journal->j_checkpoint_transactions != NULL) { |
1493 | spin_unlock(&journal->j_list_lock); | 1441 | spin_unlock(&journal->j_list_lock); |
1442 | mutex_lock(&journal->j_checkpoint_mutex); | ||
1494 | jbd2_log_do_checkpoint(journal); | 1443 | jbd2_log_do_checkpoint(journal); |
1444 | mutex_unlock(&journal->j_checkpoint_mutex); | ||
1495 | spin_lock(&journal->j_list_lock); | 1445 | spin_lock(&journal->j_list_lock); |
1496 | } | 1446 | } |
1497 | 1447 | ||
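The j_checkpoint_mutex taken in jbd2_journal_destroy() above pairs with the checkpoint.c change that moved the batch array into journal_t: without it, an unmount-time checkpoint could race another user of j_chkpt_bhs. The resulting drain loop, as a self-contained sketch:

/* Sketch only: flush all remaining checkpoint transactions at shutdown. */
static void drain_checkpoints(journal_t *journal)
{
        spin_lock(&journal->j_list_lock);
        while (journal->j_checkpoint_transactions != NULL) {
                spin_unlock(&journal->j_list_lock);
                mutex_lock(&journal->j_checkpoint_mutex);
                jbd2_log_do_checkpoint(journal);    /* uses j_chkpt_bhs */
                mutex_unlock(&journal->j_checkpoint_mutex);
                spin_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
}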
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4f925a4f3d05..46b4e347ed7d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/timer.h> | 25 | #include <linux/timer.h> |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/hrtimer.h> | ||
28 | 29 | ||
29 | static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); | 30 | static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); |
30 | 31 | ||
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
48 | { | 49 | { |
49 | transaction->t_journal = journal; | 50 | transaction->t_journal = journal; |
50 | transaction->t_state = T_RUNNING; | 51 | transaction->t_state = T_RUNNING; |
52 | transaction->t_start_time = ktime_get(); | ||
51 | transaction->t_tid = journal->j_transaction_sequence++; | 53 | transaction->t_tid = journal->j_transaction_sequence++; |
52 | transaction->t_expires = jiffies + journal->j_commit_interval; | 54 | transaction->t_expires = jiffies + journal->j_commit_interval; |
53 | spin_lock_init(&transaction->t_handle_lock); | 55 | spin_lock_init(&transaction->t_handle_lock); |
@@ -1240,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle) | |||
1240 | { | 1242 | { |
1241 | transaction_t *transaction = handle->h_transaction; | 1243 | transaction_t *transaction = handle->h_transaction; |
1242 | journal_t *journal = transaction->t_journal; | 1244 | journal_t *journal = transaction->t_journal; |
1243 | int old_handle_count, err; | 1245 | int err; |
1244 | pid_t pid; | 1246 | pid_t pid; |
1245 | 1247 | ||
1246 | J_ASSERT(journal_current_handle() == handle); | 1248 | J_ASSERT(journal_current_handle() == handle); |
@@ -1263,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle) | |||
1263 | /* | 1265 | /* |
1264 | * Implement synchronous transaction batching. If the handle | 1266 | * Implement synchronous transaction batching. If the handle |
1265 | * was synchronous, don't force a commit immediately. Let's | 1267 | * was synchronous, don't force a commit immediately. Let's |
1266 | * yield and let another thread piggyback onto this transaction. | 1268 | * yield and let another thread piggyback onto this |
1267 | * Keep doing that while new threads continue to arrive. | 1269 | * transaction. Keep doing that while new threads continue to |
1268 | * It doesn't cost much - we're about to run a commit and sleep | 1270 | * arrive. It doesn't cost much - we're about to run a commit |
1269 | * on IO anyway. Speeds up many-threaded, many-dir operations | 1271 | * and sleep on IO anyway. Speeds up many-threaded, many-dir |
1270 | * by 30x or more... | 1272 | * operations by 30x or more... |
1273 | * | ||
1274 | * We try and optimize the sleep time against what the | ||
1275 | * underlying disk can do, instead of having a static sleep | ||
1276 | * time. This is useful for the case where our storage is so | ||
1277 | * fast that it is more optimal to go ahead and force a flush | ||
1278 | * and wait for the transaction to be committed than it is to | ||
1279 | * wait for an arbitrary amount of time for new writers to | ||
1280 | * join the transaction. We achieve this by measuring how | ||
1281 | * long it takes to commit a transaction, and compare it with | ||
1282 | * how long this transaction has been running, and if run time | ||
1283 | * < commit time then we sleep for the delta and commit. This | ||
1284 | * greatly helps super fast disks that would see slowdowns as | ||
1285 | * more threads started doing fsyncs. | ||
1271 | * | 1286 | * |
1272 | * But don't do this if this process was the most recent one to | 1287 | * But don't do this if this process was the most recent one |
1273 | * perform a synchronous write. We do this to detect the case where a | 1288 | * to perform a synchronous write. We do this to detect the |
1274 | * single process is doing a stream of sync writes. No point in waiting | 1289 | * case where a single process is doing a stream of sync |
1275 | * for joiners in that case. | 1290 | * writes. No point in waiting for joiners in that case. |
1276 | */ | 1291 | */ |
1277 | pid = current->pid; | 1292 | pid = current->pid; |
1278 | if (handle->h_sync && journal->j_last_sync_writer != pid) { | 1293 | if (handle->h_sync && journal->j_last_sync_writer != pid) { |
1294 | u64 commit_time, trans_time; | ||
1295 | |||
1279 | journal->j_last_sync_writer = pid; | 1296 | journal->j_last_sync_writer = pid; |
1280 | do { | 1297 | |
1281 | old_handle_count = transaction->t_handle_count; | 1298 | spin_lock(&journal->j_state_lock); |
1282 | schedule_timeout_uninterruptible(1); | 1299 | commit_time = journal->j_average_commit_time; |
1283 | } while (old_handle_count != transaction->t_handle_count); | 1300 | spin_unlock(&journal->j_state_lock); |
1301 | |||
1302 | trans_time = ktime_to_ns(ktime_sub(ktime_get(), | ||
1303 | transaction->t_start_time)); | ||
1304 | |||
1305 | commit_time = max_t(u64, commit_time, | ||
1306 | 1000*journal->j_min_batch_time); | ||
1307 | commit_time = min_t(u64, commit_time, | ||
1308 | 1000*journal->j_max_batch_time); | ||
1309 | |||
1310 | if (trans_time < commit_time) { | ||
1311 | ktime_t expires = ktime_add_ns(ktime_get(), | ||
1312 | commit_time); | ||
1313 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1314 | schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); | ||
1315 | } | ||
1284 | } | 1316 | } |
1285 | 1317 | ||
1286 | current->journal_info = NULL; | 1318 | current->journal_info = NULL; |
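Unlike jbd, which caps the joiner wait at one jiffy, jbd2 clamps it between two per-journal tunables, j_min_batch_time and j_max_batch_time, set to 0 and 15000 microseconds in journal_init_common() above. Since the measured average is in nanoseconds, both bounds are scaled by 1000. As a sketch (helper name illustrative):

/* Sketch only: bound the batching wait by the journal's tunables. */
static u64 clamp_batch_time_ns(journal_t *journal, u64 avg_commit_ns)
{
        u64 t = avg_commit_ns;

        t = max_t(u64, t, 1000ULL * journal->j_min_batch_time);  /* us -> ns */
        t = min_t(u64, t, 1000ULL * journal->j_max_batch_time);
        return t;
}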
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index c73fa89b5f8a..170d289ac785 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c | |||
@@ -22,9 +22,7 @@ | |||
22 | 22 | ||
23 | 23 | ||
24 | #define BIT_DIVIDER_MIPS 1043 | 24 | #define BIT_DIVIDER_MIPS 1043 |
25 | static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ | 25 | static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; |
26 | |||
27 | #include <linux/errno.h> | ||
28 | 26 | ||
29 | struct pushpull { | 27 | struct pushpull { |
30 | unsigned char *buf; | 28 | unsigned char *buf; |
@@ -43,7 +41,9 @@ struct rubin_state { | |||
43 | int bits[8]; | 41 | int bits[8]; |
44 | }; | 42 | }; |
45 | 43 | ||
46 | static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) | 44 | static inline void init_pushpull(struct pushpull *pp, char *buf, |
45 | unsigned buflen, unsigned ofs, | ||
46 | unsigned reserve) | ||
47 | { | 47 | { |
48 | pp->buf = buf; | 48 | pp->buf = buf; |
49 | pp->buflen = buflen; | 49 | pp->buflen = buflen; |
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen | |||
53 | 53 | ||
54 | static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) | 54 | static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) |
55 | { | 55 | { |
56 | if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { | 56 | if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) |
57 | return -ENOSPC; | 57 | return -ENOSPC; |
58 | } | ||
59 | 58 | ||
60 | if (bit) { | 59 | if (bit) |
61 | pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); | 60 | pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); |
62 | } | 61 | else |
63 | else { | 62 | pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); |
64 | pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); | 63 | |
65 | } | ||
66 | pp->ofs++; | 64 | pp->ofs++; |
67 | 65 | ||
68 | return 0; | 66 | return 0; |
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits) | |||
97 | rs->p = (long) (2 * UPPER_BIT_RUBIN); | 95 | rs->p = (long) (2 * UPPER_BIT_RUBIN); |
98 | rs->bit_number = (long) 0; | 96 | rs->bit_number = (long) 0; |
99 | rs->bit_divider = div; | 97 | rs->bit_divider = div; |
98 | |||
100 | for (c=0; c<8; c++) | 99 | for (c=0; c<8; c++) |
101 | rs->bits[c] = bits[c]; | 100 | rs->bits[c] = bits[c]; |
102 | } | 101 | } |
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) | |||
108 | long i0, i1; | 107 | long i0, i1; |
109 | int ret; | 108 | int ret; |
110 | 109 | ||
111 | while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { | 110 | while ((rs->q >= UPPER_BIT_RUBIN) || |
111 | ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { | ||
112 | rs->bit_number++; | 112 | rs->bit_number++; |
113 | 113 | ||
114 | ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); | 114 | ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); |
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) | |||
119 | rs->p <<= 1; | 119 | rs->p <<= 1; |
120 | } | 120 | } |
121 | i0 = A * rs->p / (A + B); | 121 | i0 = A * rs->p / (A + B); |
122 | if (i0 <= 0) { | 122 | if (i0 <= 0) |
123 | i0 = 1; | 123 | i0 = 1; |
124 | } | 124 | |
125 | if (i0 >= rs->p) { | 125 | if (i0 >= rs->p) |
126 | i0 = rs->p - 1; | 126 | i0 = rs->p - 1; |
127 | } | 127 | |
128 | i1 = rs->p - i0; | 128 | i1 = rs->p - i0; |
129 | 129 | ||
130 | if (symbol == 0) | 130 | if (symbol == 0) |
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits) | |||
157 | /* except lower */ | 157 | /* except lower */ |
158 | rs->rec_q = 0; | 158 | rs->rec_q = 0; |
159 | 159 | ||
160 | for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) | 160 | for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; |
161 | rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) | ||
161 | ; | 162 | ; |
162 | } | 163 | } |
163 | 164 | ||
164 | static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) | 165 | static void __do_decode(struct rubin_state *rs, unsigned long p, |
166 | unsigned long q) | ||
165 | { | 167 | { |
166 | register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; | 168 | register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; |
167 | unsigned long rec_q; | 169 | unsigned long rec_q; |
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B) | |||
207 | __do_decode(rs, p, q); | 209 | __do_decode(rs, p, q); |
208 | 210 | ||
209 | i0 = A * rs->p / (A + B); | 211 | i0 = A * rs->p / (A + B); |
210 | if (i0 <= 0) { | 212 | if (i0 <= 0) |
211 | i0 = 1; | 213 | i0 = 1; |
212 | } | 214 | |
213 | if (i0 >= rs->p) { | 215 | if (i0 >= rs->p) |
214 | i0 = rs->p - 1; | 216 | i0 = rs->p - 1; |
215 | } | ||
216 | 217 | ||
217 | threshold = rs->q + i0; | 218 | threshold = rs->q + i0; |
218 | symbol = rs->rec_q >= threshold; | 219 | symbol = rs->rec_q >= threshold; |
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte) | |||
234 | struct rubin_state rs_copy; | 235 | struct rubin_state rs_copy; |
235 | rs_copy = *rs; | 236 | rs_copy = *rs; |
236 | 237 | ||
237 | for (i=0;i<8;i++) { | 238 | for (i=0; i<8; i++) { |
238 | ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); | 239 | ret = encode(rs, rs->bit_divider-rs->bits[i], |
240 | rs->bits[i], byte & 1); | ||
239 | if (ret) { | 241 | if (ret) { |
240 | /* Failed. Restore old state */ | 242 | /* Failed. Restore old state */ |
241 | *rs = rs_copy; | 243 | *rs = rs_copy; |
242 | return ret; | 244 | return ret; |
243 | } | 245 | } |
244 | byte=byte>>1; | 246 | byte >>= 1; |
245 | } | 247 | } |
246 | return 0; | 248 | return 0; |
247 | } | 249 | } |
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs) | |||
251 | int i, result = 0, bit_divider = rs->bit_divider; | 253 | int i, result = 0, bit_divider = rs->bit_divider; |
252 | 254 | ||
253 | for (i = 0; i < 8; i++) | 255 | for (i = 0; i < 8; i++) |
254 | result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; | 256 | result |= decode(rs, bit_divider - rs->bits[i], |
257 | rs->bits[i]) << i; | ||
255 | 258 | ||
256 | return result; | 259 | return result; |
257 | } | 260 | } |
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs) | |||
259 | 262 | ||
260 | 263 | ||
261 | static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, | 264 | static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, |
262 | unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) | 265 | unsigned char *cpage_out, uint32_t *sourcelen, |
266 | uint32_t *dstlen) | ||
263 | { | 267 | { |
264 | int outpos = 0; | 268 | int outpos = 0; |
265 | int pos=0; | 269 | int pos=0; |
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, | |||
295 | int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, | 299 | int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, |
296 | uint32_t *sourcelen, uint32_t *dstlen, void *model) | 300 | uint32_t *sourcelen, uint32_t *dstlen, void *model) |
297 | { | 301 | { |
298 | return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); | 302 | return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, |
303 | cpage_out, sourcelen, dstlen); | ||
299 | } | 304 | } |
300 | #endif | 305 | #endif |
301 | static int jffs2_dynrubin_compress(unsigned char *data_in, | 306 | static int jffs2_dynrubin_compress(unsigned char *data_in, |
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, | |||
316 | return -1; | 321 | return -1; |
317 | 322 | ||
318 | memset(histo, 0, 256); | 323 | memset(histo, 0, 256); |
319 | for (i=0; i<mysrclen; i++) { | 324 | for (i=0; i<mysrclen; i++) |
320 | histo[data_in[i]]++; | 325 | histo[data_in[i]]++; |
321 | } | ||
322 | memset(bits, 0, sizeof(int)*8); | 326 | memset(bits, 0, sizeof(int)*8); |
323 | for (i=0; i<256; i++) { | 327 | for (i=0; i<256; i++) { |
324 | if (i&128) | 328 | if (i&128) |
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, | |||
346 | cpage_out[i] = bits[i]; | 350 | cpage_out[i] = bits[i]; |
347 | } | 351 | } |
348 | 352 | ||
349 | ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); | 353 | ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, |
354 | &mydstlen); | ||
350 | if (ret) | 355 | if (ret) |
351 | return ret; | 356 | return ret; |
352 | 357 | ||
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, | |||
363 | return 0; | 368 | return 0; |
364 | } | 369 | } |
365 | 370 | ||
366 | static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, | 371 | static void rubin_do_decompress(int bit_divider, int *bits, |
367 | unsigned char *page_out, uint32_t srclen, uint32_t destlen) | 372 | unsigned char *cdata_in, |
373 | unsigned char *page_out, uint32_t srclen, | ||
374 | uint32_t destlen) | ||
368 | { | 375 | { |
369 | int outpos = 0; | 376 | int outpos = 0; |
370 | struct rubin_state rs; | 377 | struct rubin_state rs; |
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata | |||
372 | init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); | 379 | init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); |
373 | init_decode(&rs, bit_divider, bits); | 380 | init_decode(&rs, bit_divider, bits); |
374 | 381 | ||
375 | while (outpos < destlen) { | 382 | while (outpos < destlen) |
376 | page_out[outpos++] = in_byte(&rs); | 383 | page_out[outpos++] = in_byte(&rs); |
377 | } | ||
378 | } | 384 | } |
379 | 385 | ||
380 | 386 | ||
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in, | |||
383 | uint32_t sourcelen, uint32_t dstlen, | 389 | uint32_t sourcelen, uint32_t dstlen, |
384 | void *model) | 390 | void *model) |
385 | { | 391 | { |
386 | rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); | 392 | rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, |
393 | cpage_out, sourcelen, dstlen); | ||
387 | return 0; | 394 | return 0; |
388 | } | 395 | } |
389 | 396 | ||
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in, | |||
398 | for (c=0; c<8; c++) | 405 | for (c=0; c<8; c++) |
399 | bits[c] = data_in[c]; | 406 | bits[c] = data_in[c]; |
400 | 407 | ||
401 | rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); | 408 | rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, |
409 | dstlen); | ||
402 | return 0; | 410 | return 0; |
403 | } | 411 | } |
404 | 412 | ||
405 | static struct jffs2_compressor jffs2_rubinmips_comp = { | 413 | static struct jffs2_compressor jffs2_rubinmips_comp = { |
406 | .priority = JFFS2_RUBINMIPS_PRIORITY, | 414 | .priority = JFFS2_RUBINMIPS_PRIORITY, |
407 | .name = "rubinmips", | 415 | .name = "rubinmips", |
408 | .compr = JFFS2_COMPR_DYNRUBIN, | 416 | .compr = JFFS2_COMPR_DYNRUBIN, |
409 | .compress = NULL, /*&jffs2_rubinmips_compress,*/ | 417 | .compress = NULL, /*&jffs2_rubinmips_compress,*/ |
410 | .decompress = &jffs2_rubinmips_decompress, | 418 | .decompress = &jffs2_rubinmips_decompress, |
411 | #ifdef JFFS2_RUBINMIPS_DISABLED | 419 | #ifdef JFFS2_RUBINMIPS_DISABLED |
412 | .disabled = 1, | 420 | .disabled = 1, |
413 | #else | 421 | #else |
414 | .disabled = 0, | 422 | .disabled = 0, |
415 | #endif | 423 | #endif |
416 | }; | 424 | }; |
417 | 425 | ||
418 | int jffs2_rubinmips_init(void) | 426 | int jffs2_rubinmips_init(void) |
419 | { | 427 | { |
420 | return jffs2_register_compressor(&jffs2_rubinmips_comp); | 428 | return jffs2_register_compressor(&jffs2_rubinmips_comp); |
421 | } | 429 | } |
422 | 430 | ||
423 | void jffs2_rubinmips_exit(void) | 431 | void jffs2_rubinmips_exit(void) |
424 | { | 432 | { |
425 | jffs2_unregister_compressor(&jffs2_rubinmips_comp); | 433 | jffs2_unregister_compressor(&jffs2_rubinmips_comp); |
426 | } | 434 | } |
427 | 435 | ||
428 | static struct jffs2_compressor jffs2_dynrubin_comp = { | 436 | static struct jffs2_compressor jffs2_dynrubin_comp = { |
429 | .priority = JFFS2_DYNRUBIN_PRIORITY, | 437 | .priority = JFFS2_DYNRUBIN_PRIORITY, |
430 | .name = "dynrubin", | 438 | .name = "dynrubin", |
431 | .compr = JFFS2_COMPR_RUBINMIPS, | 439 | .compr = JFFS2_COMPR_RUBINMIPS, |
432 | .compress = jffs2_dynrubin_compress, | 440 | .compress = jffs2_dynrubin_compress, |
433 | .decompress = &jffs2_dynrubin_decompress, | 441 | .decompress = &jffs2_dynrubin_decompress, |
434 | #ifdef JFFS2_DYNRUBIN_DISABLED | 442 | #ifdef JFFS2_DYNRUBIN_DISABLED |
435 | .disabled = 1, | 443 | .disabled = 1, |
436 | #else | 444 | #else |
437 | .disabled = 0, | 445 | .disabled = 0, |
438 | #endif | 446 | #endif |
439 | }; | 447 | }; |
440 | 448 | ||
441 | int jffs2_dynrubin_init(void) | 449 | int jffs2_dynrubin_init(void) |
442 | { | 450 | { |
443 | return jffs2_register_compressor(&jffs2_dynrubin_comp); | 451 | return jffs2_register_compressor(&jffs2_dynrubin_comp); |
444 | } | 452 | } |
445 | 453 | ||
446 | void jffs2_dynrubin_exit(void) | 454 | void jffs2_dynrubin_exit(void) |
447 | { | 455 | { |
448 | jffs2_unregister_compressor(&jffs2_dynrubin_comp); | 456 | jffs2_unregister_compressor(&jffs2_dynrubin_comp); |
449 | } | 457 | } |
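The pushbit()/pullbit() pair that the cleanup above reflows packs the Rubin coder's output MSB-first: bit n of the stream lives in byte n >> 3 under mask 1 << (7 - (n & 7)). A standalone sketch of the write side, without the pushpull reserve bookkeeping (names illustrative):

#include <stdint.h>

/* Sketch only: set or clear bit 'ofs' of an MSB-first bitstream. */
static void put_bit(uint8_t *buf, unsigned int ofs, int bit)
{
        uint8_t mask = 1u << (7 - (ofs & 7));

        if (bit)
                buf[ofs >> 3] |= mask;
        else
                buf[ofs >> 3] &= ~mask;
}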
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 259461b910af..c32b4a1ad6cf 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c | |||
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock | |||
175 | { | 175 | { |
176 | /* For NAND, if the failure did not occur at the device level for a | 176 | /* For NAND, if the failure did not occur at the device level for a |
177 | specific physical page, don't bother updating the bad block table. */ | 177 | specific physical page, don't bother updating the bad block table. */ |
178 | if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { | 178 | if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { |
179 | /* We had a device-level failure to erase. Let's see if we've | 179 | /* We had a device-level failure to erase. Let's see if we've |
180 | failed too many times. */ | 180 | failed too many times. */ |
181 | if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { | 181 | if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { |
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr) | |||
209 | struct erase_priv_struct *priv = (void *)instr->priv; | 209 | struct erase_priv_struct *priv = (void *)instr->priv; |
210 | 210 | ||
211 | if(instr->state != MTD_ERASE_DONE) { | 211 | if(instr->state != MTD_ERASE_DONE) { |
212 | printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); | 212 | printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", |
213 | (unsigned long long)instr->addr, instr->state); | ||
213 | jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); | 214 | jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); |
214 | } else { | 215 | } else { |
215 | jffs2_erase_succeeded(priv->c, priv->jeb); | 216 | jffs2_erase_succeeded(priv->c, priv->jeb); |
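Both erase.c hunks above are fallout from MTD growing 64-bit offsets: the erase address is now printed with %llx plus an explicit cast, and the sentinel comparison needs a truncating cast because jffs2 still tracks bad_offset in 32 bits. A sketch of the width issue (the sentinel definition here is illustrative):

#include <stdint.h>

#define MTD_FAIL_ADDR_UNKNOWN (-1LL)    /* all-ones 64-bit sentinel */

/* Sketch only: without the (uint32_t) cast, bad_offset is zero-extended
 * to 64 bits and can never equal the sentinel, so the bad-block update
 * would run even when no failure address was reported. */
static int failure_address_known(uint32_t bad_offset)
{
        return bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN;
}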
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 1750445556c3..507ed6ec1847 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h | |||
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c); | |||
366 | void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); | 366 | void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); |
367 | struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); | 367 | struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); |
368 | void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); | 368 | void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); |
369 | struct rb_node *rb_next(struct rb_node *); | ||
370 | struct rb_node *rb_prev(struct rb_node *); | ||
371 | void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); | ||
372 | int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); | 369 | int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); |
373 | uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); | 370 | uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); |
374 | struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, | 371 | struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, |
diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0dae345e481b..b37d1f78b854 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c | |||
@@ -543,7 +543,7 @@ out_kfree: | |||
543 | return ret; | 543 | return ret; |
544 | } | 544 | } |
545 | 545 | ||
546 | static void jfs_write_super_lockfs(struct super_block *sb) | 546 | static int jfs_freeze(struct super_block *sb) |
547 | { | 547 | { |
548 | struct jfs_sb_info *sbi = JFS_SBI(sb); | 548 | struct jfs_sb_info *sbi = JFS_SBI(sb); |
549 | struct jfs_log *log = sbi->log; | 549 | struct jfs_log *log = sbi->log; |
@@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb) | |||
553 | lmLogShutdown(log); | 553 | lmLogShutdown(log); |
554 | updateSuper(sb, FM_CLEAN); | 554 | updateSuper(sb, FM_CLEAN); |
555 | } | 555 | } |
556 | return 0; | ||
556 | } | 557 | } |
557 | 558 | ||
558 | static void jfs_unlockfs(struct super_block *sb) | 559 | static int jfs_unfreeze(struct super_block *sb) |
559 | { | 560 | { |
560 | struct jfs_sb_info *sbi = JFS_SBI(sb); | 561 | struct jfs_sb_info *sbi = JFS_SBI(sb); |
561 | struct jfs_log *log = sbi->log; | 562 | struct jfs_log *log = sbi->log; |
@@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb) | |||
568 | else | 569 | else |
569 | txResume(sb); | 570 | txResume(sb); |
570 | } | 571 | } |
572 | return 0; | ||
571 | } | 573 | } |
572 | 574 | ||
573 | static int jfs_get_sb(struct file_system_type *fs_type, | 575 | static int jfs_get_sb(struct file_system_type *fs_type, |
@@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = { | |||
735 | .delete_inode = jfs_delete_inode, | 737 | .delete_inode = jfs_delete_inode, |
736 | .put_super = jfs_put_super, | 738 | .put_super = jfs_put_super, |
737 | .sync_fs = jfs_sync_fs, | 739 | .sync_fs = jfs_sync_fs, |
738 | .write_super_lockfs = jfs_write_super_lockfs, | 740 | .freeze_fs = jfs_freeze, |
739 | .unlockfs = jfs_unlockfs, | 741 | .unfreeze_fs = jfs_unfreeze, |
740 | .statfs = jfs_statfs, | 742 | .statfs = jfs_statfs, |
741 | .remount_fs = jfs_remount, | 743 | .remount_fs = jfs_remount, |
742 | .show_options = jfs_show_options, | 744 | .show_options = jfs_show_options, |
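The jfs hunks above track a VFS interface change: write_super_lockfs/unlockfs became freeze_fs/unfreeze_fs, and both now return an int so a filesystem can fail a freeze instead of silently proceeding. For a hypothetical filesystem "foo", the wiring looks like this sketch:

static int foo_freeze(struct super_block *sb)
{
        /* quiesce transactions and write out a clean superblock */
        return 0;               /* or -errno to refuse the freeze */
}

static int foo_unfreeze(struct super_block *sb)
{
        /* restart the log and resume transactions */
        return 0;
}

static const struct super_operations foo_super_operations = {
        .freeze_fs   = foo_freeze,
        .unfreeze_fs = foo_unfreeze,
};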
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 31668b690e03..dd7957064a8c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/sunrpc/clnt.h> | 16 | #include <linux/sunrpc/clnt.h> |
17 | #include <linux/sunrpc/svc.h> | 17 | #include <linux/sunrpc/svc.h> |
18 | #include <linux/lockd/lockd.h> | 18 | #include <linux/lockd/lockd.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | 19 | ||
21 | #define NLMDBG_FACILITY NLMDBG_CLIENT | 20 | #define NLMDBG_FACILITY NLMDBG_CLIENT |
22 | #define NLMCLNT_GRACE_WAIT (5*HZ) | 21 | #define NLMCLNT_GRACE_WAIT (5*HZ) |
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) | |||
518 | unsigned char fl_type; | 517 | unsigned char fl_type; |
519 | int status = -ENOLCK; | 518 | int status = -ENOLCK; |
520 | 519 | ||
521 | if (nsm_monitor(host) < 0) { | 520 | if (nsm_monitor(host) < 0) |
522 | printk(KERN_NOTICE "lockd: failed to monitor %s\n", | ||
523 | host->h_name); | ||
524 | goto out; | 521 | goto out; |
525 | } | 522 | |
526 | fl->fl_flags |= FL_ACCESS; | 523 | fl->fl_flags |= FL_ACCESS; |
527 | status = do_vfs_lock(fl); | 524 | status = do_vfs_lock(fl); |
528 | fl->fl_flags = fl_flags; | 525 | fl->fl_flags = fl_flags; |
diff --git a/fs/lockd/host.c b/fs/lockd/host.c index abdebf76b820..99d737bd4325 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/sunrpc/clnt.h> | 15 | #include <linux/sunrpc/clnt.h> |
16 | #include <linux/sunrpc/svc.h> | 16 | #include <linux/sunrpc/svc.h> |
17 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
18 | #include <linux/lockd/sm_inter.h> | ||
19 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
20 | 19 | ||
21 | #include <net/ipv6.h> | 20 | #include <net/ipv6.h> |
@@ -32,11 +31,6 @@ static int nrhosts; | |||
32 | static DEFINE_MUTEX(nlm_host_mutex); | 31 | static DEFINE_MUTEX(nlm_host_mutex); |
33 | 32 | ||
34 | static void nlm_gc_hosts(void); | 33 | static void nlm_gc_hosts(void); |
35 | static struct nsm_handle *nsm_find(const struct sockaddr *sap, | ||
36 | const size_t salen, | ||
37 | const char *hostname, | ||
38 | const size_t hostname_len, | ||
39 | const int create); | ||
40 | 34 | ||
41 | struct nlm_lookup_host_info { | 35 | struct nlm_lookup_host_info { |
42 | const int server; /* search for server|client */ | 36 | const int server; /* search for server|client */ |
@@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap) | |||
105 | } | 99 | } |
106 | } | 100 | } |
107 | 101 | ||
108 | static void nlm_display_address(const struct sockaddr *sap, | ||
109 | char *buf, const size_t len) | ||
110 | { | ||
111 | const struct sockaddr_in *sin = (struct sockaddr_in *)sap; | ||
112 | const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; | ||
113 | |||
114 | switch (sap->sa_family) { | ||
115 | case AF_UNSPEC: | ||
116 | snprintf(buf, len, "unspecified"); | ||
117 | break; | ||
118 | case AF_INET: | ||
119 | snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); | ||
120 | break; | ||
121 | case AF_INET6: | ||
122 | if (ipv6_addr_v4mapped(&sin6->sin6_addr)) | ||
123 | snprintf(buf, len, "%pI4", | ||
124 | &sin6->sin6_addr.s6_addr32[3]); | ||
125 | else | ||
126 | snprintf(buf, len, "%pI6", &sin6->sin6_addr); | ||
127 | break; | ||
128 | default: | ||
129 | snprintf(buf, len, "unsupported address family"); | ||
130 | break; | ||
131 | } | ||
132 | } | ||
133 | |||
134 | /* | 102 | /* |
135 | * Common host lookup routine for server & client | 103 | * Common host lookup routine for server & client |
136 | */ | 104 | */ |
@@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) | |||
190 | atomic_inc(&nsm->sm_count); | 158 | atomic_inc(&nsm->sm_count); |
191 | else { | 159 | else { |
192 | host = NULL; | 160 | host = NULL; |
193 | nsm = nsm_find(ni->sap, ni->salen, | 161 | nsm = nsm_get_handle(ni->sap, ni->salen, |
194 | ni->hostname, ni->hostname_len, 1); | 162 | ni->hostname, ni->hostname_len); |
195 | if (!nsm) { | 163 | if (!nsm) { |
196 | dprintk("lockd: nlm_lookup_host failed; " | 164 | dprintk("lockd: nlm_lookup_host failed; " |
197 | "no nsm handle\n"); | 165 | "no nsm handle\n"); |
@@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) | |||
206 | goto out; | 174 | goto out; |
207 | } | 175 | } |
208 | host->h_name = nsm->sm_name; | 176 | host->h_name = nsm->sm_name; |
177 | host->h_addrbuf = nsm->sm_addrbuf; | ||
209 | memcpy(nlm_addr(host), ni->sap, ni->salen); | 178 | memcpy(nlm_addr(host), ni->sap, ni->salen); |
210 | host->h_addrlen = ni->salen; | 179 | host->h_addrlen = ni->salen; |
211 | nlm_clear_port(nlm_addr(host)); | 180 | nlm_clear_port(nlm_addr(host)); |
@@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) | |||
232 | 201 | ||
233 | nrhosts++; | 202 | nrhosts++; |
234 | 203 | ||
235 | nlm_display_address((struct sockaddr *)&host->h_addr, | ||
236 | host->h_addrbuf, sizeof(host->h_addrbuf)); | ||
237 | nlm_display_address((struct sockaddr *)&host->h_srcaddr, | ||
238 | host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); | ||
239 | |||
240 | dprintk("lockd: nlm_lookup_host created host %s\n", | 204 | dprintk("lockd: nlm_lookup_host created host %s\n", |
241 | host->h_name); | 205 | host->h_name); |
242 | 206 | ||
@@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host) | |||
256 | BUG_ON(!list_empty(&host->h_lockowners)); | 220 | BUG_ON(!list_empty(&host->h_lockowners)); |
257 | BUG_ON(atomic_read(&host->h_count)); | 221 | BUG_ON(atomic_read(&host->h_count)); |
258 | 222 | ||
259 | /* | ||
260 | * Release NSM handle and unmonitor host. | ||
261 | */ | ||
262 | nsm_unmonitor(host); | 223 | nsm_unmonitor(host); |
224 | nsm_release(host->h_nsmhandle); | ||
263 | 225 | ||
264 | clnt = host->h_rpcclnt; | 226 | clnt = host->h_rpcclnt; |
265 | if (clnt != NULL) | 227 | if (clnt != NULL) |
@@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host) | |||
378 | { | 340 | { |
379 | struct rpc_clnt *clnt; | 341 | struct rpc_clnt *clnt; |
380 | 342 | ||
381 | dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", | 343 | dprintk("lockd: nlm_bind_host %s (%s)\n", |
382 | host->h_name, host->h_addrbuf, host->h_srcaddrbuf); | 344 | host->h_name, host->h_addrbuf); |
383 | 345 | ||
384 | /* Lock host handle */ | 346 | /* Lock host handle */ |
385 | mutex_lock(&host->h_mutex); | 347 | mutex_lock(&host->h_mutex); |
@@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host) | |||
481 | } | 443 | } |
482 | } | 444 | } |
483 | 445 | ||
484 | /* | 446 | /** |
485 | * We were notified that the host indicated by address &sin | 447 | * nlm_host_rebooted - Release all resources held by rebooted host |
486 | * has rebooted. | 448 | * @info: pointer to decoded results of NLM_SM_NOTIFY call |
487 | * Release all resources held by that peer. | 449 | * |
450 | * We were notified that the specified host has rebooted. Release | ||
451 | * all resources held by that peer. | ||
488 | */ | 452 | */ |
489 | void nlm_host_rebooted(const struct sockaddr_in *sin, | 453 | void nlm_host_rebooted(const struct nlm_reboot *info) |
490 | const char *hostname, | ||
491 | unsigned int hostname_len, | ||
492 | u32 new_state) | ||
493 | { | 454 | { |
494 | struct hlist_head *chain; | 455 | struct hlist_head *chain; |
495 | struct hlist_node *pos; | 456 | struct hlist_node *pos; |
496 | struct nsm_handle *nsm; | 457 | struct nsm_handle *nsm; |
497 | struct nlm_host *host; | 458 | struct nlm_host *host; |
498 | 459 | ||
499 | nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), | 460 | nsm = nsm_reboot_lookup(info); |
500 | hostname, hostname_len, 0); | 461 | if (unlikely(nsm == NULL)) |
501 | if (nsm == NULL) { | ||
502 | dprintk("lockd: never saw rebooted peer '%.*s' before\n", | ||
503 | hostname_len, hostname); | ||
504 | return; | 462 | return; |
505 | } | ||
506 | |||
507 | dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", | ||
508 | hostname_len, hostname, nsm->sm_addrbuf); | ||
509 | |||
510 | /* When reclaiming locks on this peer, make sure that | ||
511 | * we set up a new notification */ | ||
512 | nsm->sm_monitored = 0; | ||
513 | 463 | ||
514 | /* Mark all hosts tied to this NSM state as having rebooted. | 464 | /* Mark all hosts tied to this NSM state as having rebooted. |
515 | * We run the loop repeatedly, because we drop the host table | 465 | * We run the loop repeatedly, because we drop the host table |
@@ -520,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex); | |||
520 | for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { | 470 | for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { |
521 | hlist_for_each_entry(host, pos, chain, h_hash) { | 471 | hlist_for_each_entry(host, pos, chain, h_hash) { |
522 | if (host->h_nsmhandle == nsm | 472 | if (host->h_nsmhandle == nsm |
523 | && host->h_nsmstate != new_state) { | 473 | && host->h_nsmstate != info->state) { |
524 | host->h_nsmstate = new_state; | 474 | host->h_nsmstate = info->state; |
525 | host->h_state++; | 475 | host->h_state++; |
526 | 476 | ||
527 | nlm_get_host(host); | 477 | nlm_get_host(host); |
@@ -629,89 +579,3 @@ nlm_gc_hosts(void) | |||
629 | 579 | ||
630 | next_gc = jiffies + NLM_HOST_COLLECT; | 580 | next_gc = jiffies + NLM_HOST_COLLECT; |
631 | } | 581 | } |
632 | |||
633 | |||
634 | /* | ||
635 | * Manage NSM handles | ||
636 | */ | ||
637 | static LIST_HEAD(nsm_handles); | ||
638 | static DEFINE_SPINLOCK(nsm_lock); | ||
639 | |||
640 | static struct nsm_handle *nsm_find(const struct sockaddr *sap, | ||
641 | const size_t salen, | ||
642 | const char *hostname, | ||
643 | const size_t hostname_len, | ||
644 | const int create) | ||
645 | { | ||
646 | struct nsm_handle *nsm = NULL; | ||
647 | struct nsm_handle *pos; | ||
648 | |||
649 | if (!sap) | ||
650 | return NULL; | ||
651 | |||
652 | if (hostname && memchr(hostname, '/', hostname_len) != NULL) { | ||
653 | if (printk_ratelimit()) { | ||
654 | printk(KERN_WARNING "Invalid hostname \"%.*s\" " | ||
655 | "in NFS lock request\n", | ||
656 | (int)hostname_len, hostname); | ||
657 | } | ||
658 | return NULL; | ||
659 | } | ||
660 | |||
661 | retry: | ||
662 | spin_lock(&nsm_lock); | ||
663 | list_for_each_entry(pos, &nsm_handles, sm_link) { | ||
664 | |||
665 | if (hostname && nsm_use_hostnames) { | ||
666 | if (strlen(pos->sm_name) != hostname_len | ||
667 | || memcmp(pos->sm_name, hostname, hostname_len)) | ||
668 | continue; | ||
669 | } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) | ||
670 | continue; | ||
671 | atomic_inc(&pos->sm_count); | ||
672 | kfree(nsm); | ||
673 | nsm = pos; | ||
674 | goto found; | ||
675 | } | ||
676 | if (nsm) { | ||
677 | list_add(&nsm->sm_link, &nsm_handles); | ||
678 | goto found; | ||
679 | } | ||
680 | spin_unlock(&nsm_lock); | ||
681 | |||
682 | if (!create) | ||
683 | return NULL; | ||
684 | |||
685 | nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL); | ||
686 | if (nsm == NULL) | ||
687 | return NULL; | ||
688 | |||
689 | memcpy(nsm_addr(nsm), sap, salen); | ||
690 | nsm->sm_addrlen = salen; | ||
691 | nsm->sm_name = (char *) (nsm + 1); | ||
692 | memcpy(nsm->sm_name, hostname, hostname_len); | ||
693 | nsm->sm_name[hostname_len] = '\0'; | ||
694 | nlm_display_address((struct sockaddr *)&nsm->sm_addr, | ||
695 | nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); | ||
696 | atomic_set(&nsm->sm_count, 1); | ||
697 | goto retry; | ||
698 | |||
699 | found: | ||
700 | spin_unlock(&nsm_lock); | ||
701 | return nsm; | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * Release an NSM handle | ||
706 | */ | ||
707 | void | ||
708 | nsm_release(struct nsm_handle *nsm) | ||
709 | { | ||
710 | if (!nsm) | ||
711 | return; | ||
712 | if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { | ||
713 | list_del(&nsm->sm_link); | ||
714 | spin_unlock(&nsm_lock); | ||
715 | kfree(nsm); | ||
716 | } | ||
717 | } | ||
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index ffd3461f75ef..5e2c4d5ac827 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c | |||
@@ -9,35 +9,123 @@ | |||
9 | #include <linux/types.h> | 9 | #include <linux/types.h> |
10 | #include <linux/utsname.h> | 10 | #include <linux/utsname.h> |
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/ktime.h> | ||
13 | |||
12 | #include <linux/sunrpc/clnt.h> | 14 | #include <linux/sunrpc/clnt.h> |
13 | #include <linux/sunrpc/xprtsock.h> | 15 | #include <linux/sunrpc/xprtsock.h> |
14 | #include <linux/sunrpc/svc.h> | 16 | #include <linux/sunrpc/svc.h> |
15 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
16 | #include <linux/lockd/sm_inter.h> | ||
17 | |||
18 | 18 | ||
19 | #define NLMDBG_FACILITY NLMDBG_MONITOR | 19 | #define NLMDBG_FACILITY NLMDBG_MONITOR |
20 | #define NSM_PROGRAM 100024 | ||
21 | #define NSM_VERSION 1 | ||
22 | |||
23 | enum { | ||
24 | NSMPROC_NULL, | ||
25 | NSMPROC_STAT, | ||
26 | NSMPROC_MON, | ||
27 | NSMPROC_UNMON, | ||
28 | NSMPROC_UNMON_ALL, | ||
29 | NSMPROC_SIMU_CRASH, | ||
30 | NSMPROC_NOTIFY, | ||
31 | }; | ||
32 | |||
33 | struct nsm_args { | ||
34 | struct nsm_private *priv; | ||
35 | u32 prog; /* RPC callback info */ | ||
36 | u32 vers; | ||
37 | u32 proc; | ||
20 | 38 | ||
21 | #define XDR_ADDRBUF_LEN (20) | 39 | char *mon_name; |
40 | }; | ||
22 | 41 | ||
23 | static struct rpc_clnt * nsm_create(void); | 42 | struct nsm_res { |
43 | u32 status; | ||
44 | u32 state; | ||
45 | }; | ||
24 | 46 | ||
25 | static struct rpc_program nsm_program; | 47 | static struct rpc_program nsm_program; |
48 | static LIST_HEAD(nsm_handles); | ||
49 | static DEFINE_SPINLOCK(nsm_lock); | ||
26 | 50 | ||
27 | /* | 51 | /* |
28 | * Local NSM state | 52 | * Local NSM state |
29 | */ | 53 | */ |
30 | int nsm_local_state; | 54 | int __read_mostly nsm_local_state; |
55 | int __read_mostly nsm_use_hostnames; | ||
31 | 56 | ||
32 | /* | 57 | static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) |
33 | * Common procedure for SM_MON/SM_UNMON calls | 58 | { |
34 | */ | 59 | return (struct sockaddr *)&nsm->sm_addr; |
35 | static int | 60 | } |
36 | nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | 61 | |
62 | static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, | ||
63 | const size_t len) | ||
64 | { | ||
65 | const struct sockaddr_in *sin = (struct sockaddr_in *)sap; | ||
66 | snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); | ||
67 | } | ||
68 | |||
69 | static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, | ||
70 | const size_t len) | ||
71 | { | ||
72 | const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; | ||
73 | |||
74 | if (ipv6_addr_v4mapped(&sin6->sin6_addr)) | ||
75 | snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); | ||
76 | else if (sin6->sin6_scope_id != 0) | ||
77 | snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, | ||
78 | sin6->sin6_scope_id); | ||
79 | else | ||
80 | snprintf(buf, len, "%pI6", &sin6->sin6_addr); | ||
81 | } | ||
82 | |||
83 | static void nsm_display_address(const struct sockaddr *sap, | ||
84 | char *buf, const size_t len) | ||
85 | { | ||
86 | switch (sap->sa_family) { | ||
87 | case AF_INET: | ||
88 | nsm_display_ipv4_address(sap, buf, len); | ||
89 | break; | ||
90 | case AF_INET6: | ||
91 | nsm_display_ipv6_address(sap, buf, len); | ||
92 | break; | ||
93 | default: | ||
94 | snprintf(buf, len, "unsupported address family"); | ||
95 | break; | ||
96 | } | ||
97 | } | ||
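
The per-family display helpers above reproduce, in fs/lockd/mon.c, the nlm_display_address() logic removed from fs/lockd/host.c, adding a branch that prints an IPv6 scope ID and relying on the kernel's %pI4/%pI6 printk specifiers. A hypothetical userspace analogue of the same formatting decisions, using inet_ntop() (display_address is an illustrative name; the scope-ID branch is omitted for brevity):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>

    static void display_address(const struct sockaddr *sap,
                                char *buf, size_t len)
    {
        switch (sap->sa_family) {
        case AF_INET: {
            const struct sockaddr_in *sin =
                    (const struct sockaddr_in *)sap;
            inet_ntop(AF_INET, &sin->sin_addr, buf, len);
            break;
        }
        case AF_INET6: {
            const struct sockaddr_in6 *sin6 =
                    (const struct sockaddr_in6 *)sap;
            if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
                /* the last 4 bytes hold the embedded IPv4 address */
                inet_ntop(AF_INET, &sin6->sin6_addr.s6_addr[12],
                          buf, len);
            else
                inet_ntop(AF_INET6, &sin6->sin6_addr, buf, len);
            break;
        }
        default:
            snprintf(buf, len, "unsupported address family");
        }
    }
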
98 | |||
99 | static struct rpc_clnt *nsm_create(void) | ||
100 | { | ||
101 | struct sockaddr_in sin = { | ||
102 | .sin_family = AF_INET, | ||
103 | .sin_addr.s_addr = htonl(INADDR_LOOPBACK), | ||
104 | }; | ||
105 | struct rpc_create_args args = { | ||
106 | .protocol = XPRT_TRANSPORT_UDP, | ||
107 | .address = (struct sockaddr *)&sin, | ||
108 | .addrsize = sizeof(sin), | ||
109 | .servername = "rpc.statd", | ||
110 | .program = &nsm_program, | ||
111 | .version = NSM_VERSION, | ||
112 | .authflavor = RPC_AUTH_NULL, | ||
113 | }; | ||
114 | |||
115 | return rpc_create(&args); | ||
116 | } | ||
117 | |||
118 | static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | ||
37 | { | 119 | { |
38 | struct rpc_clnt *clnt; | 120 | struct rpc_clnt *clnt; |
39 | int status; | 121 | int status; |
40 | struct nsm_args args; | 122 | struct nsm_args args = { |
123 | .priv = &nsm->sm_priv, | ||
124 | .prog = NLM_PROGRAM, | ||
125 | .vers = 3, | ||
126 | .proc = NLMPROC_NSM_NOTIFY, | ||
127 | .mon_name = nsm->sm_mon_name, | ||
128 | }; | ||
41 | struct rpc_message msg = { | 129 | struct rpc_message msg = { |
42 | .rpc_argp = &args, | 130 | .rpc_argp = &args, |
43 | .rpc_resp = res, | 131 | .rpc_resp = res, |
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | |||
46 | clnt = nsm_create(); | 134 | clnt = nsm_create(); |
47 | if (IS_ERR(clnt)) { | 135 | if (IS_ERR(clnt)) { |
48 | status = PTR_ERR(clnt); | 136 | status = PTR_ERR(clnt); |
137 | dprintk("lockd: failed to create NSM upcall transport, " | ||
138 | "status=%d\n", status); | ||
49 | goto out; | 139 | goto out; |
50 | } | 140 | } |
51 | 141 | ||
52 | memset(&args, 0, sizeof(args)); | ||
53 | args.mon_name = nsm->sm_name; | ||
54 | args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; | ||
55 | args.prog = NLM_PROGRAM; | ||
56 | args.vers = 3; | ||
57 | args.proc = NLMPROC_NSM_NOTIFY; | ||
58 | memset(res, 0, sizeof(*res)); | 142 | memset(res, 0, sizeof(*res)); |
59 | 143 | ||
60 | msg.rpc_proc = &clnt->cl_procinfo[proc]; | 144 | msg.rpc_proc = &clnt->cl_procinfo[proc]; |
61 | status = rpc_call_sync(clnt, &msg, 0); | 145 | status = rpc_call_sync(clnt, &msg, 0); |
62 | if (status < 0) | 146 | if (status < 0) |
63 | printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", | 147 | dprintk("lockd: NSM upcall RPC failed, status=%d\n", |
64 | status); | 148 | status); |
65 | else | 149 | else |
66 | status = 0; | 150 | status = 0; |
67 | rpc_shutdown_client(clnt); | 151 | rpc_shutdown_client(clnt); |
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | |||
69 | return status; | 153 | return status; |
70 | } | 154 | } |
71 | 155 | ||
72 | /* | 156 | /** |
73 | * Set up monitoring of a remote host | 157 | * nsm_monitor - Notify a peer in case we reboot |
158 | * @host: pointer to nlm_host of peer to notify | ||
159 | * | ||
160 | * If this peer is not already monitored, this function sends an | ||
161 | * upcall to the local rpc.statd to record the name/address of | ||
162 | * the peer to notify in case we reboot. | ||
163 | * | ||
164 | * Returns zero if the peer is monitored by the local rpc.statd; | ||
165 | * otherwise a negative errno value is returned. | ||
74 | */ | 166 | */ |
75 | int | 167 | int nsm_monitor(const struct nlm_host *host) |
76 | nsm_monitor(struct nlm_host *host) | ||
77 | { | 168 | { |
78 | struct nsm_handle *nsm = host->h_nsmhandle; | 169 | struct nsm_handle *nsm = host->h_nsmhandle; |
79 | struct nsm_res res; | 170 | struct nsm_res res; |
80 | int status; | 171 | int status; |
81 | 172 | ||
82 | dprintk("lockd: nsm_monitor(%s)\n", host->h_name); | 173 | dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); |
83 | BUG_ON(nsm == NULL); | ||
84 | 174 | ||
85 | if (nsm->sm_monitored) | 175 | if (nsm->sm_monitored) |
86 | return 0; | 176 | return 0; |
87 | 177 | ||
88 | status = nsm_mon_unmon(nsm, SM_MON, &res); | 178 | /* |
179 | * Choose whether to record the caller_name or IP address of | ||
180 | * this peer in the local rpc.statd's database. | ||
181 | */ | ||
182 | nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; | ||
89 | 183 | ||
90 | if (status < 0 || res.status != 0) | 184 | status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); |
91 | printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); | 185 | if (res.status != 0) |
186 | status = -EIO; | ||
187 | if (status < 0) | ||
188 | printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); | ||
92 | else | 189 | else |
93 | nsm->sm_monitored = 1; | 190 | nsm->sm_monitored = 1; |
94 | return status; | 191 | return status; |
95 | } | 192 | } |
96 | 193 | ||
97 | /* | 194 | /** |
98 | * Cease to monitor remote host | 195 | * nsm_unmonitor - Unregister peer notification |
196 | * @host: pointer to nlm_host of peer to stop monitoring | ||
197 | * | ||
198 | * If this peer is monitored, this function sends an upcall to | ||
199 | * tell the local rpc.statd not to send this peer a notification | ||
200 | * when we reboot. | ||
99 | */ | 201 | */ |
100 | int | 202 | void nsm_unmonitor(const struct nlm_host *host) |
101 | nsm_unmonitor(struct nlm_host *host) | ||
102 | { | 203 | { |
103 | struct nsm_handle *nsm = host->h_nsmhandle; | 204 | struct nsm_handle *nsm = host->h_nsmhandle; |
104 | struct nsm_res res; | 205 | struct nsm_res res; |
105 | int status = 0; | 206 | int status; |
106 | |||
107 | if (nsm == NULL) | ||
108 | return 0; | ||
109 | host->h_nsmhandle = NULL; | ||
110 | 207 | ||
111 | if (atomic_read(&nsm->sm_count) == 1 | 208 | if (atomic_read(&nsm->sm_count) == 1 |
112 | && nsm->sm_monitored && !nsm->sm_sticky) { | 209 | && nsm->sm_monitored && !nsm->sm_sticky) { |
113 | dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); | 210 | dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); |
114 | 211 | ||
115 | status = nsm_mon_unmon(nsm, SM_UNMON, &res); | 212 | status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); |
213 | if (res.status != 0) | ||
214 | status = -EIO; | ||
116 | if (status < 0) | 215 | if (status < 0) |
117 | printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", | 216 | printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", |
118 | host->h_name); | 217 | nsm->sm_name); |
119 | else | 218 | else |
120 | nsm->sm_monitored = 0; | 219 | nsm->sm_monitored = 0; |
121 | } | 220 | } |
122 | nsm_release(nsm); | 221 | } |
123 | return status; | 222 | |
223 | static struct nsm_handle *nsm_lookup_hostname(const char *hostname, | ||
224 | const size_t len) | ||
225 | { | ||
226 | struct nsm_handle *nsm; | ||
227 | |||
228 | list_for_each_entry(nsm, &nsm_handles, sm_link) | ||
229 | if (strlen(nsm->sm_name) == len && | ||
230 | memcmp(nsm->sm_name, hostname, len) == 0) | ||
231 | return nsm; | ||
232 | return NULL; | ||
233 | } | ||
234 | |||
235 | static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) | ||
236 | { | ||
237 | struct nsm_handle *nsm; | ||
238 | |||
239 | list_for_each_entry(nsm, &nsm_handles, sm_link) | ||
240 | if (nlm_cmp_addr(nsm_addr(nsm), sap)) | ||
241 | return nsm; | ||
242 | return NULL; | ||
243 | } | ||
244 | |||
245 | static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) | ||
246 | { | ||
247 | struct nsm_handle *nsm; | ||
248 | |||
249 | list_for_each_entry(nsm, &nsm_handles, sm_link) | ||
250 | if (memcmp(nsm->sm_priv.data, priv->data, | ||
251 | sizeof(priv->data)) == 0) | ||
252 | return nsm; | ||
253 | return NULL; | ||
124 | } | 254 | } |
125 | 255 | ||
126 | /* | 256 | /* |
127 | * Create NSM client for the local host | 257 | * Construct a unique cookie to match this nsm_handle to this monitored |
258 | * host. It is passed to the local rpc.statd via NSMPROC_MON, and | ||
259 | * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these | ||
260 | * requests. | ||
261 | * | ||
262 | * The NSM protocol requires that these cookies be unique while the | ||
263 | * system is running. We prefer a stronger requirement of making them | ||
264 | * unique across reboots. If user space bugs cause a stale cookie to | ||
265 | * be sent to the kernel, it could cause the wrong host to lose its | ||
266 | * lock state if cookies were not unique across reboots. | ||
267 | * | ||
268 | * The cookies are exposed only to local user space via loopback. They | ||
269 | * do not appear on the physical network. If we want greater security | ||
270 | * for some reason, nsm_init_private() could perform a one-way hash to | ||
271 | * obscure the contents of the cookie. | ||
128 | */ | 272 | */ |
129 | static struct rpc_clnt * | 273 | static void nsm_init_private(struct nsm_handle *nsm) |
130 | nsm_create(void) | ||
131 | { | 274 | { |
132 | struct sockaddr_in sin = { | 275 | u64 *p = (u64 *)&nsm->sm_priv.data; |
133 | .sin_family = AF_INET, | 276 | struct timespec ts; |
134 | .sin_addr.s_addr = htonl(INADDR_LOOPBACK), | ||
135 | .sin_port = 0, | ||
136 | }; | ||
137 | struct rpc_create_args args = { | ||
138 | .protocol = XPRT_TRANSPORT_UDP, | ||
139 | .address = (struct sockaddr *)&sin, | ||
140 | .addrsize = sizeof(sin), | ||
141 | .servername = "localhost", | ||
142 | .program = &nsm_program, | ||
143 | .version = SM_VERSION, | ||
144 | .authflavor = RPC_AUTH_NULL, | ||
145 | }; | ||
146 | 277 | ||
147 | return rpc_create(&args); | 278 | ktime_get_ts(&ts); |
279 | *p++ = timespec_to_ns(&ts); | ||
280 | *p = (unsigned long)nsm; | ||
281 | } | ||
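
The comment above documents the cookie contract; nsm_init_private() implements it by packing a nanosecond-resolution monotonic timestamp and the handle's kernel address into the 16-byte "priv" blob. A hypothetical userspace rendering of the same recipe, with clock_gettime() standing in for ktime_get_ts() (init_private is an illustrative name):

    #include <stdint.h>
    #include <string.h>
    #include <time.h>

    #define SM_PRIV_SIZE 16

    static void init_private(const void *handle,
                             unsigned char priv[SM_PRIV_SIZE])
    {
        struct timespec ts;
        uint64_t words[2];

        /* boot-relative timestamp: unique across reboots */
        clock_gettime(CLOCK_MONOTONIC, &ts);
        words[0] = (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
        /* object address: unique among handles in the same tick */
        words[1] = (uintptr_t)handle;
        memcpy(priv, words, sizeof(words));
    }
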
282 | |||
283 | static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, | ||
284 | const size_t salen, | ||
285 | const char *hostname, | ||
286 | const size_t hostname_len) | ||
287 | { | ||
288 | struct nsm_handle *new; | ||
289 | |||
290 | new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); | ||
291 | if (unlikely(new == NULL)) | ||
292 | return NULL; | ||
293 | |||
294 | atomic_set(&new->sm_count, 1); | ||
295 | new->sm_name = (char *)(new + 1); | ||
296 | memcpy(nsm_addr(new), sap, salen); | ||
297 | new->sm_addrlen = salen; | ||
298 | nsm_init_private(new); | ||
299 | nsm_display_address((const struct sockaddr *)&new->sm_addr, | ||
300 | new->sm_addrbuf, sizeof(new->sm_addrbuf)); | ||
301 | memcpy(new->sm_name, hostname, hostname_len); | ||
302 | new->sm_name[hostname_len] = '\0'; | ||
303 | |||
304 | return new; | ||
305 | } | ||
306 | |||
307 | /** | ||
308 | * nsm_get_handle - Find or create a cached nsm_handle | ||
309 | * @sap: pointer to socket address of handle to find | ||
310 | * @salen: length of socket address | ||
311 | * @hostname: pointer to C string containing hostname to find | ||
312 | * @hostname_len: length of C string | ||
313 | * | ||
314 | * Behavior is modulated by the global nsm_use_hostnames variable. | ||
315 | * | ||
316 | * Returns a cached nsm_handle after bumping its ref count, or | ||
317 | * returns a fresh nsm_handle if a handle that matches @sap and/or | ||
318 | * @hostname cannot be found in the handle cache. Returns NULL if | ||
319 | * an error occurs. | ||
320 | */ | ||
321 | struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, | ||
322 | const size_t salen, const char *hostname, | ||
323 | const size_t hostname_len) | ||
324 | { | ||
325 | struct nsm_handle *cached, *new = NULL; | ||
326 | |||
327 | if (hostname && memchr(hostname, '/', hostname_len) != NULL) { | ||
328 | if (printk_ratelimit()) { | ||
329 | printk(KERN_WARNING "Invalid hostname \"%.*s\" " | ||
330 | "in NFS lock request\n", | ||
331 | (int)hostname_len, hostname); | ||
332 | } | ||
333 | return NULL; | ||
334 | } | ||
335 | |||
336 | retry: | ||
337 | spin_lock(&nsm_lock); | ||
338 | |||
339 | if (nsm_use_hostnames && hostname != NULL) | ||
340 | cached = nsm_lookup_hostname(hostname, hostname_len); | ||
341 | else | ||
342 | cached = nsm_lookup_addr(sap); | ||
343 | |||
344 | if (cached != NULL) { | ||
345 | atomic_inc(&cached->sm_count); | ||
346 | spin_unlock(&nsm_lock); | ||
347 | kfree(new); | ||
348 | dprintk("lockd: found nsm_handle for %s (%s), " | ||
349 | "cnt %d\n", cached->sm_name, | ||
350 | cached->sm_addrbuf, | ||
351 | atomic_read(&cached->sm_count)); | ||
352 | return cached; | ||
353 | } | ||
354 | |||
355 | if (new != NULL) { | ||
356 | list_add(&new->sm_link, &nsm_handles); | ||
357 | spin_unlock(&nsm_lock); | ||
358 | dprintk("lockd: created nsm_handle for %s (%s)\n", | ||
359 | new->sm_name, new->sm_addrbuf); | ||
360 | return new; | ||
361 | } | ||
362 | |||
363 | spin_unlock(&nsm_lock); | ||
364 | |||
365 | new = nsm_create_handle(sap, salen, hostname, hostname_len); | ||
366 | if (unlikely(new == NULL)) | ||
367 | return NULL; | ||
368 | goto retry; | ||
369 | } | ||
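
nsm_get_handle() is the classic optimistic lookup-or-insert: search under nsm_lock; on a miss, drop the spinlock (a GFP_KERNEL allocation may sleep, so it cannot happen with the lock held), allocate, and retry, discarding the fresh allocation if another thread inserted a matching handle in the meantime. A hedged userspace sketch of the same pattern with a pthread mutex; struct handle and its fields are illustrative stand-ins, not kernel types:

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    struct handle {
        struct handle *next;
        char *key;
        int refcount;
    };

    static struct handle *handles;
    static pthread_mutex_t handles_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct handle *find_locked(const char *key)
    {
        struct handle *h;

        for (h = handles; h != NULL; h = h->next)
            if (strcmp(h->key, key) == 0)
                return h;
        return NULL;
    }

    struct handle *get_handle(const char *key)
    {
        struct handle *cached, *new = NULL;

    retry:
        pthread_mutex_lock(&handles_lock);
        cached = find_locked(key);
        if (cached != NULL) {
            cached->refcount++;
            pthread_mutex_unlock(&handles_lock);
            if (new != NULL) {          /* lost the race: discard ours */
                free(new->key);
                free(new);
            }
            return cached;
        }
        if (new != NULL) {              /* second pass: publish ours */
            new->next = handles;
            handles = new;
            pthread_mutex_unlock(&handles_lock);
            return new;
        }
        pthread_mutex_unlock(&handles_lock);

        new = calloc(1, sizeof(*new));  /* allocate without the lock */
        if (new == NULL)
            return NULL;
        new->key = strdup(key);
        if (new->key == NULL) {
            free(new);
            return NULL;
        }
        new->refcount = 1;
        goto retry;
    }
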
370 | |||
371 | /** | ||
372 | * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle | ||
373 | * @info: pointer to NLMPROC_SM_NOTIFY arguments | ||
374 | * | ||
375 | * Returns a matching nsm_handle if found in the nsm cache; the returned | ||
376 | * nsm_handle's reference count is bumped and sm_monitored is cleared. | ||
377 | * Otherwise returns NULL if no matching handle is cached. | ||
378 | */ | ||
379 | struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) | ||
380 | { | ||
381 | struct nsm_handle *cached; | ||
382 | |||
383 | spin_lock(&nsm_lock); | ||
384 | |||
385 | cached = nsm_lookup_priv(&info->priv); | ||
386 | if (unlikely(cached == NULL)) { | ||
387 | spin_unlock(&nsm_lock); | ||
388 | dprintk("lockd: never saw rebooted peer '%.*s' before\n", | ||
389 | info->len, info->mon); | ||
390 | return cached; | ||
391 | } | ||
392 | |||
393 | atomic_inc(&cached->sm_count); | ||
394 | spin_unlock(&nsm_lock); | ||
395 | |||
396 | /* | ||
397 | * During subsequent lock activity, force a fresh | ||
398 | * notification to be set up for this host. | ||
399 | */ | ||
400 | cached->sm_monitored = 0; | ||
401 | |||
402 | dprintk("lockd: host %s (%s) rebooted, cnt %d\n", | ||
403 | cached->sm_name, cached->sm_addrbuf, | ||
404 | atomic_read(&cached->sm_count)); | ||
405 | return cached; | ||
406 | } | ||
407 | |||
408 | /** | ||
409 | * nsm_release - Release an NSM handle | ||
410 | * @nsm: pointer to handle to be released | ||
411 | * | ||
412 | */ | ||
413 | void nsm_release(struct nsm_handle *nsm) | ||
414 | { | ||
415 | if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { | ||
416 | list_del(&nsm->sm_link); | ||
417 | spin_unlock(&nsm_lock); | ||
418 | dprintk("lockd: destroyed nsm_handle for %s (%s)\n", | ||
419 | nsm->sm_name, nsm->sm_addrbuf); | ||
420 | kfree(nsm); | ||
421 | } | ||
148 | } | 422 | } |
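
And the companion release, mirroring nsm_release(): the final reference unlinks the object under the same lock that guards lookups (the kernel's atomic_dec_and_lock() takes the spinlock only when the count is about to hit zero), so a concurrent get_handle() can never see a half-freed entry. This continues the userspace sketch above:

    void put_handle(struct handle *h)
    {
        struct handle **pp;

        pthread_mutex_lock(&handles_lock);
        if (--h->refcount > 0) {
            pthread_mutex_unlock(&handles_lock);
            return;
        }
        for (pp = &handles; *pp != NULL; pp = &(*pp)->next)
            if (*pp == h) {
                *pp = h->next;          /* unlink before freeing */
                break;
            }
        pthread_mutex_unlock(&handles_lock);
        free(h->key);
        free(h);
    }
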
149 | 423 | ||
150 | /* | 424 | /* |
@@ -154,127 +428,132 @@ nsm_create(void) | |||
154 | * Status Monitor wire protocol. | 428 | * Status Monitor wire protocol. |
155 | */ | 429 | */ |
156 | 430 | ||
157 | static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) | 431 | static int encode_nsm_string(struct xdr_stream *xdr, const char *string) |
158 | { | 432 | { |
159 | size_t len = strlen(string); | 433 | const u32 len = strlen(string); |
160 | 434 | __be32 *p; | |
161 | if (len > SM_MAXSTRLEN) | 435 | |
162 | len = SM_MAXSTRLEN; | 436 | if (unlikely(len > SM_MAXSTRLEN)) |
163 | return xdr_encode_opaque(p, string, len); | 437 | return -EIO; |
438 | p = xdr_reserve_space(xdr, sizeof(u32) + len); | ||
439 | if (unlikely(p == NULL)) | ||
440 | return -EIO; | ||
441 | xdr_encode_opaque(p, string, len); | ||
442 | return 0; | ||
164 | } | 443 | } |
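
encode_nsm_string() emits a standard XDR variable-length opaque: a four-byte big-endian length, the bytes themselves, then zero padding out to a four-byte boundary (xdr_reserve_space() and xdr_encode_opaque() take care of the alignment in-kernel). A small userspace sketch of the resulting wire layout, assuming a caller-supplied buffer (xdr_encode_opaque_buf is an illustrative name):

    #include <arpa/inet.h>      /* htonl */
    #include <stdint.h>
    #include <string.h>

    /* Returns bytes written, or -1 if the buffer is too small (the
     * analogue of xdr_reserve_space() returning NULL). */
    static int xdr_encode_opaque_buf(unsigned char *buf, size_t buflen,
                                     const void *data, uint32_t len)
    {
        uint32_t padded = (len + 3) & ~3u;
        uint32_t belen = htonl(len);

        if (buflen < 4 + padded)
            return -1;
        memcpy(buf, &belen, 4);                 /* big-endian length */
        memcpy(buf + 4, data, len);             /* payload */
        memset(buf + 4 + len, 0, padded - len); /* zero pad */
        return 4 + padded;
    }
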
165 | 444 | ||
166 | /* | 445 | /* |
167 | * "mon_name" specifies the host to be monitored. | 446 | * "mon_name" specifies the host to be monitored. |
168 | * | ||
169 | * Linux uses a text version of the IP address of the remote | ||
170 | * host as the host identifier (the "mon_name" argument). | ||
171 | * | ||
172 | * Linux statd always looks up the canonical hostname first for | ||
173 | * whatever remote hostname it receives, so this works alright. | ||
174 | */ | 447 | */ |
175 | static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) | 448 | static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) |
176 | { | 449 | { |
177 | char buffer[XDR_ADDRBUF_LEN + 1]; | 450 | return encode_nsm_string(xdr, argp->mon_name); |
178 | char *name = argp->mon_name; | ||
179 | |||
180 | if (!nsm_use_hostnames) { | ||
181 | snprintf(buffer, XDR_ADDRBUF_LEN, | ||
182 | "%pI4", &argp->addr); | ||
183 | name = buffer; | ||
184 | } | ||
185 | |||
186 | return xdr_encode_nsm_string(p, name); | ||
187 | } | 451 | } |
188 | 452 | ||
189 | /* | 453 | /* |
190 | * The "my_id" argument specifies the hostname and RPC procedure | 454 | * The "my_id" argument specifies the hostname and RPC procedure |
191 | * to be called when the status manager receives notification | 455 | * to be called when the status manager receives notification |
192 | * (via the SM_NOTIFY call) that the state of host "mon_name" | 456 | * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" |
193 | * has changed. | 457 | * has changed. |
194 | */ | 458 | */ |
195 | static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) | 459 | static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) |
196 | { | 460 | { |
197 | p = xdr_encode_nsm_string(p, utsname()->nodename); | 461 | int status; |
198 | if (!p) | 462 | __be32 *p; |
199 | return ERR_PTR(-EIO); | 463 | |
200 | 464 | status = encode_nsm_string(xdr, utsname()->nodename); | |
465 | if (unlikely(status != 0)) | ||
466 | return status; | ||
467 | p = xdr_reserve_space(xdr, 3 * sizeof(u32)); | ||
468 | if (unlikely(p == NULL)) | ||
469 | return -EIO; | ||
201 | *p++ = htonl(argp->prog); | 470 | *p++ = htonl(argp->prog); |
202 | *p++ = htonl(argp->vers); | 471 | *p++ = htonl(argp->vers); |
203 | *p++ = htonl(argp->proc); | 472 | *p++ = htonl(argp->proc); |
204 | 473 | return 0; | |
205 | return p; | ||
206 | } | 474 | } |
207 | 475 | ||
208 | /* | 476 | /* |
209 | * The "mon_id" argument specifies the non-private arguments | 477 | * The "mon_id" argument specifies the non-private arguments |
210 | * of an SM_MON or SM_UNMON call. | 478 | * of an NSMPROC_MON or NSMPROC_UNMON call. |
211 | */ | 479 | */ |
212 | static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) | 480 | static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) |
213 | { | 481 | { |
214 | p = xdr_encode_mon_name(p, argp); | 482 | int status; |
215 | if (!p) | ||
216 | return ERR_PTR(-EIO); | ||
217 | 483 | ||
218 | return xdr_encode_my_id(p, argp); | 484 | status = encode_mon_name(xdr, argp); |
485 | if (unlikely(status != 0)) | ||
486 | return status; | ||
487 | return encode_my_id(xdr, argp); | ||
219 | } | 488 | } |
220 | 489 | ||
221 | /* | 490 | /* |
222 | * The "priv" argument may contain private information required | 491 | * The "priv" argument may contain private information required |
223 | * by the SM_MON call. This information will be supplied in the | 492 | * by the NSMPROC_MON call. This information will be supplied in the |
224 | * SM_NOTIFY call. | 493 | * NLMPROC_SM_NOTIFY call. |
225 | * | ||
226 | * Linux provides the raw IP address of the monitored host, | ||
227 | * left in network byte order. | ||
228 | */ | 494 | */ |
229 | static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) | 495 | static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) |
230 | { | 496 | { |
231 | *p++ = argp->addr; | 497 | __be32 *p; |
232 | *p++ = 0; | ||
233 | *p++ = 0; | ||
234 | *p++ = 0; | ||
235 | 498 | ||
236 | return p; | 499 | p = xdr_reserve_space(xdr, SM_PRIV_SIZE); |
500 | if (unlikely(p == NULL)) | ||
501 | return -EIO; | ||
502 | xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); | ||
503 | return 0; | ||
237 | } | 504 | } |
238 | 505 | ||
239 | static int | 506 | static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, |
240 | xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) | 507 | const struct nsm_args *argp) |
241 | { | 508 | { |
242 | p = xdr_encode_mon_id(p, argp); | 509 | struct xdr_stream xdr; |
243 | if (IS_ERR(p)) | 510 | int status; |
244 | return PTR_ERR(p); | 511 | |
245 | 512 | xdr_init_encode(&xdr, &req->rq_snd_buf, p); | |
246 | p = xdr_encode_priv(p, argp); | 513 | status = encode_mon_id(&xdr, argp); |
247 | if (IS_ERR(p)) | 514 | if (unlikely(status)) |
248 | return PTR_ERR(p); | 515 | return status; |
249 | 516 | return encode_priv(&xdr, argp); | |
250 | rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); | ||
251 | return 0; | ||
252 | } | 517 | } |
253 | 518 | ||
254 | static int | 519 | static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, |
255 | xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) | 520 | const struct nsm_args *argp) |
256 | { | 521 | { |
257 | p = xdr_encode_mon_id(p, argp); | 522 | struct xdr_stream xdr; |
258 | if (IS_ERR(p)) | 523 | |
259 | return PTR_ERR(p); | 524 | xdr_init_encode(&xdr, &req->rq_snd_buf, p); |
260 | rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); | 525 | return encode_mon_id(&xdr, argp); |
261 | return 0; | ||
262 | } | 526 | } |
263 | 527 | ||
264 | static int | 528 | static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, |
265 | xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) | 529 | struct nsm_res *resp) |
266 | { | 530 | { |
531 | struct xdr_stream xdr; | ||
532 | |||
533 | xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); | ||
534 | p = xdr_inline_decode(&xdr, 2 * sizeof(u32)); | ||
535 | if (unlikely(p == NULL)) | ||
536 | return -EIO; | ||
267 | resp->status = ntohl(*p++); | 537 | resp->status = ntohl(*p++); |
268 | resp->state = ntohl(*p++); | 538 | resp->state = ntohl(*p); |
269 | dprintk("nsm: xdr_decode_stat_res status %d state %d\n", | 539 | |
540 | dprintk("lockd: xdr_dec_stat_res status %d state %d\n", | ||
270 | resp->status, resp->state); | 541 | resp->status, resp->state); |
271 | return 0; | 542 | return 0; |
272 | } | 543 | } |
273 | 544 | ||
274 | static int | 545 | static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, |
275 | xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) | 546 | struct nsm_res *resp) |
276 | { | 547 | { |
277 | resp->state = ntohl(*p++); | 548 | struct xdr_stream xdr; |
549 | |||
550 | xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); | ||
551 | p = xdr_inline_decode(&xdr, sizeof(u32)); | ||
552 | if (unlikely(p == NULL)) | ||
553 | return -EIO; | ||
554 | resp->state = ntohl(*p); | ||
555 | |||
556 | dprintk("lockd: xdr_dec_stat state %d\n", resp->state); | ||
278 | return 0; | 557 | return 0; |
279 | } | 558 | } |
280 | 559 | ||
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) | |||
288 | #define SM_unmonres_sz 1 | 567 | #define SM_unmonres_sz 1 |
289 | 568 | ||
290 | static struct rpc_procinfo nsm_procedures[] = { | 569 | static struct rpc_procinfo nsm_procedures[] = { |
291 | [SM_MON] = { | 570 | [NSMPROC_MON] = { |
292 | .p_proc = SM_MON, | 571 | .p_proc = NSMPROC_MON, |
293 | .p_encode = (kxdrproc_t) xdr_encode_mon, | 572 | .p_encode = (kxdrproc_t)xdr_enc_mon, |
294 | .p_decode = (kxdrproc_t) xdr_decode_stat_res, | 573 | .p_decode = (kxdrproc_t)xdr_dec_stat_res, |
295 | .p_arglen = SM_mon_sz, | 574 | .p_arglen = SM_mon_sz, |
296 | .p_replen = SM_monres_sz, | 575 | .p_replen = SM_monres_sz, |
297 | .p_statidx = SM_MON, | 576 | .p_statidx = NSMPROC_MON, |
298 | .p_name = "MONITOR", | 577 | .p_name = "MONITOR", |
299 | }, | 578 | }, |
300 | [SM_UNMON] = { | 579 | [NSMPROC_UNMON] = { |
301 | .p_proc = SM_UNMON, | 580 | .p_proc = NSMPROC_UNMON, |
302 | .p_encode = (kxdrproc_t) xdr_encode_unmon, | 581 | .p_encode = (kxdrproc_t)xdr_enc_unmon, |
303 | .p_decode = (kxdrproc_t) xdr_decode_stat, | 582 | .p_decode = (kxdrproc_t)xdr_dec_stat, |
304 | .p_arglen = SM_mon_id_sz, | 583 | .p_arglen = SM_mon_id_sz, |
305 | .p_replen = SM_unmonres_sz, | 584 | .p_replen = SM_unmonres_sz, |
306 | .p_statidx = SM_UNMON, | 585 | .p_statidx = NSMPROC_UNMON, |
307 | .p_name = "UNMONITOR", | 586 | .p_name = "UNMONITOR", |
308 | }, | 587 | }, |
309 | }; | 588 | }; |
@@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats; | |||
322 | 601 | ||
323 | static struct rpc_program nsm_program = { | 602 | static struct rpc_program nsm_program = { |
324 | .name = "statd", | 603 | .name = "statd", |
325 | .number = SM_PROGRAM, | 604 | .number = NSM_PROGRAM, |
326 | .nrvers = ARRAY_SIZE(nsm_version), | 605 | .nrvers = ARRAY_SIZE(nsm_version), |
327 | .version = nsm_version, | 606 | .version = nsm_version, |
328 | .stats = &nsm_stats | 607 | .stats = &nsm_stats |
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 252d80163d02..64f1c31b5853 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c | |||
@@ -35,7 +35,6 @@ | |||
35 | #include <linux/sunrpc/svcsock.h> | 35 | #include <linux/sunrpc/svcsock.h> |
36 | #include <net/ip.h> | 36 | #include <net/ip.h> |
37 | #include <linux/lockd/lockd.h> | 37 | #include <linux/lockd/lockd.h> |
38 | #include <linux/lockd/sm_inter.h> | ||
39 | #include <linux/nfs.h> | 38 | #include <linux/nfs.h> |
40 | 39 | ||
41 | #define NLMDBG_FACILITY NLMDBG_SVC | 40 | #define NLMDBG_FACILITY NLMDBG_SVC |
@@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst; | |||
54 | unsigned long nlmsvc_timeout; | 53 | unsigned long nlmsvc_timeout; |
55 | 54 | ||
56 | /* | 55 | /* |
56 | * If the kernel has IPv6 support available, always listen for | ||
57 | * both AF_INET and AF_INET6 requests. | ||
58 | */ | ||
59 | #if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ | ||
60 | defined(CONFIG_SUNRPC_REGISTER_V4) | ||
61 | static const sa_family_t nlmsvc_family = AF_INET6; | ||
62 | #else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ | ||
63 | static const sa_family_t nlmsvc_family = AF_INET; | ||
64 | #endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ | ||
65 | |||
66 | /* | ||
57 | * These can be set at insmod time (useful for NFS as root filesystem), | 67 | * These can be set at insmod time (useful for NFS as root filesystem), |
58 | * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 | 68 | * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 |
59 | */ | 69 | */ |
60 | static unsigned long nlm_grace_period; | 70 | static unsigned long nlm_grace_period; |
61 | static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; | 71 | static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; |
62 | static int nlm_udpport, nlm_tcpport; | 72 | static int nlm_udpport, nlm_tcpport; |
63 | int nsm_use_hostnames = 0; | 73 | |
74 | /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ | ||
75 | static unsigned int nlm_max_connections = 1024; | ||
64 | 76 | ||
65 | /* | 77 | /* |
66 | * Constants needed for the sysctl interface. | 78 | * Constants needed for the sysctl interface. |
@@ -143,6 +155,9 @@ lockd(void *vrqstp) | |||
143 | long timeout = MAX_SCHEDULE_TIMEOUT; | 155 | long timeout = MAX_SCHEDULE_TIMEOUT; |
144 | RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); | 156 | RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); |
145 | 157 | ||
158 | /* update sv_maxconn if it has changed */ | ||
159 | rqstp->rq_server->sv_maxconn = nlm_max_connections; | ||
160 | |||
146 | if (signalled()) { | 161 | if (signalled()) { |
147 | flush_signals(current); | 162 | flush_signals(current); |
148 | if (nlmsvc_ops) { | 163 | if (nlmsvc_ops) { |
@@ -189,6 +204,19 @@ lockd(void *vrqstp) | |||
189 | return 0; | 204 | return 0; |
190 | } | 205 | } |
191 | 206 | ||
207 | static int create_lockd_listener(struct svc_serv *serv, char *name, | ||
208 | unsigned short port) | ||
209 | { | ||
210 | struct svc_xprt *xprt; | ||
211 | |||
212 | xprt = svc_find_xprt(serv, name, 0, 0); | ||
213 | if (xprt == NULL) | ||
214 | return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); | ||
215 | |||
216 | svc_xprt_put(xprt); | ||
217 | return 0; | ||
218 | } | ||
219 | |||
192 | /* | 220 | /* |
193 | * Ensure there are active UDP and TCP listeners for lockd. | 221 | * Ensure there are active UDP and TCP listeners for lockd. |
194 | * | 222 | * |
@@ -202,29 +230,23 @@ lockd(void *vrqstp) | |||
202 | static int make_socks(struct svc_serv *serv) | 230 | static int make_socks(struct svc_serv *serv) |
203 | { | 231 | { |
204 | static int warned; | 232 | static int warned; |
205 | struct svc_xprt *xprt; | 233 | int err; |
206 | int err = 0; | ||
207 | 234 | ||
208 | xprt = svc_find_xprt(serv, "udp", 0, 0); | 235 | err = create_lockd_listener(serv, "udp", nlm_udpport); |
209 | if (!xprt) | 236 | if (err < 0) |
210 | err = svc_create_xprt(serv, "udp", nlm_udpport, | 237 | goto out_err; |
211 | SVC_SOCK_DEFAULTS); | 238 | |
212 | else | 239 | err = create_lockd_listener(serv, "tcp", nlm_tcpport); |
213 | svc_xprt_put(xprt); | 240 | if (err < 0) |
214 | if (err >= 0) { | 241 | goto out_err; |
215 | xprt = svc_find_xprt(serv, "tcp", 0, 0); | 242 | |
216 | if (!xprt) | 243 | warned = 0; |
217 | err = svc_create_xprt(serv, "tcp", nlm_tcpport, | 244 | return 0; |
218 | SVC_SOCK_DEFAULTS); | 245 | |
219 | else | 246 | out_err: |
220 | svc_xprt_put(xprt); | 247 | if (warned++ == 0) |
221 | } | ||
222 | if (err >= 0) { | ||
223 | warned = 0; | ||
224 | err = 0; | ||
225 | } else if (warned++ == 0) | ||
226 | printk(KERN_WARNING | 248 | printk(KERN_WARNING |
227 | "lockd_up: makesock failed, error=%d\n", err); | 249 | "lockd_up: makesock failed, error=%d\n", err); |
228 | return err; | 250 | return err; |
229 | } | 251 | } |
230 | 252 | ||
@@ -252,7 +274,7 @@ int lockd_up(void) | |||
252 | "lockd_up: no pid, %d users??\n", nlmsvc_users); | 274 | "lockd_up: no pid, %d users??\n", nlmsvc_users); |
253 | 275 | ||
254 | error = -ENOMEM; | 276 | error = -ENOMEM; |
255 | serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); | 277 | serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); |
256 | if (!serv) { | 278 | if (!serv) { |
257 | printk(KERN_WARNING "lockd_up: create service failed\n"); | 279 | printk(KERN_WARNING "lockd_up: create service failed\n"); |
258 | goto out; | 280 | goto out; |
@@ -276,6 +298,7 @@ int lockd_up(void) | |||
276 | } | 298 | } |
277 | 299 | ||
278 | svc_sock_update_bufs(serv); | 300 | svc_sock_update_bufs(serv); |
301 | serv->sv_maxconn = nlm_max_connections; | ||
279 | 302 | ||
280 | nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); | 303 | nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); |
281 | if (IS_ERR(nlmsvc_task)) { | 304 | if (IS_ERR(nlmsvc_task)) { |
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int, | |||
485 | module_param_call(nlm_tcpport, param_set_port, param_get_int, | 508 | module_param_call(nlm_tcpport, param_set_port, param_get_int, |
486 | &nlm_tcpport, 0644); | 509 | &nlm_tcpport, 0644); |
487 | module_param(nsm_use_hostnames, bool, 0644); | 510 | module_param(nsm_use_hostnames, bool, 0644); |
511 | module_param(nlm_max_connections, uint, 0644); | ||
488 | 512 | ||
489 | /* | 513 | /* |
490 | * Initialising and terminating the module. | 514 | * Initialising and terminating the module. |
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4dfdcbc6bf68..1725037374c5 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c | |||
@@ -16,8 +16,6 @@ | |||
16 | #include <linux/nfsd/nfsd.h> | 16 | #include <linux/nfsd/nfsd.h> |
17 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
18 | #include <linux/lockd/share.h> | 18 | #include <linux/lockd/share.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | |||
21 | 19 | ||
22 | #define NLMDBG_FACILITY NLMDBG_CLIENT | 20 | #define NLMDBG_FACILITY NLMDBG_CLIENT |
23 | 21 | ||
@@ -419,8 +417,6 @@ static __be32 | |||
419 | nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | 417 | nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, |
420 | void *resp) | 418 | void *resp) |
421 | { | 419 | { |
422 | struct sockaddr_in saddr; | ||
423 | |||
424 | dprintk("lockd: SM_NOTIFY called\n"); | 420 | dprintk("lockd: SM_NOTIFY called\n"); |
425 | 421 | ||
426 | if (!nlm_privileged_requester(rqstp)) { | 422 | if (!nlm_privileged_requester(rqstp)) { |
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | |||
430 | return rpc_system_err; | 426 | return rpc_system_err; |
431 | } | 427 | } |
432 | 428 | ||
433 | /* Obtain the host pointer for this NFS server and try to | 429 | nlm_host_rebooted(argp); |
434 | * reclaim all locks we hold on this server. | ||
435 | */ | ||
436 | memset(&saddr, 0, sizeof(saddr)); | ||
437 | saddr.sin_family = AF_INET; | ||
438 | saddr.sin_addr.s_addr = argp->addr; | ||
439 | nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); | ||
440 | |||
441 | return rpc_success; | 430 | return rpc_success; |
442 | } | 431 | } |
443 | 432 | ||
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3ca89e2a9381..3688e55901fc 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c | |||
@@ -16,8 +16,6 @@ | |||
16 | #include <linux/nfsd/nfsd.h> | 16 | #include <linux/nfsd/nfsd.h> |
17 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
18 | #include <linux/lockd/share.h> | 18 | #include <linux/lockd/share.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | |||
21 | 19 | ||
22 | #define NLMDBG_FACILITY NLMDBG_CLIENT | 20 | #define NLMDBG_FACILITY NLMDBG_CLIENT |
23 | 21 | ||
@@ -451,8 +449,6 @@ static __be32 | |||
451 | nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | 449 | nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, |
452 | void *resp) | 450 | void *resp) |
453 | { | 451 | { |
454 | struct sockaddr_in saddr; | ||
455 | |||
456 | dprintk("lockd: SM_NOTIFY called\n"); | 452 | dprintk("lockd: SM_NOTIFY called\n"); |
457 | 453 | ||
458 | if (!nlm_privileged_requester(rqstp)) { | 454 | if (!nlm_privileged_requester(rqstp)) { |
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | |||
462 | return rpc_system_err; | 458 | return rpc_system_err; |
463 | } | 459 | } |
464 | 460 | ||
465 | /* Obtain the host pointer for this NFS server and try to | 461 | nlm_host_rebooted(argp); |
466 | * reclaim all locks we hold on this server. | ||
467 | */ | ||
468 | memset(&saddr, 0, sizeof(saddr)); | ||
469 | saddr.sin_family = AF_INET; | ||
470 | saddr.sin_addr.s_addr = argp->addr; | ||
471 | nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); | ||
472 | |||
473 | return rpc_success; | 462 | return rpc_success; |
474 | } | 463 | } |
475 | 464 | ||
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 34c2766e27c7..9e4d6aab611b 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/nfsd/export.h> | 17 | #include <linux/nfsd/export.h> |
18 | #include <linux/lockd/lockd.h> | 18 | #include <linux/lockd/lockd.h> |
19 | #include <linux/lockd/share.h> | 19 | #include <linux/lockd/share.h> |
20 | #include <linux/lockd/sm_inter.h> | ||
21 | #include <linux/module.h> | 20 | #include <linux/module.h> |
22 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
23 | 22 | ||
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 1f226290c67c..0336f2beacde 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/sunrpc/svc.h> | 16 | #include <linux/sunrpc/svc.h> |
17 | #include <linux/sunrpc/stats.h> | 17 | #include <linux/sunrpc/stats.h> |
18 | #include <linux/lockd/lockd.h> | 18 | #include <linux/lockd/lockd.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | 19 | ||
21 | #define NLMDBG_FACILITY NLMDBG_XDR | 20 | #define NLMDBG_FACILITY NLMDBG_XDR |
22 | 21 | ||
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) | |||
349 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) | 348 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) |
350 | return 0; | 349 | return 0; |
351 | argp->state = ntohl(*p++); | 350 | argp->state = ntohl(*p++); |
352 | /* Preserve the address in network byte order */ | 351 | memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); |
353 | argp->addr = *p++; | 352 | p += XDR_QUADLEN(SM_PRIV_SIZE); |
354 | return xdr_argsize_check(rqstp, p); | 353 | return xdr_argsize_check(rqstp, p); |
355 | } | 354 | } |
356 | 355 | ||
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 50c493a8ad8e..e1d528653192 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/sunrpc/svc.h> | 17 | #include <linux/sunrpc/svc.h> |
18 | #include <linux/sunrpc/stats.h> | 18 | #include <linux/sunrpc/stats.h> |
19 | #include <linux/lockd/lockd.h> | 19 | #include <linux/lockd/lockd.h> |
20 | #include <linux/lockd/sm_inter.h> | ||
21 | 20 | ||
22 | #define NLMDBG_FACILITY NLMDBG_XDR | 21 | #define NLMDBG_FACILITY NLMDBG_XDR |
23 | 22 | ||
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp | |||
356 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) | 355 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) |
357 | return 0; | 356 | return 0; |
358 | argp->state = ntohl(*p++); | 357 | argp->state = ntohl(*p++); |
359 | /* Preserve the address in network byte order */ | 358 | memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); |
360 | argp->addr = *p++; | 359 | p += XDR_QUADLEN(SM_PRIV_SIZE); |
361 | return xdr_argsize_check(rqstp, p); | 360 | return xdr_argsize_check(rqstp, p); |
362 | } | 361 | } |
363 | 362 | ||
diff --git a/fs/minix/dir.c b/fs/minix/dir.c index f70433816a38..d4946c4c90e2 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c | |||
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) | |||
280 | return -EINVAL; | 280 | return -EINVAL; |
281 | 281 | ||
282 | got_it: | 282 | got_it: |
283 | pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); | 283 | pos = page_offset(page) + p - (char *)page_address(page); |
284 | err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, | 284 | err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, |
285 | AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); | 285 | AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); |
286 | if (err) | 286 | if (err) |
diff --git a/fs/mpage.c b/fs/mpage.c index 552b80b3facc..16c3ef37eae3 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, | |||
241 | first_hole = page_block; | 241 | first_hole = page_block; |
242 | page_block++; | 242 | page_block++; |
243 | block_in_file++; | 243 | block_in_file++; |
244 | clear_buffer_mapped(map_bh); | ||
245 | continue; | 244 | continue; |
246 | } | 245 | } |
247 | 246 | ||
@@ -308,7 +307,10 @@ alloc_new: | |||
308 | goto alloc_new; | 307 | goto alloc_new; |
309 | } | 308 | } |
310 | 309 | ||
311 | if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) | 310 | relative_block = block_in_file - *first_logical_block; |
311 | nblocks = map_bh->b_size >> blkbits; | ||
312 | if ((buffer_boundary(map_bh) && relative_block == nblocks) || | ||
313 | (first_hole != blocks_per_page)) | ||
312 | bio = mpage_bio_submit(READ, bio); | 314 | bio = mpage_bio_submit(READ, bio); |
313 | else | 315 | else |
314 | *last_block_in_bio = blocks[blocks_per_page - 1]; | 316 | *last_block_in_bio = blocks[blocks_per_page - 1]; |
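
The do_mpage_readpage() change fixes when the boundary hint forces a bio submit: get_block() may map a multi-block extent (map_bh->b_size covers several blocks), and buffer_boundary() only signals a discontiguity once that whole extent has been consumed; submitting earlier would needlessly split contiguous I/O. The new test, restated as a hedged standalone predicate (should_submit_bio is an illustrative name):

    #include <stdint.h>

    static int should_submit_bio(uint64_t block_in_file,
                                 uint64_t first_logical_block,
                                 uint32_t b_size, unsigned blkbits,
                                 int buffer_boundary,
                                 unsigned first_hole,
                                 unsigned blocks_per_page)
    {
        uint64_t relative_block = block_in_file - first_logical_block;
        uint64_t nblocks = b_size >> blkbits;   /* blocks in the extent */

        /* the boundary flag counts only at the end of the mapped
         * extent; a trailing hole in the page still forces a submit */
        return (buffer_boundary && relative_block == nblocks) ||
               (first_hole != blocks_per_page);
    }
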
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 335b003dddf9..0af3349de851 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c | |||
@@ -16,7 +16,6 @@ | |||
16 | * @opts: an array of &struct option entries controlling parser operations | 16 | * @opts: an array of &struct option entries controlling parser operations |
17 | * @optopt: output; will contain the current option | 17 | * @optopt: output; will contain the current option |
18 | * @optarg: output; will contain the value (if one exists) | 18 | * @optarg: output; will contain the value (if one exists) |
19 | * @flag: output; may be NULL; should point to a long for or'ing flags | ||
20 | * @value: output; may be NULL; will be overwritten with the integer value | 19 | * @value: output; may be NULL; will be overwritten with the integer value |
21 | * of the current argument. | 20 | * of the current argument. |
22 | * | 21 | * |
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 6d04e050c74e..f54360f50a9c 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c | |||
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl | |||
98 | { | 98 | { |
99 | s32 auth_type; | 99 | s32 auth_type; |
100 | u32 object_name_len; | 100 | u32 object_name_len; |
101 | compat_caddr_t object_name; /* an userspace data, in most cases user name */ | 101 | compat_caddr_t object_name; /* userspace data, in most cases the user name */ |
102 | }; | 102 | }; |
103 | 103 | ||
104 | struct compat_ncp_fs_info_v2 { | 104 | struct compat_ncp_fs_info_v2 { |
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 0184fe9b514c..c903e04aa217 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c | |||
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) | |||
76 | 76 | ||
77 | ret = set_groups(new, gi); | 77 | ret = set_groups(new, gi); |
78 | put_group_info(gi); | 78 | put_group_info(gi); |
79 | if (!ret) | 79 | if (ret < 0) |
80 | goto error; | 80 | goto error; |
81 | 81 | ||
82 | if (new->uid) | 82 | if (new->fsuid) |
83 | new->cap_effective = cap_drop_nfsd_set(new->cap_effective); | 83 | new->cap_effective = cap_drop_nfsd_set(new->cap_effective); |
84 | else | 84 | else |
85 | new->cap_effective = cap_raise_nfsd_set(new->cap_effective, | 85 | new->cap_effective = cap_raise_nfsd_set(new->cap_effective, |
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6d7d8c02c197..c464181b5994 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c | |||
@@ -53,9 +53,6 @@ | |||
53 | #define NFSPROC4_CB_NULL 0 | 53 | #define NFSPROC4_CB_NULL 0 |
54 | #define NFSPROC4_CB_COMPOUND 1 | 54 | #define NFSPROC4_CB_COMPOUND 1 |
55 | 55 | ||
56 | /* declarations */ | ||
57 | static const struct rpc_call_ops nfs4_cb_null_ops; | ||
58 | |||
59 | /* Index of predefined Linux callback client operations */ | 56 | /* Index of predefined Linux callback client operations */ |
60 | 57 | ||
61 | enum { | 58 | enum { |
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 669461e291ae..9fa60a3ad48c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c | |||
@@ -946,6 +946,11 @@ encode_op: | |||
946 | nfsd4_encode_operation(resp, op); | 946 | nfsd4_encode_operation(resp, op); |
947 | status = op->status; | 947 | status = op->status; |
948 | } | 948 | } |
949 | |||
950 | dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", | ||
951 | args->ops, args->opcnt, resp->opcnt, op->opnum, | ||
952 | be32_to_cpu(status)); | ||
953 | |||
949 | if (cstate->replay_owner) { | 954 | if (cstate->replay_owner) { |
950 | nfs4_put_stateowner(cstate->replay_owner); | 955 | nfs4_put_stateowner(cstate->replay_owner); |
951 | cstate->replay_owner = NULL; | 956 | cstate->replay_owner = NULL; |
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 0f9d6efaa62b..74f7b67567fd 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c | |||
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) | |||
116 | 116 | ||
117 | md5_to_hex(dname, cksum.data); | 117 | md5_to_hex(dname, cksum.data); |
118 | 118 | ||
119 | kfree(cksum.data); | ||
120 | status = nfs_ok; | 119 | status = nfs_ok; |
121 | out: | 120 | out: |
121 | kfree(cksum.data); | ||
122 | crypto_free_hash(desc.tfm); | 122 | crypto_free_hash(desc.tfm); |
123 | out_no_tfm: | 123 | out_no_tfm: |
124 | return status; | 124 | return status; |
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 13e0e074dbb8..88db7d3ec120 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c | |||
@@ -2416,6 +2416,26 @@ out: | |||
2416 | #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) | 2416 | #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) |
2417 | #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) | 2417 | #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) |
2418 | 2418 | ||
2419 | static inline u64 | ||
2420 | end_offset(u64 start, u64 len) | ||
2421 | { | ||
2422 | u64 end; | ||
2423 | |||
2424 | end = start + len; | ||
2425 | return end >= start ? end: NFS4_MAX_UINT64; | ||
2426 | } | ||
2427 | |||
2428 | /* last octet in a range */ | ||
2429 | static inline u64 | ||
2430 | last_byte_offset(u64 start, u64 len) | ||
2431 | { | ||
2432 | u64 end; | ||
2433 | |||
2434 | BUG_ON(!len); | ||
2435 | end = start + len; | ||
2436 | return end > start ? end - 1: NFS4_MAX_UINT64; | ||
2437 | } | ||
2438 | |||
2419 | #define lockownerid_hashval(id) \ | 2439 | #define lockownerid_hashval(id) \ |
2420 | ((id) & LOCK_HASH_MASK) | 2440 | ((id) & LOCK_HASH_MASK) |
2421 | 2441 | ||
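
Note on the two helpers added above: they centralize the NFSv4 range arithmetic that several hunks below switch to. An NFSv4 lock is an (offset, length) pair where a length of NFS4_MAX_UINT64 means "to end of file", so start + len can wrap a u64; the helpers saturate instead of wrapping. Their behavior, in a standalone sketch:

        #include <stdint.h>
        #include <stdio.h>

        #define NFS4_MAX_UINT64 (~(uint64_t)0)

        /* Saturating last-byte computation, as in last_byte_offset():
         * if start + len wraps past 2^64, treat the range as unbounded. */
        static uint64_t last_byte(uint64_t start, uint64_t len)
        {
                uint64_t end = start + len;             /* may wrap */

                return end > start ? end - 1 : NFS4_MAX_UINT64;
        }

        int main(void)
        {
                /* ordinary range: bytes 100..109 */
                printf("%llu\n", (unsigned long long)last_byte(100, 10));
                /* "lock to EOF": the sum wraps, so it saturates */
                printf("%llu\n", (unsigned long long)
                       last_byte(100, NFS4_MAX_UINT64));
                return 0;
        }
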
@@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; | |||
2435 | static struct nfs4_stateid * | 2455 | static struct nfs4_stateid * |
2436 | find_stateid(stateid_t *stid, int flags) | 2456 | find_stateid(stateid_t *stid, int flags) |
2437 | { | 2457 | { |
2438 | struct nfs4_stateid *local = NULL; | 2458 | struct nfs4_stateid *local; |
2439 | u32 st_id = stid->si_stateownerid; | 2459 | u32 st_id = stid->si_stateownerid; |
2440 | u32 f_id = stid->si_fileid; | 2460 | u32 f_id = stid->si_fileid; |
2441 | unsigned int hashval; | 2461 | unsigned int hashval; |
2442 | 2462 | ||
2443 | dprintk("NFSD: find_stateid flags 0x%x\n",flags); | 2463 | dprintk("NFSD: find_stateid flags 0x%x\n",flags); |
2444 | if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { | 2464 | if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { |
2445 | hashval = stateid_hashval(st_id, f_id); | 2465 | hashval = stateid_hashval(st_id, f_id); |
2446 | list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { | 2466 | list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { |
2447 | if ((local->st_stateid.si_stateownerid == st_id) && | 2467 | if ((local->st_stateid.si_stateownerid == st_id) && |
@@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags) | |||
2449 | return local; | 2469 | return local; |
2450 | } | 2470 | } |
2451 | } | 2471 | } |
2452 | if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { | 2472 | |
2473 | if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { | ||
2453 | hashval = stateid_hashval(st_id, f_id); | 2474 | hashval = stateid_hashval(st_id, f_id); |
2454 | list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { | 2475 | list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { |
2455 | if ((local->st_stateid.si_stateownerid == st_id) && | 2476 | if ((local->st_stateid.si_stateownerid == st_id) && |
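
Note on the find_stateid() rewrite: it is behavior-preserving. For any masks, (flags & A) || (flags & B) || (flags & C) is true exactly when flags intersects A | B | C, so the three tests fold into one; dropping the NULL initializer is safe because list_for_each_entry() assigns the cursor before any use. A tiny exhaustive check of the identity:

        #include <assert.h>

        int main(void)
        {
                unsigned A = 1, B = 2, C = 4;   /* single-bit flag masks */
                unsigned flags;

                for (flags = 0; flags < 16; flags++)
                        assert((((flags & A) || (flags & B) || (flags & C)) != 0)
                               == ((flags & (A | B | C)) != 0));
                return 0;
        }
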
@@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) | |||
2518 | deny->ld_clientid.cl_id = 0; | 2539 | deny->ld_clientid.cl_id = 0; |
2519 | } | 2540 | } |
2520 | deny->ld_start = fl->fl_start; | 2541 | deny->ld_start = fl->fl_start; |
2521 | deny->ld_length = ~(u64)0; | 2542 | deny->ld_length = NFS4_MAX_UINT64; |
2522 | if (fl->fl_end != ~(u64)0) | 2543 | if (fl->fl_end != NFS4_MAX_UINT64) |
2523 | deny->ld_length = fl->fl_end - fl->fl_start + 1; | 2544 | deny->ld_length = fl->fl_end - fl->fl_start + 1; |
2524 | deny->ld_type = NFS4_READ_LT; | 2545 | deny->ld_type = NFS4_READ_LT; |
2525 | if (fl->fl_type != F_RDLCK) | 2546 | if (fl->fl_type != F_RDLCK) |
@@ -2616,7 +2637,7 @@ out: | |||
2616 | static int | 2637 | static int |
2617 | check_lock_length(u64 offset, u64 length) | 2638 | check_lock_length(u64 offset, u64 length) |
2618 | { | 2639 | { |
2619 | return ((length == 0) || ((length != ~(u64)0) && | 2640 | return ((length == 0) || ((length != NFS4_MAX_UINT64) && |
2620 | LOFF_OVERFLOW(offset, length))); | 2641 | LOFF_OVERFLOW(offset, length))); |
2621 | } | 2642 | } |
2622 | 2643 | ||
@@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2736 | file_lock.fl_lmops = &nfsd_posix_mng_ops; | 2757 | file_lock.fl_lmops = &nfsd_posix_mng_ops; |
2737 | 2758 | ||
2738 | file_lock.fl_start = lock->lk_offset; | 2759 | file_lock.fl_start = lock->lk_offset; |
2739 | if ((lock->lk_length == ~(u64)0) || | 2760 | file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); |
2740 | LOFF_OVERFLOW(lock->lk_offset, lock->lk_length)) | ||
2741 | file_lock.fl_end = ~(u64)0; | ||
2742 | else | ||
2743 | file_lock.fl_end = lock->lk_offset + lock->lk_length - 1; | ||
2744 | nfs4_transform_lock_offset(&file_lock); | 2761 | nfs4_transform_lock_offset(&file_lock); |
2745 | 2762 | ||
2746 | /* | 2763 | /* |
@@ -2781,6 +2798,25 @@ out: | |||
2781 | } | 2798 | } |
2782 | 2799 | ||
2783 | /* | 2800 | /* |
2801 | * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, | ||
2802 | * so we do a temporary open here just to get an open file to pass to | ||
2803 | * vfs_test_lock. (Arguably perhaps test_lock should be done with an | ||
2804 | * inode operation.) | ||
2805 | */ | ||
2806 | static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) | ||
2807 | { | ||
2808 | struct file *file; | ||
2809 | int err; | ||
2810 | |||
2811 | err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); | ||
2812 | if (err) | ||
2813 | return err; | ||
2814 | err = vfs_test_lock(file, lock); | ||
2815 | nfsd_close(file); | ||
2816 | return err; | ||
2817 | } | ||
2818 | |||
2819 | /* | ||
2784 | * LOCKT operation | 2820 | * LOCKT operation |
2785 | */ | 2821 | */ |
2786 | __be32 | 2822 | __be32 |
@@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2788 | struct nfsd4_lockt *lockt) | 2824 | struct nfsd4_lockt *lockt) |
2789 | { | 2825 | { |
2790 | struct inode *inode; | 2826 | struct inode *inode; |
2791 | struct file file; | ||
2792 | struct file_lock file_lock; | 2827 | struct file_lock file_lock; |
2793 | int error; | 2828 | int error; |
2794 | __be32 status; | 2829 | __be32 status; |
@@ -2839,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2839 | file_lock.fl_lmops = &nfsd_posix_mng_ops; | 2874 | file_lock.fl_lmops = &nfsd_posix_mng_ops; |
2840 | 2875 | ||
2841 | file_lock.fl_start = lockt->lt_offset; | 2876 | file_lock.fl_start = lockt->lt_offset; |
2842 | if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) | 2877 | file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); |
2843 | file_lock.fl_end = ~(u64)0; | ||
2844 | else | ||
2845 | file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1; | ||
2846 | 2878 | ||
2847 | nfs4_transform_lock_offset(&file_lock); | 2879 | nfs4_transform_lock_offset(&file_lock); |
2848 | 2880 | ||
2849 | /* vfs_test_lock uses the struct file _only_ to resolve the inode. | ||
2850 | * since LOCKT doesn't require an OPEN, and therefore a struct | ||
2851 | * file may not exist, pass vfs_test_lock a struct file with | ||
2852 | * only the dentry:inode set. | ||
2853 | */ | ||
2854 | memset(&file, 0, sizeof (struct file)); | ||
2855 | file.f_path.dentry = cstate->current_fh.fh_dentry; | ||
2856 | |||
2857 | status = nfs_ok; | 2881 | status = nfs_ok; |
2858 | error = vfs_test_lock(&file, &file_lock); | 2882 | error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); |
2859 | if (error) { | 2883 | if (error) { |
2860 | status = nfserrno(error); | 2884 | status = nfserrno(error); |
2861 | goto out; | 2885 | goto out; |
@@ -2906,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2906 | file_lock.fl_lmops = &nfsd_posix_mng_ops; | 2930 | file_lock.fl_lmops = &nfsd_posix_mng_ops; |
2907 | file_lock.fl_start = locku->lu_offset; | 2931 | file_lock.fl_start = locku->lu_offset; |
2908 | 2932 | ||
2909 | if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) | 2933 | file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); |
2910 | file_lock.fl_end = ~(u64)0; | ||
2911 | else | ||
2912 | file_lock.fl_end = locku->lu_offset + locku->lu_length - 1; | ||
2913 | nfs4_transform_lock_offset(&file_lock); | 2934 | nfs4_transform_lock_offset(&file_lock); |
2914 | 2935 | ||
2915 | /* | 2936 | /* |
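
Note on nfsd_test_lock(): it replaces the earlier trick of passing vfs_test_lock() a zeroed struct file with only the dentry set. Since LOCKT carries no OPEN state, the server now opens a real read-only file for the duration of the conflict test and closes it again. A user-space analogue of that probe, using F_GETLK (which checks for a conflicting lock without acquiring one):

        #include <fcntl.h>
        #include <unistd.h>

        /* Open a transient descriptor, ask whether a conflicting lock
         * exists, close it again: the shape of nfsd_test_lock(). */
        static int test_lock(const char *path, struct flock *fl)
        {
                int fd = open(path, O_RDONLY);
                int err;

                if (fd < 0)
                        return -1;
                err = fcntl(fd, F_GETLK, fl);   /* probes; takes no lock */
                close(fd);
                if (err < 0)
                        return -1;
                return fl->l_type == F_UNLCK ? 0 : 1;   /* 1 = conflict */
        }

        int main(void)
        {
                struct flock fl = {
                        .l_type   = F_WRLCK,    /* would a write lock fit? */
                        .l_whence = SEEK_SET,
                        .l_start  = 0,
                        .l_len    = 0,          /* 0 = to end of file */
                };

                return test_lock("/etc/hostname", &fl);
        }
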
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index afcdf4b76843..f65953be39c0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c | |||
@@ -1,6 +1,4 @@ | |||
1 | /* | 1 | /* |
2 | * fs/nfs/nfs4xdr.c | ||
3 | * | ||
4 | * Server-side XDR for NFSv4 | 2 | * Server-side XDR for NFSv4 |
5 | * | 3 | * |
6 | * Copyright (c) 2002 The Regents of the University of Michigan. | 4 | * Copyright (c) 2002 The Regents of the University of Michigan. |
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 77d7b8c531a6..3d93b2064ce5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c | |||
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size); | |||
84 | static ssize_t write_getfd(struct file *file, char *buf, size_t size); | 84 | static ssize_t write_getfd(struct file *file, char *buf, size_t size); |
85 | static ssize_t write_getfs(struct file *file, char *buf, size_t size); | 85 | static ssize_t write_getfs(struct file *file, char *buf, size_t size); |
86 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size); | 86 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size); |
87 | static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); | ||
88 | static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); | ||
87 | static ssize_t write_threads(struct file *file, char *buf, size_t size); | 89 | static ssize_t write_threads(struct file *file, char *buf, size_t size); |
88 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); | 90 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); |
89 | static ssize_t write_versions(struct file *file, char *buf, size_t size); | 91 | static ssize_t write_versions(struct file *file, char *buf, size_t size); |
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size); | |||
94 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); | 96 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); |
95 | #endif | 97 | #endif |
96 | 98 | ||
97 | static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size); | ||
98 | static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size); | ||
99 | |||
100 | static ssize_t (*write_op[])(struct file *, char *, size_t) = { | 99 | static ssize_t (*write_op[])(struct file *, char *, size_t) = { |
101 | [NFSD_Svc] = write_svc, | 100 | [NFSD_Svc] = write_svc, |
102 | [NFSD_Add] = write_add, | 101 | [NFSD_Add] = write_add, |
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { | |||
106 | [NFSD_Getfd] = write_getfd, | 105 | [NFSD_Getfd] = write_getfd, |
107 | [NFSD_Getfs] = write_getfs, | 106 | [NFSD_Getfs] = write_getfs, |
108 | [NFSD_Fh] = write_filehandle, | 107 | [NFSD_Fh] = write_filehandle, |
109 | [NFSD_FO_UnlockIP] = failover_unlock_ip, | 108 | [NFSD_FO_UnlockIP] = write_unlock_ip, |
110 | [NFSD_FO_UnlockFS] = failover_unlock_fs, | 109 | [NFSD_FO_UnlockFS] = write_unlock_fs, |
111 | [NFSD_Threads] = write_threads, | 110 | [NFSD_Threads] = write_threads, |
112 | [NFSD_Pool_Threads] = write_pool_threads, | 111 | [NFSD_Pool_Threads] = write_pool_threads, |
113 | [NFSD_Versions] = write_versions, | 112 | [NFSD_Versions] = write_versions, |
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = { | |||
176 | /*----------------------------------------------------------------------------*/ | 175 | /*----------------------------------------------------------------------------*/ |
177 | /* | 176 | /* |
178 | * payload - write methods | 177 | * payload - write methods |
179 | * If the method has a response, the response should be put in buf, | ||
180 | * and the length returned. Otherwise return 0 or and -error. | ||
181 | */ | 178 | */ |
182 | 179 | ||
180 | /** | ||
181 | * write_svc - Start kernel's NFSD server | ||
182 | * | ||
183 | * Deprecated. /proc/fs/nfsd/threads is preferred. | ||
184 | * Function remains to support old versions of nfs-utils. | ||
185 | * | ||
186 | * Input: | ||
187 | * buf: struct nfsctl_svc | ||
188 | * svc_port: port number of this | ||
189 | * server's listener | ||
190 | * svc_nthreads: number of threads to start | ||
191 | * size: size in bytes of passed in nfsctl_svc | ||
192 | * Output: | ||
193 | * On success: returns zero | ||
194 | * On error: return code is negative errno value | ||
195 | */ | ||
183 | static ssize_t write_svc(struct file *file, char *buf, size_t size) | 196 | static ssize_t write_svc(struct file *file, char *buf, size_t size) |
184 | { | 197 | { |
185 | struct nfsctl_svc *data; | 198 | struct nfsctl_svc *data; |
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size) | |||
189 | return nfsd_svc(data->svc_port, data->svc_nthreads); | 202 | return nfsd_svc(data->svc_port, data->svc_nthreads); |
190 | } | 203 | } |
191 | 204 | ||
205 | /** | ||
206 | * write_add - Add or modify client entry in auth unix cache | ||
207 | * | ||
208 | * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. | ||
209 | * Function remains to support old versions of nfs-utils. | ||
210 | * | ||
211 | * Input: | ||
212 | * buf: struct nfsctl_client | ||
213 | * cl_ident: '\0'-terminated C string | ||
214 | * containing domain name | ||
215 | * of client | ||
216 | * cl_naddr: no. of items in cl_addrlist | ||
217 | * cl_addrlist: array of client addresses | ||
218 | * cl_fhkeytype: ignored | ||
219 | * cl_fhkeylen: ignored | ||
220 | * cl_fhkey: ignored | ||
221 | * size: size in bytes of passed in nfsctl_client | ||
222 | * Output: | ||
223 | * On success: returns zero | ||
224 | * On error: return code is negative errno value | ||
225 | * | ||
226 | * Note: Only AF_INET client addresses are passed in, since | ||
227 | * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. | ||
228 | */ | ||
192 | static ssize_t write_add(struct file *file, char *buf, size_t size) | 229 | static ssize_t write_add(struct file *file, char *buf, size_t size) |
193 | { | 230 | { |
194 | struct nfsctl_client *data; | 231 | struct nfsctl_client *data; |
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size) | |||
198 | return exp_addclient(data); | 235 | return exp_addclient(data); |
199 | } | 236 | } |
200 | 237 | ||
238 | /** | ||
239 | * write_del - Remove client from auth unix cache | ||
240 | * | ||
241 | * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. | ||
242 | * Function remains to support old versions of nfs-utils. | ||
243 | * | ||
244 | * Input: | ||
245 | * buf: struct nfsctl_client | ||
246 | * cl_ident: '\0'-terminated C string | ||
247 | * containing domain name | ||
248 | * of client | ||
249 | * cl_naddr: ignored | ||
250 | * cl_addrlist: ignored | ||
251 | * cl_fhkeytype: ignored | ||
252 | * cl_fhkeylen: ignored | ||
253 | * cl_fhkey: ignored | ||
254 | * size: size in bytes of passed in nfsctl_client | ||
255 | * Output: | ||
256 | * On success: returns zero | ||
257 | * On error: return code is negative errno value | ||
258 | * | ||
259 | * Note: Only AF_INET client addresses are passed in, since | ||
260 | * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. | ||
261 | */ | ||
201 | static ssize_t write_del(struct file *file, char *buf, size_t size) | 262 | static ssize_t write_del(struct file *file, char *buf, size_t size) |
202 | { | 263 | { |
203 | struct nfsctl_client *data; | 264 | struct nfsctl_client *data; |
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size) | |||
207 | return exp_delclient(data); | 268 | return exp_delclient(data); |
208 | } | 269 | } |
209 | 270 | ||
271 | /** | ||
272 | * write_export - Export part or all of a local file system | ||
273 | * | ||
274 | * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. | ||
275 | * Function remains to support old versions of nfs-utils. | ||
276 | * | ||
277 | * Input: | ||
278 | * buf: struct nfsctl_export | ||
279 | * ex_client: '\0'-terminated C string | ||
280 | * containing domain name | ||
281 | * of client allowed to access | ||
282 | * this export | ||
283 | * ex_path: '\0'-terminated C string | ||
284 | * containing pathname of | ||
285 | * directory in local file system | ||
286 | * ex_dev: fsid to use for this export | ||
287 | * ex_ino: ignored | ||
288 | * ex_flags: export flags for this export | ||
289 | * ex_anon_uid: UID to use for anonymous | ||
290 | * requests | ||
291 | * ex_anon_gid: GID to use for anonymous | ||
292 | * requests | ||
293 | * size: size in bytes of passed in nfsctl_export | ||
294 | * Output: | ||
295 | * On success: returns zero | ||
296 | * On error: return code is negative errno value | ||
297 | */ | ||
210 | static ssize_t write_export(struct file *file, char *buf, size_t size) | 298 | static ssize_t write_export(struct file *file, char *buf, size_t size) |
211 | { | 299 | { |
212 | struct nfsctl_export *data; | 300 | struct nfsctl_export *data; |
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size) | |||
216 | return exp_export(data); | 304 | return exp_export(data); |
217 | } | 305 | } |
218 | 306 | ||
307 | /** | ||
308 | * write_unexport - Unexport a previously exported file system | ||
309 | * | ||
310 | * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. | ||
311 | * Function remains to support old versions of nfs-utils. | ||
312 | * | ||
313 | * Input: | ||
314 | * buf: struct nfsctl_export | ||
315 | * ex_client: '\0'-terminated C string | ||
316 | * containing domain name | ||
317 | * of client no longer allowed | ||
318 | * to access this export | ||
319 | * ex_path: '\0'-terminated C string | ||
320 | * containing pathname of | ||
321 | * directory in local file system | ||
322 | * ex_dev: ignored | ||
323 | * ex_ino: ignored | ||
324 | * ex_flags: ignored | ||
325 | * ex_anon_uid: ignored | ||
326 | * ex_anon_gid: ignored | ||
327 | * size: size in bytes of passed in nfsctl_export | ||
328 | * Output: | ||
329 | * On success: returns zero | ||
330 | * On error: return code is negative errno value | ||
331 | */ | ||
219 | static ssize_t write_unexport(struct file *file, char *buf, size_t size) | 332 | static ssize_t write_unexport(struct file *file, char *buf, size_t size) |
220 | { | 333 | { |
221 | struct nfsctl_export *data; | 334 | struct nfsctl_export *data; |
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size) | |||
226 | return exp_unexport(data); | 339 | return exp_unexport(data); |
227 | } | 340 | } |
228 | 341 | ||
342 | /** | ||
343 | * write_getfs - Get a variable-length NFS file handle by path | ||
344 | * | ||
345 | * Deprecated. /proc/fs/nfsd/filehandle is preferred. | ||
346 | * Function remains to support old versions of nfs-utils. | ||
347 | * | ||
348 | * Input: | ||
349 | * buf: struct nfsctl_fsparm | ||
350 | * gd_addr: socket address of client | ||
351 | * gd_path: '\0'-terminated C string | ||
352 | * containing pathname of | ||
353 | * directory in local file system | ||
354 | * gd_maxlen: maximum size of returned file | ||
355 | * handle | ||
356 | * size: size in bytes of passed in nfsctl_fsparm | ||
357 | * Output: | ||
358 | * On success: passed-in buffer filled with a knfsd_fh structure | ||
359 | * (a variable-length raw NFS file handle); | ||
360 | * return code is the size in bytes of the file handle | ||
361 | * On error: return code is negative errno value | ||
362 | * | ||
363 | * Note: Only AF_INET client addresses are passed in, since gd_addr | ||
364 | * is the same size as a struct sockaddr_in. | ||
365 | */ | ||
229 | static ssize_t write_getfs(struct file *file, char *buf, size_t size) | 366 | static ssize_t write_getfs(struct file *file, char *buf, size_t size) |
230 | { | 367 | { |
231 | struct nfsctl_fsparm *data; | 368 | struct nfsctl_fsparm *data; |
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) | |||
265 | return err; | 402 | return err; |
266 | } | 403 | } |
267 | 404 | ||
405 | /** | ||
406 | * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) | ||
407 | * | ||
408 | * Deprecated. /proc/fs/nfsd/filehandle is preferred. | ||
409 | * Function remains to support old versions of nfs-utils. | ||
410 | * | ||
411 | * Input: | ||
412 | * buf: struct nfsctl_fdparm | ||
413 | * gd_addr: socket address of client | ||
414 | * gd_path: '\0'-terminated C string | ||
415 | * containing pathname of | ||
416 | * directory in local file system | ||
417 | * gd_version: fdparm structure version | ||
418 | * size: size in bytes of passed in nfsctl_fdparm | ||
419 | * Output: | ||
420 | * On success: passed-in buffer filled with nfsctl_res | ||
421 | * (a fixed-length raw NFS file handle); | ||
422 | * return code is the size in bytes of the file handle | ||
423 | * On error: return code is negative errno value | ||
424 | * | ||
425 | * Note: Only AF_INET client addresses are passed in, since gd_addr | ||
426 | * is the same size as a struct sockaddr_in. | ||
427 | */ | ||
268 | static ssize_t write_getfd(struct file *file, char *buf, size_t size) | 428 | static ssize_t write_getfd(struct file *file, char *buf, size_t size) |
269 | { | 429 | { |
270 | struct nfsctl_fdparm *data; | 430 | struct nfsctl_fdparm *data; |
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) | |||
309 | return err; | 469 | return err; |
310 | } | 470 | } |
311 | 471 | ||
312 | static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) | 472 | /** |
473 | * write_unlock_ip - Release all locks used by a client | ||
474 | * | ||
475 | * Experimental. | ||
476 | * | ||
477 | * Input: | ||
478 | * buf: '\n'-terminated C string containing a | ||
479 | * presentation format IPv4 address | ||
480 | * size: length of C string in @buf | ||
481 | * Output: | ||
482 | * On success: returns zero if all specified locks were released; | ||
483 | * returns one if one or more locks were not released | ||
484 | * On error: return code is negative errno value | ||
485 | * | ||
486 | * Note: Only AF_INET client addresses are passed in | ||
487 | */ | ||
488 | static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) | ||
313 | { | 489 | { |
314 | struct sockaddr_in sin = { | 490 | struct sockaddr_in sin = { |
315 | .sin_family = AF_INET, | 491 | .sin_family = AF_INET, |
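
Note on the renamed failover handlers: usage is a plain write of one text line to the corresponding nfsd control file (assumed here to be /proc/fs/nfsd/unlock_ip and /proc/fs/nfsd/unlock_filesystem, matching the NFSD_FO_UnlockIP and NFSD_FO_UnlockFS table entries). A minimal sketch of the unlock_ip side:

        #include <stdio.h>

        int main(void)
        {
                /* Path assumed from the NFSD_FO_UnlockIP table entry. */
                FILE *f = fopen("/proc/fs/nfsd/unlock_ip", "w");

                if (!f) {
                        perror("unlock_ip");
                        return 1;
                }
                /* '\n'-terminated presentation-format IPv4 address,
                 * exactly as the new kernel-doc above specifies. */
                fputs("192.168.1.1\n", f);
                if (fclose(f) != 0) {   /* write errors surface here */
                        perror("unlock_ip");
                        return 1;
                }
                return 0;
        }
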
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) | |||
339 | return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); | 515 | return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); |
340 | } | 516 | } |
341 | 517 | ||
342 | static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) | 518 | /** |
519 | * write_unlock_fs - Release all locks on a local file system | ||
520 | * | ||
521 | * Experimental. | ||
522 | * | ||
523 | * Input: | ||
524 | * buf: '\n'-terminated C string containing the | ||
525 | * absolute pathname of a local file system | ||
526 | * size: length of C string in @buf | ||
527 | * Output: | ||
528 | * On success: returns zero if all specified locks were released; | ||
529 | * returns one if one or more locks were not released | ||
530 | * On error: return code is negative errno value | ||
531 | */ | ||
532 | static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) | ||
343 | { | 533 | { |
344 | struct path path; | 534 | struct path path; |
345 | char *fo_path; | 535 | char *fo_path; |
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) | |||
360 | if (error) | 550 | if (error) |
361 | return error; | 551 | return error; |
362 | 552 | ||
553 | /* | ||
554 | * XXX: Needs better sanity checking. Otherwise we could end up | ||
555 | * releasing locks on the wrong file system. | ||
556 | * | ||
557 | * For example: | ||
558 | * 1. Does the path refer to a directory? | ||
559 | * 2. Is that directory a mount point, or | ||
560 | * 3. Is that directory the root of an exported file system? | ||
561 | */ | ||
363 | error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); | 562 | error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); |
364 | 563 | ||
365 | path_put(&path); | 564 | path_put(&path); |
366 | return error; | 565 | return error; |
367 | } | 566 | } |
368 | 567 | ||
568 | /** | ||
569 | * write_filehandle - Get a variable-length NFS file handle by path | ||
570 | * | ||
571 | * On input, the buffer contains a '\n'-terminated C string comprised of | ||
572 | * three alphanumeric words separated by whitespace. The string may | ||
573 | * contain escape sequences. | ||
574 | * | ||
575 | * Input: | ||
576 | * buf: | ||
577 | * domain: client domain name | ||
578 | * path: export pathname | ||
579 | * maxsize: numeric maximum size of | ||
580 | * @buf | ||
581 | * size: length of C string in @buf | ||
582 | * Output: | ||
583 | * On success: passed-in buffer filled with '\n'-terminated C | ||
584 | * string containing a ASCII hex text version | ||
585 | * of the NFS file handle; | ||
586 | * return code is the size in bytes of the string | ||
587 | * On error: return code is negative errno value | ||
588 | */ | ||
369 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size) | 589 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size) |
370 | { | 590 | { |
371 | /* request is: | ||
372 | * domain path maxsize | ||
373 | * response is | ||
374 | * filehandle | ||
375 | * | ||
376 | * qword quoting is used, so filehandle will be \x.... | ||
377 | */ | ||
378 | char *dname, *path; | 591 | char *dname, *path; |
379 | int uninitialized_var(maxsize); | 592 | int uninitialized_var(maxsize); |
380 | char *mesg = buf; | 593 | char *mesg = buf; |
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) | |||
391 | 604 | ||
392 | dname = mesg; | 605 | dname = mesg; |
393 | len = qword_get(&mesg, dname, size); | 606 | len = qword_get(&mesg, dname, size); |
394 | if (len <= 0) return -EINVAL; | 607 | if (len <= 0) |
608 | return -EINVAL; | ||
395 | 609 | ||
396 | path = dname+len+1; | 610 | path = dname+len+1; |
397 | len = qword_get(&mesg, path, size); | 611 | len = qword_get(&mesg, path, size); |
398 | if (len <= 0) return -EINVAL; | 612 | if (len <= 0) |
613 | return -EINVAL; | ||
399 | 614 | ||
400 | len = get_int(&mesg, &maxsize); | 615 | len = get_int(&mesg, &maxsize); |
401 | if (len) | 616 | if (len) |
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) | |||
419 | if (len) | 634 | if (len) |
420 | return len; | 635 | return len; |
421 | 636 | ||
422 | mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; | 637 | mesg = buf; |
638 | len = SIMPLE_TRANSACTION_LIMIT; | ||
423 | qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); | 639 | qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); |
424 | mesg[-1] = '\n'; | 640 | mesg[-1] = '\n'; |
425 | return mesg - buf; | 641 | return mesg - buf; |
426 | } | 642 | } |
427 | 643 | ||
644 | /** | ||
645 | * write_threads - Start NFSD, or report the current number of running threads | ||
646 | * | ||
647 | * Input: | ||
648 | * buf: ignored | ||
649 | * size: zero | ||
650 | * Output: | ||
651 | * On success: passed-in buffer filled with '\n'-terminated C | ||
652 | * string numeric value representing the number of | ||
653 | * running NFSD threads; | ||
654 | * return code is the size in bytes of the string | ||
655 | * On error: return code is zero | ||
656 | * | ||
657 | * OR | ||
658 | * | ||
659 | * Input: | ||
660 | * buf: C string containing an unsigned | ||
661 | * integer value representing the | ||
662 | * number of NFSD threads to start | ||
663 | * size: non-zero length of C string in @buf | ||
664 | * Output: | ||
665 | * On success: NFS service is started; | ||
666 | * passed-in buffer filled with '\n'-terminated C | ||
667 | * string numeric value representing the number of | ||
668 | * running NFSD threads; | ||
669 | * return code is the size in bytes of the string | ||
670 | * On error: return code is zero or a negative errno value | ||
671 | */ | ||
428 | static ssize_t write_threads(struct file *file, char *buf, size_t size) | 672 | static ssize_t write_threads(struct file *file, char *buf, size_t size) |
429 | { | 673 | { |
430 | /* if size > 0, look for a number of threads and call nfsd_svc | ||
431 | * then write out number of threads as reply | ||
432 | */ | ||
433 | char *mesg = buf; | 674 | char *mesg = buf; |
434 | int rv; | 675 | int rv; |
435 | if (size > 0) { | 676 | if (size > 0) { |
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) | |||
437 | rv = get_int(&mesg, &newthreads); | 678 | rv = get_int(&mesg, &newthreads); |
438 | if (rv) | 679 | if (rv) |
439 | return rv; | 680 | return rv; |
440 | if (newthreads <0) | 681 | if (newthreads < 0) |
441 | return -EINVAL; | 682 | return -EINVAL; |
442 | rv = nfsd_svc(2049, newthreads); | 683 | rv = nfsd_svc(NFS_PORT, newthreads); |
443 | if (rv) | 684 | if (rv) |
444 | return rv; | 685 | return rv; |
445 | } | 686 | } |
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) | |||
447 | return strlen(buf); | 688 | return strlen(buf); |
448 | } | 689 | } |
449 | 690 | ||
691 | /** | ||
692 | * write_pool_threads - Set or report the current number of threads per pool | ||
693 | * | ||
694 | * Input: | ||
695 | * buf: ignored | ||
696 | * size: zero | ||
697 | * | ||
698 | * OR | ||
699 | * | ||
700 | * Input: | ||
701 | * buf: C string containing whitespace- | ||
702 | * separated unsigned integer values | ||
703 | * representing the number of NFSD | ||
704 | * threads to start in each pool | ||
705 | * size: non-zero length of C string in @buf | ||
706 | * Output: | ||
707 | * On success: passed-in buffer filled with '\n'-terminated C | ||
708 | * string containing integer values representing the | ||
709 | * number of NFSD threads in each pool; | ||
710 | * return code is the size in bytes of the string | ||
711 | * On error: return code is zero or a negative errno value | ||
712 | */ | ||
450 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) | 713 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) |
451 | { | 714 | { |
452 | /* if size > 0, look for an array of number of threads per node | 715 | /* if size > 0, look for an array of number of threads per node |
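
Note on the two write_*threads handlers: these are transaction files, so a write carries the request and the same buffer comes back holding the '\n'-terminated reply; a read on a freshly opened file reports the current value (the nfsctl read path issues a zero-length write first). A sketch against /proc/fs/nfsd/threads (path assumed):

        #include <stdio.h>

        int main(void)
        {
                char reply[64];
                FILE *f;

                /* Set the thread count: write an unsigned integer string. */
                f = fopen("/proc/fs/nfsd/threads", "w");
                if (!f) {
                        perror("threads");
                        return 1;
                }
                fputs("8\n", f);
                fclose(f);

                /* Report: a plain read returns the running-thread count. */
                f = fopen("/proc/fs/nfsd/threads", "r");
                if (f && fgets(reply, sizeof(reply), f))
                        printf("nfsd threads: %s", reply);
                if (f)
                        fclose(f);
                return 0;
        }
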
@@ -517,10 +780,6 @@ out_free: | |||
517 | 780 | ||
518 | static ssize_t __write_versions(struct file *file, char *buf, size_t size) | 781 | static ssize_t __write_versions(struct file *file, char *buf, size_t size) |
519 | { | 782 | { |
520 | /* | ||
521 | * Format: | ||
522 | * [-/+]vers [-/+]vers ... | ||
523 | */ | ||
524 | char *mesg = buf; | 783 | char *mesg = buf; |
525 | char *vers, sign; | 784 | char *vers, sign; |
526 | int len, num; | 785 | int len, num; |
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) | |||
578 | return len; | 837 | return len; |
579 | } | 838 | } |
580 | 839 | ||
840 | /** | ||
841 | * write_versions - Set or report the available NFS protocol versions | ||
842 | * | ||
843 | * Input: | ||
844 | * buf: ignored | ||
845 | * size: zero | ||
846 | * Output: | ||
847 | * On success: passed-in buffer filled with '\n'-terminated C | ||
848 | * string containing positive or negative integer | ||
849 | * values representing the current status of each | ||
850 | * protocol version; | ||
851 | * return code is the size in bytes of the string | ||
852 | * On error: return code is zero or a negative errno value | ||
853 | * | ||
854 | * OR | ||
855 | * | ||
856 | * Input: | ||
857 | * buf: C string containing whitespace- | ||
858 | * separated positive or negative | ||
859 | * integer values representing NFS | ||
860 | * protocol versions to enable ("+n") | ||
861 | * or disable ("-n") | ||
862 | * size: non-zero length of C string in @buf | ||
863 | * Output: | ||
864 | * On success: status of zero or more protocol versions has | ||
865 | * been updated; passed-in buffer filled with | ||
866 | * '\n'-terminated C string containing positive | ||
867 | * or negative integer values representing the | ||
868 | * current status of each protocol version; | ||
869 | * return code is the size in bytes of the string | ||
870 | * On error: return code is zero or a negative errno value | ||
871 | */ | ||
581 | static ssize_t write_versions(struct file *file, char *buf, size_t size) | 872 | static ssize_t write_versions(struct file *file, char *buf, size_t size) |
582 | { | 873 | { |
583 | ssize_t rv; | 874 | ssize_t rv; |
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) | |||
687 | return -EINVAL; | 978 | return -EINVAL; |
688 | } | 979 | } |
689 | 980 | ||
981 | /** | ||
982 | * write_ports - Pass a socket file descriptor or transport name to listen on | ||
983 | * | ||
984 | * Input: | ||
985 | * buf: ignored | ||
986 | * size: zero | ||
987 | * Output: | ||
988 | * On success: passed-in buffer filled with a '\n'-terminated C | ||
989 | * string containing a whitespace-separated list of | ||
990 | * named NFSD listeners; | ||
991 | * return code is the size in bytes of the string | ||
992 | * On error: return code is zero or a negative errno value | ||
993 | * | ||
994 | * OR | ||
995 | * | ||
996 | * Input: | ||
997 | * buf: C string containing an unsigned | ||
998 | * integer value representing a bound | ||
999 | * but unconnected socket that is to be | ||
1000 | * used as an NFSD listener | ||
1001 | * size: non-zero length of C string in @buf | ||
1002 | * Output: | ||
1003 | * On success: NFS service is started; | ||
1004 | * passed-in buffer filled with a '\n'-terminated C | ||
1005 | * string containing a unique alphanumeric name of | ||
1006 | * the listener; | ||
1007 | * return code is the size in bytes of the string | ||
1008 | * On error: return code is a negative errno value | ||
1009 | * | ||
1010 | * OR | ||
1011 | * | ||
1012 | * Input: | ||
1013 | * buf: C string containing a "-" followed | ||
1014 | * by an integer value representing a | ||
1015 | * previously passed in socket file | ||
1016 | * descriptor | ||
1017 | * size: non-zero length of C string in @buf | ||
1018 | * Output: | ||
1019 | * On success: NFS service no longer listens on that socket; | ||
1020 | * passed-in buffer filled with a '\n'-terminated C | ||
1021 | * string containing a unique name of the listener; | ||
1022 | * return code is the size in bytes of the string | ||
1023 | * On error: return code is a negative errno value | ||
1024 | * | ||
1025 | * OR | ||
1026 | * | ||
1027 | * Input: | ||
1028 | * buf: C string containing a transport | ||
1029 | * name and an unsigned integer value | ||
1030 | * representing the port to listen on, | ||
1031 | * separated by whitespace | ||
1032 | * size: non-zero length of C string in @buf | ||
1033 | * Output: | ||
1034 | * On success: returns zero; NFS service is started | ||
1035 | * On error: return code is a negative errno value | ||
1036 | * | ||
1037 | * OR | ||
1038 | * | ||
1039 | * Input: | ||
1040 | * buf: C string containing a "-" followed | ||
1041 | * by a transport name and an unsigned | ||
1042 | * integer value representing the port | ||
1043 | * to listen on, separated by whitespace | ||
1044 | * size: non-zero length of C string in @buf | ||
1045 | * Output: | ||
1046 | * On success: returns zero; NFS service no longer listens | ||
1047 | * on that transport | ||
1048 | * On error: return code is a negative errno value | ||
1049 | */ | ||
690 | static ssize_t write_ports(struct file *file, char *buf, size_t size) | 1050 | static ssize_t write_ports(struct file *file, char *buf, size_t size) |
691 | { | 1051 | { |
692 | ssize_t rv; | 1052 | ssize_t rv; |
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) | |||
700 | 1060 | ||
701 | int nfsd_max_blksize; | 1061 | int nfsd_max_blksize; |
702 | 1062 | ||
1063 | /** | ||
1064 | * write_maxblksize - Set or report the current NFS blksize | ||
1065 | * | ||
1066 | * Input: | ||
1067 | * buf: ignored | ||
1068 | * size: zero | ||
1069 | * | ||
1070 | * OR | ||
1071 | * | ||
1072 | * Input: | ||
1073 | * buf: C string containing an unsigned | ||
1074 | * integer value representing the new | ||
1075 | * NFS blksize | ||
1076 | * size: non-zero length of C string in @buf | ||
1077 | * Output: | ||
1078 | * On success: passed-in buffer filled with '\n'-terminated C string | ||
1079 | * containing numeric value of the current NFS blksize | ||
1080 | * setting; | ||
1081 | * return code is the size in bytes of the string | ||
1082 | * On error: return code is zero or a negative errno value | ||
1083 | */ | ||
703 | static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) | 1084 | static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) |
704 | { | 1085 | { |
705 | char *mesg = buf; | 1086 | char *mesg = buf; |
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) | |||
752 | return strlen(buf); | 1133 | return strlen(buf); |
753 | } | 1134 | } |
754 | 1135 | ||
1136 | /** | ||
1137 | * write_leasetime - Set or report the current NFSv4 lease time | ||
1138 | * | ||
1139 | * Input: | ||
1140 | * buf: ignored | ||
1141 | * size: zero | ||
1142 | * | ||
1143 | * OR | ||
1144 | * | ||
1145 | * Input: | ||
1146 | * buf: C string containing an unsigned | ||
1147 | * integer value representing the new | ||
1148 | * NFSv4 lease expiry time | ||
1149 | * size: non-zero length of C string in @buf | ||
1150 | * Output: | ||
1151 | * On success: passed-in buffer filled with '\n'-terminated C | ||
1152 | * string containing unsigned integer value of the | ||
1153 | * current lease expiry time; | ||
1154 | * return code is the size in bytes of the string | ||
1155 | * On error: return code is zero or a negative errno value | ||
1156 | */ | ||
755 | static ssize_t write_leasetime(struct file *file, char *buf, size_t size) | 1157 | static ssize_t write_leasetime(struct file *file, char *buf, size_t size) |
756 | { | 1158 | { |
757 | ssize_t rv; | 1159 | ssize_t rv; |
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) | |||
788 | return strlen(buf); | 1190 | return strlen(buf); |
789 | } | 1191 | } |
790 | 1192 | ||
1193 | /** | ||
1194 | * write_recoverydir - Set or report the pathname of the recovery directory | ||
1195 | * | ||
1196 | * Input: | ||
1197 | * buf: ignored | ||
1198 | * size: zero | ||
1199 | * | ||
1200 | * OR | ||
1201 | * | ||
1202 | * Input: | ||
1203 | * buf: C string containing the pathname | ||
1204 | * of the directory on a local file | ||
1205 | * system containing permanent NFSv4 | ||
1206 | * recovery data | ||
1207 | * size: non-zero length of C string in @buf | ||
1208 | * Output: | ||
1209 | * On success: passed-in buffer filled with '\n'-terminated C string | ||
1210 | * containing the current recovery pathname setting; | ||
1211 | * return code is the size in bytes of the string | ||
1212 | * On error: return code is zero or a negative errno value | ||
1213 | */ | ||
791 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) | 1214 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) |
792 | { | 1215 | { |
793 | ssize_t rv; | 1216 | ssize_t rv; |
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f0da7d9c3a92..9f1ca17293d3 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c | |||
@@ -258,14 +258,32 @@ out: | |||
258 | return error; | 258 | return error; |
259 | } | 259 | } |
260 | 260 | ||
261 | /* | 261 | /** |
262 | * Perform sanity checks on the dentry in a client's file handle. | 262 | * fh_verify - filehandle lookup and access checking |
263 | * @rqstp: pointer to current rpc request | ||
264 | * @fhp: filehandle to be verified | ||
265 | * @type: expected type of object pointed to by filehandle | ||
266 | * @access: type of access needed to object | ||
267 | * | ||
268 | * Look up a dentry from the on-the-wire filehandle, check the client's | ||
269 | * access to the export, and set the current task's credentials. | ||
270 | * | ||
271 | * Regardless of success or failure of fh_verify(), fh_put() should be | ||
272 | * called on @fhp when the caller is finished with the filehandle. | ||
263 | * | 273 | * |
264 | * Note that the file handle dentry may need to be freed even after | 274 | * fh_verify() may be called multiple times on a given filehandle, for |
265 | * an error return. | 275 | * example, when processing an NFSv4 compound. The first call will look |
276 | * up a dentry using the on-the-wire filehandle. Subsequent calls will | ||
277 | * skip the lookup and just perform the other checks and possibly change | ||
278 | * the current task's credentials. | ||
266 | * | 279 | * |
267 | * This is only called at the start of an nfsproc call, so fhp points to | 280 | * @type specifies the type of object expected using one of the S_IF* |
268 | * a svc_fh which is all 0 except for the over-the-wire file handle. | 281 | * constants defined in include/linux/stat.h. The caller may use zero |
282 | * to indicate that it doesn't care, or a negative integer to indicate | ||
283 | * that it expects something not of the given type. | ||
284 | * | ||
285 | * @access is formed from the NFSD_MAY_* constants defined in | ||
286 | * include/linux/nfsd/nfsd.h. | ||
269 | */ | 287 | */ |
270 | __be32 | 288 | __be32 |
271 | fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) | 289 | fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) |
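
A hypothetical caller, following the contract the new fh_verify() kernel-doc spells out (kernel-context sketch, not buildable on its own; everything except fh_verify(), fh_put(), S_IFREG and NFSD_MAY_READ is illustrative): verify type and access up front, and release the filehandle whether or not verification succeeded.

        static __be32 nfsd_read_thing(struct svc_rqst *rqstp, struct svc_fh *fhp)
        {
                __be32 status;

                status = fh_verify(rqstp, fhp, S_IFREG, NFSD_MAY_READ);
                if (status)
                        goto out;
                /* ... operate on fhp->fh_dentry under checked creds ... */
        out:
                fh_put(fhp);    /* required on success and failure alike */
                return status;
        }
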
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, | |||
466 | goto retry; | 484 | goto retry; |
467 | break; | 485 | break; |
468 | } | 486 | } |
487 | } else if (exp->ex_flags & NFSEXP_FSID) { | ||
488 | fsid_type = FSID_NUM; | ||
469 | } else if (exp->ex_uuid) { | 489 | } else if (exp->ex_uuid) { |
470 | if (fhp->fh_maxsize >= 64) { | 490 | if (fhp->fh_maxsize >= 64) { |
471 | if (root_export) | 491 | if (root_export) |
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, | |||
478 | else | 498 | else |
479 | fsid_type = FSID_UUID4_INUM; | 499 | fsid_type = FSID_UUID4_INUM; |
480 | } | 500 | } |
481 | } else if (exp->ex_flags & NFSEXP_FSID) | 501 | } else if (!old_valid_dev(ex_dev)) |
482 | fsid_type = FSID_NUM; | ||
483 | else if (!old_valid_dev(ex_dev)) | ||
484 | /* for newer device numbers, we must use a newer fsid format */ | 502 | /* for newer device numbers, we must use a newer fsid format */ |
485 | fsid_type = FSID_ENCODE_DEV; | 503 | fsid_type = FSID_ENCODE_DEV; |
486 | else | 504 | else |
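
Note on the fh_compose() reorder: an explicit fsid= export option (NFSEXP_FSID) now takes precedence over a filesystem UUID when choosing the filehandle's fsid type; previously the UUID branch was consulted first. Condensed, the new selection ladder reads (kernel-context sketch; the reference-filehandle case above it is elided, and the UUID branch actually picks among several FSID_UUID* types):

        if (exp->ex_flags & NFSEXP_FSID)        /* admin pinned an fsid= */
                fsid_type = FSID_NUM;
        else if (exp->ex_uuid)                  /* filesystem UUID forms */
                fsid_type = FSID_UUID8;
        else if (!old_valid_dev(ex_dev))        /* large dev_t, new format */
                fsid_type = FSID_ENCODE_DEV;
        else
                fsid_type = FSID_DEV;
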
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 5cffeca7acef..6f7f26351227 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c | |||
@@ -622,6 +622,7 @@ nfserrno (int errno) | |||
622 | { nfserr_badname, -ESRCH }, | 622 | { nfserr_badname, -ESRCH }, |
623 | { nfserr_io, -ETXTBSY }, | 623 | { nfserr_io, -ETXTBSY }, |
624 | { nfserr_notsupp, -EOPNOTSUPP }, | 624 | { nfserr_notsupp, -EOPNOTSUPP }, |
625 | { nfserr_toosmall, -ETOOSMALL }, | ||
625 | }; | 626 | }; |
626 | int i; | 627 | int i; |
627 | 628 | ||
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 44aa92aba891..6e50aaa56ca2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
@@ -744,16 +744,44 @@ nfsd_close(struct file *filp) | |||
744 | fput(filp); | 744 | fput(filp); |
745 | } | 745 | } |
746 | 746 | ||
747 | /* | ||
748 | * Sync a file | ||
749 | * As this calls fsync (not fdatasync) there is no need for a write_inode | ||
750 | * after it. | ||
751 | */ | ||
752 | static inline int nfsd_dosync(struct file *filp, struct dentry *dp, | ||
753 | const struct file_operations *fop) | ||
754 | { | ||
755 | struct inode *inode = dp->d_inode; | ||
756 | int (*fsync) (struct file *, struct dentry *, int); | ||
757 | int err; | ||
758 | |||
759 | err = filemap_fdatawrite(inode->i_mapping); | ||
760 | if (err == 0 && fop && (fsync = fop->fsync)) | ||
761 | err = fsync(filp, dp, 0); | ||
762 | if (err == 0) | ||
763 | err = filemap_fdatawait(inode->i_mapping); | ||
764 | |||
765 | return err; | ||
766 | } | ||
767 | |||
747 | static int | 768 | static int |
748 | nfsd_sync(struct file *filp) | 769 | nfsd_sync(struct file *filp) |
749 | { | 770 | { |
750 | return vfs_fsync(filp, filp->f_path.dentry, 0); | 771 | int err; |
772 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
773 | dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); | ||
774 | mutex_lock(&inode->i_mutex); | ||
775 | err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); | ||
776 | mutex_unlock(&inode->i_mutex); | ||
777 | |||
778 | return err; | ||
751 | } | 779 | } |
752 | 780 | ||
753 | int | 781 | int |
754 | nfsd_sync_dir(struct dentry *dentry) | 782 | nfsd_sync_dir(struct dentry *dp) |
755 | { | 783 | { |
756 | return vfs_fsync(NULL, dentry, 0); | 784 | return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); |
757 | } | 785 | } |
758 | 786 | ||
759 | /* | 787 | /* |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 54ff4c77aaa3..d861096c9d81 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -3868,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode, | |||
3868 | struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; | 3868 | struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; |
3869 | struct ocfs2_extent_rec *rec, *tmprec; | 3869 | struct ocfs2_extent_rec *rec, *tmprec; |
3870 | 3870 | ||
3871 | right_el = path_leaf_el(right_path);; | 3871 | right_el = path_leaf_el(right_path); |
3872 | if (left_path) | 3872 | if (left_path) |
3873 | left_el = path_leaf_el(left_path); | 3873 | left_el = path_leaf_el(left_path); |
3874 | 3874 | ||
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6ebaa58e2c03..04697ba7f73e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data) | |||
854 | 854 | ||
855 | while (!kthread_should_stop() && !reg->hr_unclean_stop) { | 855 | while (!kthread_should_stop() && !reg->hr_unclean_stop) { |
856 | /* We track the time spent inside | 856 | /* We track the time spent inside |
857 | * o2hb_do_disk_heartbeat so that we avoid more then | 857 | * o2hb_do_disk_heartbeat so that we avoid more than |
858 | * hr_timeout_ms between disk writes. On busy systems | 858 | * hr_timeout_ms between disk writes. On busy systems |
859 | * this should result in a heartbeat which is less | 859 | * this should result in a heartbeat which is less |
860 | * likely to time itself out. */ | 860 | * likely to time itself out. */ |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index f731ab491795..b0c4cadd4c45 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -1324,7 +1324,7 @@ again: | |||
1324 | goto out; | 1324 | goto out; |
1325 | } | 1325 | } |
1326 | 1326 | ||
1327 | mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", | 1327 | mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", |
1328 | lockres->l_name); | 1328 | lockres->l_name); |
1329 | 1329 | ||
1330 | /* At this point we've gone inside the dlm and need to | 1330 | /* At this point we've gone inside the dlm and need to |
@@ -2951,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, | |||
2951 | ocfs2_dlm_dump_lksb(&lockres->l_lksb); | 2951 | ocfs2_dlm_dump_lksb(&lockres->l_lksb); |
2952 | BUG(); | 2952 | BUG(); |
2953 | } | 2953 | } |
2954 | mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", | 2954 | mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", |
2955 | lockres->l_name); | 2955 | lockres->l_name); |
2956 | 2956 | ||
2957 | ocfs2_wait_on_busy_lock(lockres); | 2957 | ocfs2_wait_on_busy_lock(lockres); |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e8f795f978aa..a5887df2cd8a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1605,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, | |||
1605 | struct ocfs2_space_resv *sr) | 1605 | struct ocfs2_space_resv *sr) |
1606 | { | 1606 | { |
1607 | struct inode *inode = file->f_path.dentry->d_inode; | 1607 | struct inode *inode = file->f_path.dentry->d_inode; |
1608 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; | 1608 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1609 | 1609 | ||
1610 | if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && | 1610 | if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && |
1611 | !ocfs2_writes_unwritten_extents(osb)) | 1611 | !ocfs2_writes_unwritten_extents(osb)) |
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d5b213b8a9b..6d720243f5f4 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c | |||
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno) | |||
334 | 334 | ||
335 | blk_free_devt(part_devt(part)); | 335 | blk_free_devt(part_devt(part)); |
336 | rcu_assign_pointer(ptbl->part[partno], NULL); | 336 | rcu_assign_pointer(ptbl->part[partno], NULL); |
337 | rcu_assign_pointer(ptbl->last_lookup, NULL); | ||
337 | kobject_put(part->holder_dir); | 338 | kobject_put(part->holder_dir); |
338 | device_del(part_to_dev(part)); | 339 | device_del(part_to_dev(part)); |
339 | 340 | ||
@@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, | |||
384 | 385 | ||
385 | dname = dev_name(ddev); | 386 | dname = dev_name(ddev); |
386 | if (isdigit(dname[strlen(dname) - 1])) | 387 | if (isdigit(dname[strlen(dname) - 1])) |
387 | snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); | 388 | dev_set_name(pdev, "%sp%d", dname, partno); |
388 | else | 389 | else |
389 | snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); | 390 | dev_set_name(pdev, "%s%d", dname, partno); |
390 | 391 | ||
391 | device_initialize(pdev); | 392 | device_initialize(pdev); |
392 | pdev->class = &block_class; | 393 | pdev->class = &block_class; |
@@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk) | |||
447 | struct block_device *bdev; | 448 | struct block_device *bdev; |
448 | struct disk_part_iter piter; | 449 | struct disk_part_iter piter; |
449 | struct hd_struct *part; | 450 | struct hd_struct *part; |
450 | char *s; | ||
451 | int err; | 451 | int err; |
452 | 452 | ||
453 | ddev->parent = disk->driverfs_dev; | 453 | ddev->parent = disk->driverfs_dev; |
454 | 454 | ||
455 | strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); | 455 | dev_set_name(ddev, disk->disk_name); |
456 | /* ewww... some of these buggers have / in the name... */ | ||
457 | s = strchr(ddev->bus_id, '/'); | ||
458 | if (s) | ||
459 | *s = '!'; | ||
460 | 456 | ||
461 | /* delay uevents, until we scanned partition table */ | 457 | /* delay uevents, until we scanned partition table */ |
462 | ddev->uevent_suppress = 1; | 458 | ddev->uevent_suppress = 1; |
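
Two separate fixes in partitions/check.c: delete_partition() now also clears the partition table's one-entry last_lookup cache, which could otherwise hand out a pointer to the just-deleted partition; and the bus_id snprintf/strlcpy calls become dev_set_name(), with the '/'-to-'!' name mangling assumed to have moved into the kobject core (the removed "ewww" strchr() hack otherwise has no replacement here). The cache hazard, in a standalone sketch:

        #include <stdlib.h>

        /* Standalone sketch of the stale-cache hazard the added line
         * closes: a one-entry lookup cache must be invalidated when its
         * target dies. */
        struct part {
                int partno;
        };

        struct ptbl {
                struct part *part[16];
                struct part *last_lookup;       /* cache of the last hit */
        };

        static void delete_partition(struct ptbl *t, int i)
        {
                struct part *p = t->part[i];

                t->part[i] = NULL;
                t->last_lookup = NULL;  /* without this, a later fast-path
                                         * lookup could return freed memory */
                free(p);
        }

        int main(void)
        {
                struct ptbl t = { .part = { NULL }, .last_lookup = NULL };

                t.part[1] = malloc(sizeof(*t.part[1]));
                t.last_lookup = t.part[1];
                delete_partition(&t, 1);
                return t.last_lookup == NULL ? 0 : 1;
        }
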
diff --git a/fs/proc/base.c b/fs/proc/base.c index 10fd5223d600..0c9de19a1633 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -65,6 +65,7 @@ | |||
65 | #include <linux/mm.h> | 65 | #include <linux/mm.h> |
66 | #include <linux/rcupdate.h> | 66 | #include <linux/rcupdate.h> |
67 | #include <linux/kallsyms.h> | 67 | #include <linux/kallsyms.h> |
68 | #include <linux/stacktrace.h> | ||
68 | #include <linux/resource.h> | 69 | #include <linux/resource.h> |
69 | #include <linux/module.h> | 70 | #include <linux/module.h> |
70 | #include <linux/mount.h> | 71 | #include <linux/mount.h> |
@@ -109,25 +110,22 @@ struct pid_entry { | |||
109 | .op = OP, \ | 110 | .op = OP, \ |
110 | } | 111 | } |
111 | 112 | ||
112 | #define DIR(NAME, MODE, OTYPE) \ | 113 | #define DIR(NAME, MODE, iops, fops) \ |
113 | NOD(NAME, (S_IFDIR|(MODE)), \ | 114 | NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) |
114 | &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ | 115 | #define LNK(NAME, get_link) \ |
115 | {} ) | ||
116 | #define LNK(NAME, OTYPE) \ | ||
117 | NOD(NAME, (S_IFLNK|S_IRWXUGO), \ | 116 | NOD(NAME, (S_IFLNK|S_IRWXUGO), \ |
118 | &proc_pid_link_inode_operations, NULL, \ | 117 | &proc_pid_link_inode_operations, NULL, \ |
119 | { .proc_get_link = &proc_##OTYPE##_link } ) | 118 | { .proc_get_link = get_link } ) |
120 | #define REG(NAME, MODE, OTYPE) \ | 119 | #define REG(NAME, MODE, fops) \ |
121 | NOD(NAME, (S_IFREG|(MODE)), NULL, \ | 120 | NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) |
122 | &proc_##OTYPE##_operations, {}) | 121 | #define INF(NAME, MODE, read) \ |
123 | #define INF(NAME, MODE, OTYPE) \ | ||
124 | NOD(NAME, (S_IFREG|(MODE)), \ | 122 | NOD(NAME, (S_IFREG|(MODE)), \ |
125 | NULL, &proc_info_file_operations, \ | 123 | NULL, &proc_info_file_operations, \ |
126 | { .proc_read = &proc_##OTYPE } ) | 124 | { .proc_read = read } ) |
127 | #define ONE(NAME, MODE, OTYPE) \ | 125 | #define ONE(NAME, MODE, show) \ |
128 | NOD(NAME, (S_IFREG|(MODE)), \ | 126 | NOD(NAME, (S_IFREG|(MODE)), \ |
129 | NULL, &proc_single_file_operations, \ | 127 | NULL, &proc_single_file_operations, \ |
130 | { .proc_show = &proc_##OTYPE } ) | 128 | { .proc_show = show } ) |
131 | 129 | ||
132 | /* | 130 | /* |
133 | * Count the number of hardlinks for the pid_entry table, excluding the . | 131 | * Count the number of hardlinks for the pid_entry table, excluding the . |
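
Note on the pid_entry macro rework: it trades token pasting for explicit arguments. DIR/LNK/REG/INF/ONE previously glued "proc_" and the OTYPE token into operation names, so the resulting identifiers appeared nowhere in the source as written and were invisible to grep and cross-reference tools; now each entry names its inode/file operations (or read/show callback) directly, as the table updates further down show. Before and after, for one entry from this patch:

        /* before: proc_environ_operations is assembled by the
         * preprocessor behind the reader's back */
        REG("environ", S_IRUSR, environ)

        /* after: the operations structure is spelled out at the call site */
        REG("environ", S_IRUSR, proc_environ_operations)
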
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer) | |||
308 | struct mm_struct *mm = get_task_mm(task); | 306 | struct mm_struct *mm = get_task_mm(task); |
309 | if (mm) { | 307 | if (mm) { |
310 | unsigned int nwords = 0; | 308 | unsigned int nwords = 0; |
311 | do | 309 | do { |
312 | nwords += 2; | 310 | nwords += 2; |
313 | while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ | 311 | } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ |
314 | res = nwords * sizeof(mm->saved_auxv[0]); | 312 | res = nwords * sizeof(mm->saved_auxv[0]); |
315 | if (res > PAGE_SIZE) | 313 | if (res > PAGE_SIZE) |
316 | res = PAGE_SIZE; | 314 | res = PAGE_SIZE; |
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) | |||
340 | } | 338 | } |
341 | #endif /* CONFIG_KALLSYMS */ | 339 | #endif /* CONFIG_KALLSYMS */ |
342 | 340 | ||
341 | #ifdef CONFIG_STACKTRACE | ||
342 | |||
343 | #define MAX_STACK_TRACE_DEPTH 64 | ||
344 | |||
345 | static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, | ||
346 | struct pid *pid, struct task_struct *task) | ||
347 | { | ||
348 | struct stack_trace trace; | ||
349 | unsigned long *entries; | ||
350 | int i; | ||
351 | |||
352 | entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); | ||
353 | if (!entries) | ||
354 | return -ENOMEM; | ||
355 | |||
356 | trace.nr_entries = 0; | ||
357 | trace.max_entries = MAX_STACK_TRACE_DEPTH; | ||
358 | trace.entries = entries; | ||
359 | trace.skip = 0; | ||
360 | save_stack_trace_tsk(task, &trace); | ||
361 | |||
362 | for (i = 0; i < trace.nr_entries; i++) { | ||
363 | seq_printf(m, "[<%p>] %pS\n", | ||
364 | (void *)entries[i], (void *)entries[i]); | ||
365 | } | ||
366 | kfree(entries); | ||
367 | |||
368 | return 0; | ||
369 | } | ||
370 | #endif | ||
371 | |||
343 | #ifdef CONFIG_SCHEDSTATS | 372 | #ifdef CONFIG_SCHEDSTATS |
344 | /* | 373 | /* |
345 | * Provides /proc/PID/schedstat | 374 | * Provides /proc/PID/schedstat |
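
Note on proc_pid_stack(): it backs a new per-task stack-trace file under CONFIG_STACKTRACE (conventionally exposed as /proc/<pid>/stack; the table entry itself falls outside this excerpt). It snapshots up to 64 return addresses with save_stack_trace_tsk() and prints each as a raw address plus its %pS-resolved symbol. Reading it from user space:

        #include <stdio.h>

        int main(void)
        {
                char line[256];
                /* pid 1 chosen arbitrarily; needs CONFIG_STACKTRACE and
                 * sufficient privilege (the entry is S_IRUSR). */
                FILE *f = fopen("/proc/1/stack", "r");

                if (!f) {
                        perror("/proc/1/stack");
                        return 1;
                }
                while (fgets(line, sizeof(line), f))
                        fputs(line, stdout);    /* "[<addr>] symbol+0x..." */
                fclose(f);
                return 0;
        }
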
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v) | |||
1186 | struct inode *inode = m->private; | 1215 | struct inode *inode = m->private; |
1187 | struct task_struct *p; | 1216 | struct task_struct *p; |
1188 | 1217 | ||
1189 | WARN_ON(!inode); | ||
1190 | |||
1191 | p = get_proc_task(inode); | 1218 | p = get_proc_task(inode); |
1192 | if (!p) | 1219 | if (!p) |
1193 | return -ESRCH; | 1220 | return -ESRCH; |
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf, | |||
1205 | struct inode *inode = file->f_path.dentry->d_inode; | 1232 | struct inode *inode = file->f_path.dentry->d_inode; |
1206 | struct task_struct *p; | 1233 | struct task_struct *p; |
1207 | 1234 | ||
1208 | WARN_ON(!inode); | ||
1209 | |||
1210 | p = get_proc_task(inode); | 1235 | p = get_proc_task(inode); |
1211 | if (!p) | 1236 | if (!p) |
1212 | return -ESRCH; | 1237 | return -ESRCH; |
@@ -1974,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, | |||
1974 | const struct pid_entry *ents, | 1999 | const struct pid_entry *ents, |
1975 | unsigned int nents) | 2000 | unsigned int nents) |
1976 | { | 2001 | { |
1977 | struct inode *inode; | ||
1978 | struct dentry *error; | 2002 | struct dentry *error; |
1979 | struct task_struct *task = get_proc_task(dir); | 2003 | struct task_struct *task = get_proc_task(dir); |
1980 | const struct pid_entry *p, *last; | 2004 | const struct pid_entry *p, *last; |
1981 | 2005 | ||
1982 | error = ERR_PTR(-ENOENT); | 2006 | error = ERR_PTR(-ENOENT); |
1983 | inode = NULL; | ||
1984 | 2007 | ||
1985 | if (!task) | 2008 | if (!task) |
1986 | goto out_no_task; | 2009 | goto out_no_task; |
@@ -2136,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = { | |||
2136 | }; | 2159 | }; |
2137 | 2160 | ||
2138 | static const struct pid_entry attr_dir_stuff[] = { | 2161 | static const struct pid_entry attr_dir_stuff[] = { |
2139 | REG("current", S_IRUGO|S_IWUGO, pid_attr), | 2162 | REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2140 | REG("prev", S_IRUGO, pid_attr), | 2163 | REG("prev", S_IRUGO, proc_pid_attr_operations), |
2141 | REG("exec", S_IRUGO|S_IWUGO, pid_attr), | 2164 | REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2142 | REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), | 2165 | REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2143 | REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), | 2166 | REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2144 | REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), | 2167 | REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2145 | }; | 2168 | }; |
2146 | 2169 | ||
2147 | static int proc_attr_dir_readdir(struct file * filp, | 2170 | static int proc_attr_dir_readdir(struct file * filp, |
@@ -2461,74 +2484,77 @@ static const struct file_operations proc_task_operations; | |||
2461 | static const struct inode_operations proc_task_inode_operations; | 2484 | static const struct inode_operations proc_task_inode_operations; |
2462 | 2485 | ||
2463 | static const struct pid_entry tgid_base_stuff[] = { | 2486 | static const struct pid_entry tgid_base_stuff[] = { |
2464 | DIR("task", S_IRUGO|S_IXUGO, task), | 2487 | DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), |
2465 | DIR("fd", S_IRUSR|S_IXUSR, fd), | 2488 | DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), |
2466 | DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), | 2489 | DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), |
2467 | #ifdef CONFIG_NET | 2490 | #ifdef CONFIG_NET |
2468 | DIR("net", S_IRUGO|S_IXUGO, net), | 2491 | DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), |
2469 | #endif | 2492 | #endif |
2470 | REG("environ", S_IRUSR, environ), | 2493 | REG("environ", S_IRUSR, proc_environ_operations), |
2471 | INF("auxv", S_IRUSR, pid_auxv), | 2494 | INF("auxv", S_IRUSR, proc_pid_auxv), |
2472 | ONE("status", S_IRUGO, pid_status), | 2495 | ONE("status", S_IRUGO, proc_pid_status), |
2473 | ONE("personality", S_IRUSR, pid_personality), | 2496 | ONE("personality", S_IRUSR, proc_pid_personality), |
2474 | INF("limits", S_IRUSR, pid_limits), | 2497 | INF("limits", S_IRUSR, proc_pid_limits), |
2475 | #ifdef CONFIG_SCHED_DEBUG | 2498 | #ifdef CONFIG_SCHED_DEBUG |
2476 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), | 2499 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
2477 | #endif | 2500 | #endif |
2478 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | 2501 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK |
2479 | INF("syscall", S_IRUSR, pid_syscall), | 2502 | INF("syscall", S_IRUSR, proc_pid_syscall), |
2480 | #endif | 2503 | #endif |
2481 | INF("cmdline", S_IRUGO, pid_cmdline), | 2504 | INF("cmdline", S_IRUGO, proc_pid_cmdline), |
2482 | ONE("stat", S_IRUGO, tgid_stat), | 2505 | ONE("stat", S_IRUGO, proc_tgid_stat), |
2483 | ONE("statm", S_IRUGO, pid_statm), | 2506 | ONE("statm", S_IRUGO, proc_pid_statm), |
2484 | REG("maps", S_IRUGO, maps), | 2507 | REG("maps", S_IRUGO, proc_maps_operations), |
2485 | #ifdef CONFIG_NUMA | 2508 | #ifdef CONFIG_NUMA |
2486 | REG("numa_maps", S_IRUGO, numa_maps), | 2509 | REG("numa_maps", S_IRUGO, proc_numa_maps_operations), |
2487 | #endif | 2510 | #endif |
2488 | REG("mem", S_IRUSR|S_IWUSR, mem), | 2511 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), |
2489 | LNK("cwd", cwd), | 2512 | LNK("cwd", proc_cwd_link), |
2490 | LNK("root", root), | 2513 | LNK("root", proc_root_link), |
2491 | LNK("exe", exe), | 2514 | LNK("exe", proc_exe_link), |
2492 | REG("mounts", S_IRUGO, mounts), | 2515 | REG("mounts", S_IRUGO, proc_mounts_operations), |
2493 | REG("mountinfo", S_IRUGO, mountinfo), | 2516 | REG("mountinfo", S_IRUGO, proc_mountinfo_operations), |
2494 | REG("mountstats", S_IRUSR, mountstats), | 2517 | REG("mountstats", S_IRUSR, proc_mountstats_operations), |
2495 | #ifdef CONFIG_PROC_PAGE_MONITOR | 2518 | #ifdef CONFIG_PROC_PAGE_MONITOR |
2496 | REG("clear_refs", S_IWUSR, clear_refs), | 2519 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
2497 | REG("smaps", S_IRUGO, smaps), | 2520 | REG("smaps", S_IRUGO, proc_smaps_operations), |
2498 | REG("pagemap", S_IRUSR, pagemap), | 2521 | REG("pagemap", S_IRUSR, proc_pagemap_operations), |
2499 | #endif | 2522 | #endif |
2500 | #ifdef CONFIG_SECURITY | 2523 | #ifdef CONFIG_SECURITY |
2501 | DIR("attr", S_IRUGO|S_IXUGO, attr_dir), | 2524 | DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), |
2502 | #endif | 2525 | #endif |
2503 | #ifdef CONFIG_KALLSYMS | 2526 | #ifdef CONFIG_KALLSYMS |
2504 | INF("wchan", S_IRUGO, pid_wchan), | 2527 | INF("wchan", S_IRUGO, proc_pid_wchan), |
2528 | #endif | ||
2529 | #ifdef CONFIG_STACKTRACE | ||
2530 | ONE("stack", S_IRUSR, proc_pid_stack), | ||
2505 | #endif | 2531 | #endif |
2506 | #ifdef CONFIG_SCHEDSTATS | 2532 | #ifdef CONFIG_SCHEDSTATS |
2507 | INF("schedstat", S_IRUGO, pid_schedstat), | 2533 | INF("schedstat", S_IRUGO, proc_pid_schedstat), |
2508 | #endif | 2534 | #endif |
2509 | #ifdef CONFIG_LATENCYTOP | 2535 | #ifdef CONFIG_LATENCYTOP |
2510 | REG("latency", S_IRUGO, lstats), | 2536 | REG("latency", S_IRUGO, proc_lstats_operations), |
2511 | #endif | 2537 | #endif |
2512 | #ifdef CONFIG_PROC_PID_CPUSET | 2538 | #ifdef CONFIG_PROC_PID_CPUSET |
2513 | REG("cpuset", S_IRUGO, cpuset), | 2539 | REG("cpuset", S_IRUGO, proc_cpuset_operations), |
2514 | #endif | 2540 | #endif |
2515 | #ifdef CONFIG_CGROUPS | 2541 | #ifdef CONFIG_CGROUPS |
2516 | REG("cgroup", S_IRUGO, cgroup), | 2542 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
2517 | #endif | 2543 | #endif |
2518 | INF("oom_score", S_IRUGO, oom_score), | 2544 | INF("oom_score", S_IRUGO, proc_oom_score), |
2519 | REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), | 2545 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), |
2520 | #ifdef CONFIG_AUDITSYSCALL | 2546 | #ifdef CONFIG_AUDITSYSCALL |
2521 | REG("loginuid", S_IWUSR|S_IRUGO, loginuid), | 2547 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
2522 | REG("sessionid", S_IRUGO, sessionid), | 2548 | REG("sessionid", S_IRUGO, proc_sessionid_operations), |
2523 | #endif | 2549 | #endif |
2524 | #ifdef CONFIG_FAULT_INJECTION | 2550 | #ifdef CONFIG_FAULT_INJECTION |
2525 | REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), | 2551 | REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), |
2526 | #endif | 2552 | #endif |
2527 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) | 2553 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) |
2528 | REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), | 2554 | REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), |
2529 | #endif | 2555 | #endif |
2530 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 2556 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
2531 | INF("io", S_IRUGO, tgid_io_accounting), | 2557 | INF("io", S_IRUGO, proc_tgid_io_accounting), |
2532 | #endif | 2558 | #endif |
2533 | }; | 2559 | }; |
2534 | 2560 | ||
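Note how the new "stack" entry uses ONE() rather than INF(): ONE() binds a seq_file show callback through proc_single_file_operations, which suits proc_pid_stack() since it emits with seq_printf(), while INF() entries fill a flat page buffer via proc_info_file_operations and REG() entries supply a complete file_operations of their own. A hypothetical entry following the same conventions (proc_pid_example and the "example" file are illustrative, not part of the patch):

/* a read-only per-process file backed by one seq_file show callback */
static int proc_pid_example(struct seq_file *m, struct pid_namespace *ns,
			    struct pid *pid, struct task_struct *task)
{
	seq_printf(m, "example for pid %d\n", pid_nr_ns(pid, ns));
	return 0;
}
/* ... and in tgid_base_stuff[]: ONE("example", S_IRUGO, proc_pid_example), */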
@@ -2801,66 +2827,69 @@ out_no_task: | |||
2801 | * Tasks | 2827 | * Tasks |
2802 | */ | 2828 | */ |
2803 | static const struct pid_entry tid_base_stuff[] = { | 2829 | static const struct pid_entry tid_base_stuff[] = { |
2804 | DIR("fd", S_IRUSR|S_IXUSR, fd), | 2830 | DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), |
2805 | DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), | 2831 | DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), |
2806 | REG("environ", S_IRUSR, environ), | 2832 | REG("environ", S_IRUSR, proc_environ_operations), |
2807 | INF("auxv", S_IRUSR, pid_auxv), | 2833 | INF("auxv", S_IRUSR, proc_pid_auxv), |
2808 | ONE("status", S_IRUGO, pid_status), | 2834 | ONE("status", S_IRUGO, proc_pid_status), |
2809 | ONE("personality", S_IRUSR, pid_personality), | 2835 | ONE("personality", S_IRUSR, proc_pid_personality), |
2810 | INF("limits", S_IRUSR, pid_limits), | 2836 | INF("limits", S_IRUSR, proc_pid_limits), |
2811 | #ifdef CONFIG_SCHED_DEBUG | 2837 | #ifdef CONFIG_SCHED_DEBUG |
2812 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), | 2838 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
2813 | #endif | 2839 | #endif |
2814 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | 2840 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK |
2815 | INF("syscall", S_IRUSR, pid_syscall), | 2841 | INF("syscall", S_IRUSR, proc_pid_syscall), |
2816 | #endif | 2842 | #endif |
2817 | INF("cmdline", S_IRUGO, pid_cmdline), | 2843 | INF("cmdline", S_IRUGO, proc_pid_cmdline), |
2818 | ONE("stat", S_IRUGO, tid_stat), | 2844 | ONE("stat", S_IRUGO, proc_tid_stat), |
2819 | ONE("statm", S_IRUGO, pid_statm), | 2845 | ONE("statm", S_IRUGO, proc_pid_statm), |
2820 | REG("maps", S_IRUGO, maps), | 2846 | REG("maps", S_IRUGO, proc_maps_operations), |
2821 | #ifdef CONFIG_NUMA | 2847 | #ifdef CONFIG_NUMA |
2822 | REG("numa_maps", S_IRUGO, numa_maps), | 2848 | REG("numa_maps", S_IRUGO, proc_numa_maps_operations), |
2823 | #endif | 2849 | #endif |
2824 | REG("mem", S_IRUSR|S_IWUSR, mem), | 2850 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), |
2825 | LNK("cwd", cwd), | 2851 | LNK("cwd", proc_cwd_link), |
2826 | LNK("root", root), | 2852 | LNK("root", proc_root_link), |
2827 | LNK("exe", exe), | 2853 | LNK("exe", proc_exe_link), |
2828 | REG("mounts", S_IRUGO, mounts), | 2854 | REG("mounts", S_IRUGO, proc_mounts_operations), |
2829 | REG("mountinfo", S_IRUGO, mountinfo), | 2855 | REG("mountinfo", S_IRUGO, proc_mountinfo_operations), |
2830 | #ifdef CONFIG_PROC_PAGE_MONITOR | 2856 | #ifdef CONFIG_PROC_PAGE_MONITOR |
2831 | REG("clear_refs", S_IWUSR, clear_refs), | 2857 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
2832 | REG("smaps", S_IRUGO, smaps), | 2858 | REG("smaps", S_IRUGO, proc_smaps_operations), |
2833 | REG("pagemap", S_IRUSR, pagemap), | 2859 | REG("pagemap", S_IRUSR, proc_pagemap_operations), |
2834 | #endif | 2860 | #endif |
2835 | #ifdef CONFIG_SECURITY | 2861 | #ifdef CONFIG_SECURITY |
2836 | DIR("attr", S_IRUGO|S_IXUGO, attr_dir), | 2862 | DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), |
2837 | #endif | 2863 | #endif |
2838 | #ifdef CONFIG_KALLSYMS | 2864 | #ifdef CONFIG_KALLSYMS |
2839 | INF("wchan", S_IRUGO, pid_wchan), | 2865 | INF("wchan", S_IRUGO, proc_pid_wchan), |
2866 | #endif | ||
2867 | #ifdef CONFIG_STACKTRACE | ||
2868 | ONE("stack", S_IRUSR, proc_pid_stack), | ||
2840 | #endif | 2869 | #endif |
2841 | #ifdef CONFIG_SCHEDSTATS | 2870 | #ifdef CONFIG_SCHEDSTATS |
2842 | INF("schedstat", S_IRUGO, pid_schedstat), | 2871 | INF("schedstat", S_IRUGO, proc_pid_schedstat), |
2843 | #endif | 2872 | #endif |
2844 | #ifdef CONFIG_LATENCYTOP | 2873 | #ifdef CONFIG_LATENCYTOP |
2845 | REG("latency", S_IRUGO, lstats), | 2874 | REG("latency", S_IRUGO, proc_lstats_operations), |
2846 | #endif | 2875 | #endif |
2847 | #ifdef CONFIG_PROC_PID_CPUSET | 2876 | #ifdef CONFIG_PROC_PID_CPUSET |
2848 | REG("cpuset", S_IRUGO, cpuset), | 2877 | REG("cpuset", S_IRUGO, proc_cpuset_operations), |
2849 | #endif | 2878 | #endif |
2850 | #ifdef CONFIG_CGROUPS | 2879 | #ifdef CONFIG_CGROUPS |
2851 | REG("cgroup", S_IRUGO, cgroup), | 2880 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
2852 | #endif | 2881 | #endif |
2853 | INF("oom_score", S_IRUGO, oom_score), | 2882 | INF("oom_score", S_IRUGO, proc_oom_score), |
2854 | REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), | 2883 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), |
2855 | #ifdef CONFIG_AUDITSYSCALL | 2884 | #ifdef CONFIG_AUDITSYSCALL |
2856 | REG("loginuid", S_IWUSR|S_IRUGO, loginuid), | 2885 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
2857 | REG("sessionid", S_IRUSR, sessionid), | 2886 | REG("sessionid", S_IRUSR, proc_sessionid_operations), |
2858 | #endif | 2887 | #endif |
2859 | #ifdef CONFIG_FAULT_INJECTION | 2888 | #ifdef CONFIG_FAULT_INJECTION |
2860 | REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), | 2889 | REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), |
2861 | #endif | 2890 | #endif |
2862 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 2891 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
2863 | INF("io", S_IRUGO, tid_io_accounting), | 2892 | INF("io", S_IRUGO, proc_tid_io_accounting), |
2864 | #endif | 2893 | #endif |
2865 | }; | 2894 | }; |
2866 | 2895 | ||
diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 60a359b35582..db7fa5cab988 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c | |||
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/stat.h> | 14 | #include <linux/stat.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/mount.h> | 16 | #include <linux/mount.h> |
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/init.h> | 17 | #include <linux/init.h> |
19 | #include <linux/idr.h> | 18 | #include <linux/idr.h> |
20 | #include <linux/namei.h> | 19 | #include <linux/namei.h> |
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | |||
379 | struct inode *inode = NULL; | 378 | struct inode *inode = NULL; |
380 | int error = -ENOENT; | 379 | int error = -ENOENT; |
381 | 380 | ||
382 | lock_kernel(); | ||
383 | spin_lock(&proc_subdir_lock); | 381 | spin_lock(&proc_subdir_lock); |
384 | for (de = de->subdir; de ; de = de->next) { | 382 | for (de = de->subdir; de ; de = de->next) { |
385 | if (de->namelen != dentry->d_name.len) | 383 | if (de->namelen != dentry->d_name.len) |
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | |||
397 | } | 395 | } |
398 | spin_unlock(&proc_subdir_lock); | 396 | spin_unlock(&proc_subdir_lock); |
399 | out_unlock: | 397 | out_unlock: |
400 | unlock_kernel(); | ||
401 | 398 | ||
402 | if (inode) { | 399 | if (inode) { |
403 | dentry->d_op = &proc_dentry_operations; | 400 | dentry->d_op = &proc_dentry_operations; |
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | |||
432 | struct inode *inode = filp->f_path.dentry->d_inode; | 429 | struct inode *inode = filp->f_path.dentry->d_inode; |
433 | int ret = 0; | 430 | int ret = 0; |
434 | 431 | ||
435 | lock_kernel(); | ||
436 | |||
437 | ino = inode->i_ino; | 432 | ino = inode->i_ino; |
438 | i = filp->f_pos; | 433 | i = filp->f_pos; |
439 | switch (i) { | 434 | switch (i) { |
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | |||
487 | spin_unlock(&proc_subdir_lock); | 482 | spin_unlock(&proc_subdir_lock); |
488 | } | 483 | } |
489 | ret = 1; | 484 | ret = 1; |
490 | out: unlock_kernel(); | 485 | out: |
491 | return ret; | 486 | return ret; |
492 | } | 487 | } |
493 | 488 | ||
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
504 | * the /proc directory. | 499 | * the /proc directory. |
505 | */ | 500 | */ |
506 | static const struct file_operations proc_dir_operations = { | 501 | static const struct file_operations proc_dir_operations = { |
502 | .llseek = generic_file_llseek, | ||
507 | .read = generic_read_dir, | 503 | .read = generic_read_dir, |
508 | .readdir = proc_readdir, | 504 | .readdir = proc_readdir, |
509 | }; | 505 | }; |
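With lock_kernel() gone from proc_lookup_de() and proc_readdir_de(), the directory file_operations also pins an explicit .llseek instead of relying on BKL-era default seek behaviour; the readdir position here is a plain entry index, so generic_file_llseek is sufficient. The resulting shape (example_dir_operations is an illustrative name):

/* pattern used above: a readdir-only directory with explicit llseek */
static const struct file_operations example_dir_operations = {
	.llseek		= generic_file_llseek,	/* f_pos is a simple index */
	.read		= generic_read_dir,	/* read(2) on a dir: -EISDIR */
	.readdir	= proc_readdir,
};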
diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2543fd00c658..3e76bb9b3ad6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c | |||
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) | |||
35 | */ | 35 | */ |
36 | void de_put(struct proc_dir_entry *de) | 36 | void de_put(struct proc_dir_entry *de) |
37 | { | 37 | { |
38 | lock_kernel(); | ||
39 | if (!atomic_read(&de->count)) { | 38 | if (!atomic_read(&de->count)) { |
40 | printk("de_put: entry %s already free!\n", de->name); | 39 | printk("de_put: entry %s already free!\n", de->name); |
41 | unlock_kernel(); | ||
42 | return; | 40 | return; |
43 | } | 41 | } |
44 | 42 | ||
45 | if (atomic_dec_and_test(&de->count)) | 43 | if (atomic_dec_and_test(&de->count)) |
46 | free_proc_entry(de); | 44 | free_proc_entry(de); |
47 | unlock_kernel(); | ||
48 | } | 45 | } |
49 | 46 | ||
50 | /* | 47 | /* |
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e8aeb8b61ce..cd53ff838498 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
@@ -41,8 +41,6 @@ do { \ | |||
41 | (vmi)->used = 0; \ | 41 | (vmi)->used = 0; \ |
42 | (vmi)->largest_chunk = 0; \ | 42 | (vmi)->largest_chunk = 0; \ |
43 | } while(0) | 43 | } while(0) |
44 | |||
45 | extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); | ||
46 | #endif | 44 | #endif |
47 | 45 | ||
48 | extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, | 46 | extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b1675c4e66da..43d23948384a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
74 | "LowTotal: %8lu kB\n" | 74 | "LowTotal: %8lu kB\n" |
75 | "LowFree: %8lu kB\n" | 75 | "LowFree: %8lu kB\n" |
76 | #endif | 76 | #endif |
77 | #ifndef CONFIG_MMU | ||
78 | "MmapCopy: %8lu kB\n" | ||
79 | #endif | ||
77 | "SwapTotal: %8lu kB\n" | 80 | "SwapTotal: %8lu kB\n" |
78 | "SwapFree: %8lu kB\n" | 81 | "SwapFree: %8lu kB\n" |
79 | "Dirty: %8lu kB\n" | 82 | "Dirty: %8lu kB\n" |
@@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
116 | K(i.totalram-i.totalhigh), | 119 | K(i.totalram-i.totalhigh), |
117 | K(i.freeram-i.freehigh), | 120 | K(i.freeram-i.freehigh), |
118 | #endif | 121 | #endif |
122 | #ifndef CONFIG_MMU | ||
123 | K((unsigned long) atomic_read(&mmap_pages_allocated)), | ||
124 | #endif | ||
119 | K(i.totalswap), | 125 | K(i.totalswap), |
120 | K(i.freeswap), | 126 | K(i.freeswap), |
121 | K(global_page_state(NR_FILE_DIRTY)), | 127 | K(global_page_state(NR_FILE_DIRTY)), |
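On !CONFIG_MMU kernels, /proc/meminfo gains a MmapCopy line between LowFree and SwapTotal, reporting mmap_pages_allocated (pages privately copied for mappings) converted to kB by the K() macro. A hypothetical excerpt of the resulting output (values invented, alignment approximate):

LowFree:         1024 kB
MmapCopy:         208 kB
SwapTotal:          0 kB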
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 3f87d2632947..b446d7ad0b0d 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c | |||
@@ -33,33 +33,33 @@ | |||
33 | #include "internal.h" | 33 | #include "internal.h" |
34 | 34 | ||
35 | /* | 35 | /* |
36 | * display a single VMA to a sequenced file | 36 | * display a single region to a sequenced file |
37 | */ | 37 | */ |
38 | int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | 38 | static int nommu_region_show(struct seq_file *m, struct vm_region *region) |
39 | { | 39 | { |
40 | unsigned long ino = 0; | 40 | unsigned long ino = 0; |
41 | struct file *file; | 41 | struct file *file; |
42 | dev_t dev = 0; | 42 | dev_t dev = 0; |
43 | int flags, len; | 43 | int flags, len; |
44 | 44 | ||
45 | flags = vma->vm_flags; | 45 | flags = region->vm_flags; |
46 | file = vma->vm_file; | 46 | file = region->vm_file; |
47 | 47 | ||
48 | if (file) { | 48 | if (file) { |
49 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 49 | struct inode *inode = region->vm_file->f_path.dentry->d_inode; |
50 | dev = inode->i_sb->s_dev; | 50 | dev = inode->i_sb->s_dev; |
51 | ino = inode->i_ino; | 51 | ino = inode->i_ino; |
52 | } | 52 | } |
53 | 53 | ||
54 | seq_printf(m, | 54 | seq_printf(m, |
55 | "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", | 55 | "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", |
56 | vma->vm_start, | 56 | region->vm_start, |
57 | vma->vm_end, | 57 | region->vm_end, |
58 | flags & VM_READ ? 'r' : '-', | 58 | flags & VM_READ ? 'r' : '-', |
59 | flags & VM_WRITE ? 'w' : '-', | 59 | flags & VM_WRITE ? 'w' : '-', |
60 | flags & VM_EXEC ? 'x' : '-', | 60 | flags & VM_EXEC ? 'x' : '-', |
61 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', | 61 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', |
62 | ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, | 62 | ((loff_t)region->vm_pgoff) << PAGE_SHIFT, |
63 | MAJOR(dev), MINOR(dev), ino, &len); | 63 | MAJOR(dev), MINOR(dev), ino, &len); |
64 | 64 | ||
65 | if (file) { | 65 | if (file) { |
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | |||
75 | } | 75 | } |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * display a list of all the VMAs the kernel knows about | 78 | * display a list of all the REGIONs the kernel knows about |
79 | * - nommu kernels have a single flat list | 79 | * - nommu kernels have a single flat list |
80 | */ | 80 | */ |
81 | static int nommu_vma_list_show(struct seq_file *m, void *v) | 81 | static int nommu_region_list_show(struct seq_file *m, void *_p) |
82 | { | 82 | { |
83 | struct vm_area_struct *vma; | 83 | struct rb_node *p = _p; |
84 | 84 | ||
85 | vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); | 85 | return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); |
86 | return nommu_vma_show(m, vma); | ||
87 | } | 86 | } |
88 | 87 | ||
89 | static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) | 88 | static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) |
90 | { | 89 | { |
91 | struct rb_node *_rb; | 90 | struct rb_node *p; |
92 | loff_t pos = *_pos; | 91 | loff_t pos = *_pos; |
93 | void *next = NULL; | ||
94 | 92 | ||
95 | down_read(&nommu_vma_sem); | 93 | down_read(&nommu_region_sem); |
96 | 94 | ||
97 | for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { | 95 | for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) |
98 | if (pos == 0) { | 96 | if (pos-- == 0) |
99 | next = _rb; | 97 | return p; |
100 | break; | 98 | return NULL; |
101 | } | ||
102 | pos--; | ||
103 | } | ||
104 | |||
105 | return next; | ||
106 | } | 99 | } |
107 | 100 | ||
108 | static void nommu_vma_list_stop(struct seq_file *m, void *v) | 101 | static void nommu_region_list_stop(struct seq_file *m, void *v) |
109 | { | 102 | { |
110 | up_read(&nommu_vma_sem); | 103 | up_read(&nommu_region_sem); |
111 | } | 104 | } |
112 | 105 | ||
113 | static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) | 106 | static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) |
114 | { | 107 | { |
115 | (*pos)++; | 108 | (*pos)++; |
116 | return rb_next((struct rb_node *) v); | 109 | return rb_next((struct rb_node *) v); |
117 | } | 110 | } |
118 | 111 | ||
119 | static const struct seq_operations proc_nommu_vma_list_seqop = { | 112 | static struct seq_operations proc_nommu_region_list_seqop = { |
120 | .start = nommu_vma_list_start, | 113 | .start = nommu_region_list_start, |
121 | .next = nommu_vma_list_next, | 114 | .next = nommu_region_list_next, |
122 | .stop = nommu_vma_list_stop, | 115 | .stop = nommu_region_list_stop, |
123 | .show = nommu_vma_list_show | 116 | .show = nommu_region_list_show |
124 | }; | 117 | }; |
125 | 118 | ||
126 | static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) | 119 | static int proc_nommu_region_list_open(struct inode *inode, struct file *file) |
127 | { | 120 | { |
128 | return seq_open(file, &proc_nommu_vma_list_seqop); | 121 | return seq_open(file, &proc_nommu_region_list_seqop); |
129 | } | 122 | } |
130 | 123 | ||
131 | static const struct file_operations proc_nommu_vma_list_operations = { | 124 | static const struct file_operations proc_nommu_region_list_operations = { |
132 | .open = proc_nommu_vma_list_open, | 125 | .open = proc_nommu_region_list_open, |
133 | .read = seq_read, | 126 | .read = seq_read, |
134 | .llseek = seq_lseek, | 127 | .llseek = seq_lseek, |
135 | .release = seq_release, | 128 | .release = seq_release, |
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { | |||
137 | 130 | ||
138 | static int __init proc_nommu_init(void) | 131 | static int __init proc_nommu_init(void) |
139 | { | 132 | { |
140 | proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); | 133 | proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); |
141 | return 0; | 134 | return 0; |
142 | } | 135 | } |
143 | 136 | ||
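The rewritten iterator walks nommu_region_tree directly: start() takes nommu_region_sem and advances rb_first()/rb_next() to the Nth node, next() steps with rb_next(), stop() drops the semaphore, and show() converts the cursor back to the containing structure. That conversion is plain container_of() arithmetic:

/* rb_entry() is container_of(): recover the structure that embeds
 * the rb_node (field vm_rb of struct vm_region above) */
struct vm_region *region = rb_entry(p, struct vm_region, vm_rb);
/* i.e. (struct vm_region *)((char *)p - offsetof(struct vm_region, vm_rb)) */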
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 7bc296f424ae..04d1270f1c38 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> |
21 | #include <linux/smp_lock.h> | ||
22 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
23 | #include <linux/nsproxy.h> | 22 | #include <linux/nsproxy.h> |
24 | #include <net/net_namespace.h> | 23 | #include <net/net_namespace.h> |
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, | |||
172 | } | 171 | } |
173 | 172 | ||
174 | const struct file_operations proc_net_operations = { | 173 | const struct file_operations proc_net_operations = { |
174 | .llseek = generic_file_llseek, | ||
175 | .read = generic_read_dir, | 175 | .read = generic_read_dir, |
176 | .readdir = proc_tgid_net_readdir, | 176 | .readdir = proc_tgid_net_readdir, |
177 | }; | 177 | }; |
diff --git a/fs/proc/root.c b/fs/proc/root.c index 7761602af9de..f6299a25594e 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/bitops.h> | 18 | #include <linux/bitops.h> |
19 | #include <linux/smp_lock.h> | ||
20 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
21 | #include <linux/pid_namespace.h> | 20 | #include <linux/pid_namespace.h> |
22 | 21 | ||
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp, | |||
162 | unsigned int nr = filp->f_pos; | 161 | unsigned int nr = filp->f_pos; |
163 | int ret; | 162 | int ret; |
164 | 163 | ||
165 | lock_kernel(); | ||
166 | |||
167 | if (nr < FIRST_PROCESS_ENTRY) { | 164 | if (nr < FIRST_PROCESS_ENTRY) { |
168 | int error = proc_readdir(filp, dirent, filldir); | 165 | int error = proc_readdir(filp, dirent, filldir); |
169 | if (error <= 0) { | 166 | if (error <= 0) |
170 | unlock_kernel(); | ||
171 | return error; | 167 | return error; |
172 | } | ||
173 | filp->f_pos = FIRST_PROCESS_ENTRY; | 168 | filp->f_pos = FIRST_PROCESS_ENTRY; |
174 | } | 169 | } |
175 | unlock_kernel(); | ||
176 | 170 | ||
177 | ret = proc_pid_readdir(filp, dirent, filldir); | 171 | ret = proc_pid_readdir(filp, dirent, filldir); |
178 | return ret; | 172 | return ret; |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3a8bdd7f5756..94063840832a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v) | |||
396 | "Private_Clean: %8lu kB\n" | 396 | "Private_Clean: %8lu kB\n" |
397 | "Private_Dirty: %8lu kB\n" | 397 | "Private_Dirty: %8lu kB\n" |
398 | "Referenced: %8lu kB\n" | 398 | "Referenced: %8lu kB\n" |
399 | "Swap: %8lu kB\n", | 399 | "Swap: %8lu kB\n" |
400 | "KernelPageSize: %8lu kB\n" | ||
401 | "MMUPageSize: %8lu kB\n", | ||
400 | (vma->vm_end - vma->vm_start) >> 10, | 402 | (vma->vm_end - vma->vm_start) >> 10, |
401 | mss.resident >> 10, | 403 | mss.resident >> 10, |
402 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), | 404 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), |
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v) | |||
405 | mss.private_clean >> 10, | 407 | mss.private_clean >> 10, |
406 | mss.private_dirty >> 10, | 408 | mss.private_dirty >> 10, |
407 | mss.referenced >> 10, | 409 | mss.referenced >> 10, |
408 | mss.swap >> 10); | 410 | mss.swap >> 10, |
411 | vma_kernel_pagesize(vma) >> 10, | ||
412 | vma_mmu_pagesize(vma) >> 10); | ||
409 | 413 | ||
410 | if (m->count < m->size) /* vma is copied successfully */ | 414 | if (m->count < m->size) /* vma is copied successfully */ |
411 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | 415 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; |
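Each /proc/PID/smaps entry now ends with two extra lines: KernelPageSize (the page size the kernel uses to back the mapping, e.g. a huge page size) and MMUPageSize (the size the MMU is actually programmed with); on most architectures the two match. A hypothetical tail of one entry (values invented, alignment approximate):

Referenced:       488 kB
Swap:               0 kB
KernelPageSize:     4 kB
MMUPageSize:        4 kB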
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 219bd79ea894..343ea1216bc8 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
@@ -9,31 +9,38 @@ | |||
9 | 9 | ||
10 | /* | 10 | /* |
11 | * Logic: we've got two memory sums for each process, "shared", and | 11 | * Logic: we've got two memory sums for each process, "shared", and |
12 | * "non-shared". Shared memory may get counted more then once, for | 12 | * "non-shared". Shared memory may get counted more than once, for |
13 | * each process that owns it. Non-shared memory is counted | 13 | * each process that owns it. Non-shared memory is counted |
14 | * accurately. | 14 | * accurately. |
15 | */ | 15 | */ |
16 | void task_mem(struct seq_file *m, struct mm_struct *mm) | 16 | void task_mem(struct seq_file *m, struct mm_struct *mm) |
17 | { | 17 | { |
18 | struct vm_list_struct *vml; | 18 | struct vm_area_struct *vma; |
19 | unsigned long bytes = 0, sbytes = 0, slack = 0; | 19 | struct vm_region *region; |
20 | struct rb_node *p; | ||
21 | unsigned long bytes = 0, sbytes = 0, slack = 0, size; | ||
20 | 22 | ||
21 | down_read(&mm->mmap_sem); | 23 | down_read(&mm->mmap_sem); |
22 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | 24 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { |
23 | if (!vml->vma) | 25 | vma = rb_entry(p, struct vm_area_struct, vm_rb); |
24 | continue; | 26 | |
27 | bytes += kobjsize(vma); | ||
28 | |||
29 | region = vma->vm_region; | ||
30 | if (region) { | ||
31 | size = kobjsize(region); | ||
32 | size += region->vm_end - region->vm_start; | ||
33 | } else { | ||
34 | size = vma->vm_end - vma->vm_start; | ||
35 | } | ||
25 | 36 | ||
26 | bytes += kobjsize(vml); | ||
27 | if (atomic_read(&mm->mm_count) > 1 || | 37 | if (atomic_read(&mm->mm_count) > 1 || |
28 | atomic_read(&vml->vma->vm_usage) > 1 | 38 | vma->vm_flags & VM_MAYSHARE) { |
29 | ) { | 39 | sbytes += size; |
30 | sbytes += kobjsize((void *) vml->vma->vm_start); | ||
31 | sbytes += kobjsize(vml->vma); | ||
32 | } else { | 40 | } else { |
33 | bytes += kobjsize((void *) vml->vma->vm_start); | 41 | bytes += size; |
34 | bytes += kobjsize(vml->vma); | 42 | if (region) |
35 | slack += kobjsize((void *) vml->vma->vm_start) - | 43 | slack = region->vm_end - vma->vm_end; |
36 | (vml->vma->vm_end - vml->vma->vm_start); | ||
37 | } | 44 | } |
38 | } | 45 | } |
39 | 46 | ||
@@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
70 | 77 | ||
71 | unsigned long task_vsize(struct mm_struct *mm) | 78 | unsigned long task_vsize(struct mm_struct *mm) |
72 | { | 79 | { |
73 | struct vm_list_struct *tbp; | 80 | struct vm_area_struct *vma; |
81 | struct rb_node *p; | ||
74 | unsigned long vsize = 0; | 82 | unsigned long vsize = 0; |
75 | 83 | ||
76 | down_read(&mm->mmap_sem); | 84 | down_read(&mm->mmap_sem); |
77 | for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { | 85 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { |
78 | if (tbp->vma) | 86 | vma = rb_entry(p, struct vm_area_struct, vm_rb); |
79 | vsize += kobjsize((void *) tbp->vma->vm_start); | 87 | vsize += vma->vm_end - vma->vm_start; |
80 | } | 88 | } |
81 | up_read(&mm->mmap_sem); | 89 | up_read(&mm->mmap_sem); |
82 | return vsize; | 90 | return vsize; |
@@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm) | |||
85 | int task_statm(struct mm_struct *mm, int *shared, int *text, | 93 | int task_statm(struct mm_struct *mm, int *shared, int *text, |
86 | int *data, int *resident) | 94 | int *data, int *resident) |
87 | { | 95 | { |
88 | struct vm_list_struct *tbp; | 96 | struct vm_area_struct *vma; |
97 | struct vm_region *region; | ||
98 | struct rb_node *p; | ||
89 | int size = kobjsize(mm); | 99 | int size = kobjsize(mm); |
90 | 100 | ||
91 | down_read(&mm->mmap_sem); | 101 | down_read(&mm->mmap_sem); |
92 | for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { | 102 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { |
93 | size += kobjsize(tbp); | 103 | vma = rb_entry(p, struct vm_area_struct, vm_rb); |
94 | if (tbp->vma) { | 104 | size += kobjsize(vma); |
95 | size += kobjsize(tbp->vma); | 105 | region = vma->vm_region; |
96 | size += kobjsize((void *) tbp->vma->vm_start); | 106 | if (region) { |
107 | size += kobjsize(region); | ||
108 | size += region->vm_end - region->vm_start; | ||
97 | } | 109 | } |
98 | } | 110 | } |
99 | 111 | ||
@@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, | |||
105 | } | 117 | } |
106 | 118 | ||
107 | /* | 119 | /* |
120 | * display a single VMA to a sequenced file | ||
121 | */ | ||
122 | static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | ||
123 | { | ||
124 | unsigned long ino = 0; | ||
125 | struct file *file; | ||
126 | dev_t dev = 0; | ||
127 | int flags, len; | ||
128 | |||
129 | flags = vma->vm_flags; | ||
130 | file = vma->vm_file; | ||
131 | |||
132 | if (file) { | ||
133 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | ||
134 | dev = inode->i_sb->s_dev; | ||
135 | ino = inode->i_ino; | ||
136 | } | ||
137 | |||
138 | seq_printf(m, | ||
139 | "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", | ||
140 | vma->vm_start, | ||
141 | vma->vm_end, | ||
142 | flags & VM_READ ? 'r' : '-', | ||
143 | flags & VM_WRITE ? 'w' : '-', | ||
144 | flags & VM_EXEC ? 'x' : '-', | ||
145 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', | ||
146 | vma->vm_pgoff << PAGE_SHIFT, | ||
147 | MAJOR(dev), MINOR(dev), ino, &len); | ||
148 | |||
149 | if (file) { | ||
150 | len = 25 + sizeof(void *) * 6 - len; | ||
151 | if (len < 1) | ||
152 | len = 1; | ||
153 | seq_printf(m, "%*c", len, ' '); | ||
154 | seq_path(m, &file->f_path, ""); | ||
155 | } | ||
156 | |||
157 | seq_putc(m, '\n'); | ||
158 | return 0; | ||
159 | } | ||
160 | |||
161 | /* | ||
108 | * display mapping lines for a particular process's /proc/pid/maps | 162 | * display mapping lines for a particular process's /proc/pid/maps |
109 | */ | 163 | */ |
110 | static int show_map(struct seq_file *m, void *_vml) | 164 | static int show_map(struct seq_file *m, void *_p) |
111 | { | 165 | { |
112 | struct vm_list_struct *vml = _vml; | 166 | struct rb_node *p = _p; |
113 | 167 | ||
114 | return nommu_vma_show(m, vml->vma); | 168 | return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); |
115 | } | 169 | } |
116 | 170 | ||
117 | static void *m_start(struct seq_file *m, loff_t *pos) | 171 | static void *m_start(struct seq_file *m, loff_t *pos) |
118 | { | 172 | { |
119 | struct proc_maps_private *priv = m->private; | 173 | struct proc_maps_private *priv = m->private; |
120 | struct vm_list_struct *vml; | ||
121 | struct mm_struct *mm; | 174 | struct mm_struct *mm; |
175 | struct rb_node *p; | ||
122 | loff_t n = *pos; | 176 | loff_t n = *pos; |
123 | 177 | ||
124 | /* pin the task and mm whilst we play with them */ | 178 | /* pin the task and mm whilst we play with them */ |
@@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos) | |||
134 | } | 188 | } |
135 | 189 | ||
136 | /* start from the Nth VMA */ | 190 | /* start from the Nth VMA */ |
137 | for (vml = mm->context.vmlist; vml; vml = vml->next) | 191 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) |
138 | if (n-- == 0) | 192 | if (n-- == 0) |
139 | return vml; | 193 | return p; |
140 | return NULL; | 194 | return NULL; |
141 | } | 195 | } |
142 | 196 | ||
@@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml) | |||
152 | } | 206 | } |
153 | } | 207 | } |
154 | 208 | ||
155 | static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) | 209 | static void *m_next(struct seq_file *m, void *_p, loff_t *pos) |
156 | { | 210 | { |
157 | struct vm_list_struct *vml = _vml; | 211 | struct rb_node *p = _p; |
158 | 212 | ||
159 | (*pos)++; | 213 | (*pos)++; |
160 | return vml ? vml->next : NULL; | 214 | return p ? rb_next(p) : NULL; |
161 | } | 215 | } |
162 | 216 | ||
163 | static const struct seq_operations proc_pid_maps_ops = { | 217 | static const struct seq_operations proc_pid_maps_ops = { |
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 03ec59504906..5edcc3f92ba7 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c | |||
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count, | |||
47 | 47 | ||
48 | offset = (unsigned long)(*ppos % PAGE_SIZE); | 48 | offset = (unsigned long)(*ppos % PAGE_SIZE); |
49 | pfn = (unsigned long)(*ppos / PAGE_SIZE); | 49 | pfn = (unsigned long)(*ppos / PAGE_SIZE); |
50 | if (pfn > saved_max_pfn) | ||
51 | return -EINVAL; | ||
52 | 50 | ||
53 | do { | 51 | do { |
54 | if (count > (PAGE_SIZE - offset)) | 52 | if (count > (PAGE_SIZE - offset)) |
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 76acdbc34611..b9b567a28376 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c | |||
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, | |||
262 | ret = -ENOMEM; | 262 | ret = -ENOMEM; |
263 | pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); | 263 | pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); |
264 | if (!pages) | 264 | if (!pages) |
265 | goto out; | 265 | goto out_free; |
266 | 266 | ||
267 | nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); | 267 | nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); |
268 | if (nr != lpages) | 268 | if (nr != lpages) |
269 | goto out; /* leave if some pages were missing */ | 269 | goto out_free_pages; /* leave if some pages were missing */ |
270 | 270 | ||
271 | /* check the pages for physical adjacency */ | 271 | /* check the pages for physical adjacency */ |
272 | ptr = pages; | 272 | ptr = pages; |
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, | |||
274 | page++; | 274 | page++; |
275 | for (loop = lpages; loop > 1; loop--) | 275 | for (loop = lpages; loop > 1; loop--) |
276 | if (*ptr++ != page++) | 276 | if (*ptr++ != page++) |
277 | goto out; | 277 | goto out_free_pages; |
278 | 278 | ||
279 | /* okay - all conditions fulfilled */ | 279 | /* okay - all conditions fulfilled */ |
280 | ret = (unsigned long) page_address(pages[0]); | 280 | ret = (unsigned long) page_address(pages[0]); |
281 | 281 | ||
282 | out: | 282 | out_free_pages: |
283 | if (pages) { | 283 | ptr = pages; |
284 | ptr = pages; | 284 | for (loop = nr; loop > 0; loop--) |
285 | for (loop = lpages; loop > 0; loop--) | 285 | put_page(*ptr++); |
286 | put_page(*ptr++); | 286 | out_free: |
287 | kfree(pages); | 287 | kfree(pages); |
288 | } | 288 | out: |
289 | |||
290 | return ret; | 289 | return ret; |
291 | } | 290 | } |
292 | 291 | ||
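The reworked error handling fixes two problems with the old single "out:" label: it dropped lpages page references even when find_get_pages() returned fewer (nr), and it tested pages for NULL on a path where the allocation had already failed. The new ladder releases exactly nr references and, as in the hunk, even the success path falls through it, because the caller keeps only the page address rather than the references. The shape, as a self-contained sketch (example() and its allocations are illustrative):

/* unwind ladder: each failure jumps to a label that releases only
 * what has been acquired so far, in reverse order of acquisition */
static int example(void)
{
	int ret = -ENOMEM;
	char *a, *b;

	a = kmalloc(64, GFP_KERNEL);
	if (!a)
		goto out;
	b = kmalloc(64, GFP_KERNEL);
	if (!b)
		goto out_free_a;

	ret = 0;		/* success also falls through the unwind */
	kfree(b);
out_free_a:
	kfree(a);
out:
	return ret;
}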
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c55651f1407c..f3c820b75829 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c | |||
@@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s) | |||
83 | reiserfs_sync_fs(s, 1); | 83 | reiserfs_sync_fs(s, 1); |
84 | } | 84 | } |
85 | 85 | ||
86 | static void reiserfs_write_super_lockfs(struct super_block *s) | 86 | static int reiserfs_freeze(struct super_block *s) |
87 | { | 87 | { |
88 | struct reiserfs_transaction_handle th; | 88 | struct reiserfs_transaction_handle th; |
89 | reiserfs_write_lock(s); | 89 | reiserfs_write_lock(s); |
@@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s) | |||
101 | } | 101 | } |
102 | s->s_dirt = 0; | 102 | s->s_dirt = 0; |
103 | reiserfs_write_unlock(s); | 103 | reiserfs_write_unlock(s); |
104 | return 0; | ||
104 | } | 105 | } |
105 | 106 | ||
106 | static void reiserfs_unlockfs(struct super_block *s) | 107 | static int reiserfs_unfreeze(struct super_block *s) |
107 | { | 108 | { |
108 | reiserfs_allow_writes(s); | 109 | reiserfs_allow_writes(s); |
110 | return 0; | ||
109 | } | 111 | } |
110 | 112 | ||
111 | extern const struct in_core_key MAX_IN_CORE_KEY; | 113 | extern const struct in_core_key MAX_IN_CORE_KEY; |
@@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = { | |||
613 | .put_super = reiserfs_put_super, | 615 | .put_super = reiserfs_put_super, |
614 | .write_super = reiserfs_write_super, | 616 | .write_super = reiserfs_write_super, |
615 | .sync_fs = reiserfs_sync_fs, | 617 | .sync_fs = reiserfs_sync_fs, |
616 | .write_super_lockfs = reiserfs_write_super_lockfs, | 618 | .freeze_fs = reiserfs_freeze, |
617 | .unlockfs = reiserfs_unlockfs, | 619 | .unfreeze_fs = reiserfs_unfreeze, |
618 | .statfs = reiserfs_statfs, | 620 | .statfs = reiserfs_statfs, |
619 | .remount_fs = reiserfs_remount, | 621 | .remount_fs = reiserfs_remount, |
620 | .show_options = generic_show_options, | 622 | .show_options = generic_show_options, |
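This is part of the tree-wide rename of ->write_super_lockfs/->unlockfs to ->freeze_fs/->unfreeze_fs; the methods now return int so a failed freeze can be reported instead of swallowed. From user space the operations are reached through the FIFREEZE/FITHAW ioctls introduced in the same release cycle; a minimal sketch, assuming those ioctls and using /mnt as a placeholder mount point:

/* freeze a filesystem, snapshot it externally, then thaw it */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	int fd = open("/mnt", O_RDONLY);	/* the mount point */
	if (fd < 0)
		return 1;
	if (ioctl(fd, FIFREEZE, 0) < 0)		/* ends up in ->freeze_fs */
		perror("FIFREEZE");
	/* ... take a block-device snapshot here ... */
	if (ioctl(fd, FITHAW, 0) < 0)		/* ends up in ->unfreeze_fs */
		perror("FITHAW");
	close(fd);
	return 0;
}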
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index c97d4c931715..98a232f7196b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c | |||
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] = | |||
490 | static struct inode * | 490 | static struct inode * |
491 | romfs_iget(struct super_block *sb, unsigned long ino) | 491 | romfs_iget(struct super_block *sb, unsigned long ino) |
492 | { | 492 | { |
493 | int nextfh; | 493 | int nextfh, ret; |
494 | struct romfs_inode ri; | 494 | struct romfs_inode ri; |
495 | struct inode *i; | 495 | struct inode *i; |
496 | 496 | ||
@@ -526,11 +526,11 @@ romfs_iget(struct super_block *sb, unsigned long ino) | |||
526 | i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; | 526 | i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; |
527 | 527 | ||
528 | /* Precalculate the data offset */ | 528 | /* Precalculate the data offset */ |
529 | ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN); | 529 | ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); |
530 | if (ino >= 0) | 530 | if (ret >= 0) |
531 | ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK); | 531 | ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; |
532 | else | 532 | else |
533 | ino = 0; | 533 | ino = 0; |
534 | 534 | ||
535 | ROMFS_I(i)->i_metasize = ino; | 535 | ROMFS_I(i)->i_metasize = ino; |
536 | ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); | 536 | ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); |
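The romfs change fixes a signedness bug: ino is an unsigned long, so after the old "ino = romfs_strnlen(...)" a negative error return wrapped to a huge value and the "ino >= 0" test was always true, silently turning errors into bogus offsets. Storing the result in a signed int first makes the error check meaningful. The core of the bug in isolation:

/* comparisons happen in the unsigned type, so -1 becomes ULONG_MAX */
unsigned long ino = (unsigned long)-1;	/* an error return, wrapped */
if (ino >= 0)		/* always true; compilers warn about this */
	;		/* old code took this branch even on error */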
diff --git a/fs/select.c b/fs/select.c index 87df51eadcf2..08b91beed806 100644 --- a/fs/select.c +++ b/fs/select.c | |||
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, | |||
109 | void poll_initwait(struct poll_wqueues *pwq) | 109 | void poll_initwait(struct poll_wqueues *pwq) |
110 | { | 110 | { |
111 | init_poll_funcptr(&pwq->pt, __pollwait); | 111 | init_poll_funcptr(&pwq->pt, __pollwait); |
112 | pwq->polling_task = current; | ||
112 | pwq->error = 0; | 113 | pwq->error = 0; |
113 | pwq->table = NULL; | 114 | pwq->table = NULL; |
114 | pwq->inline_index = 0; | 115 | pwq->inline_index = 0; |
115 | } | 116 | } |
116 | |||
117 | EXPORT_SYMBOL(poll_initwait); | 117 | EXPORT_SYMBOL(poll_initwait); |
118 | 118 | ||
119 | static void free_poll_entry(struct poll_table_entry *entry) | 119 | static void free_poll_entry(struct poll_table_entry *entry) |
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq) | |||
142 | free_page((unsigned long) old); | 142 | free_page((unsigned long) old); |
143 | } | 143 | } |
144 | } | 144 | } |
145 | |||
146 | EXPORT_SYMBOL(poll_freewait); | 145 | EXPORT_SYMBOL(poll_freewait); |
147 | 146 | ||
148 | static struct poll_table_entry *poll_get_entry(poll_table *_p) | 147 | static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) |
149 | { | 148 | { |
150 | struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt); | ||
151 | struct poll_table_page *table = p->table; | 149 | struct poll_table_page *table = p->table; |
152 | 150 | ||
153 | if (p->inline_index < N_INLINE_POLL_ENTRIES) | 151 | if (p->inline_index < N_INLINE_POLL_ENTRIES) |
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) | |||
159 | new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); | 157 | new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); |
160 | if (!new_table) { | 158 | if (!new_table) { |
161 | p->error = -ENOMEM; | 159 | p->error = -ENOMEM; |
162 | __set_current_state(TASK_RUNNING); | ||
163 | return NULL; | 160 | return NULL; |
164 | } | 161 | } |
165 | new_table->entry = new_table->entries; | 162 | new_table->entry = new_table->entries; |
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) | |||
171 | return table->entry++; | 168 | return table->entry++; |
172 | } | 169 | } |
173 | 170 | ||
171 | static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
172 | { | ||
173 | struct poll_wqueues *pwq = wait->private; | ||
174 | DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); | ||
175 | |||
176 | /* | ||
177 | * Although this function is called under waitqueue lock, LOCK | ||
178 | * doesn't imply write barrier and the users expect write | ||
179 | * barrier semantics on wakeup functions. The following | ||
180 | * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() | ||
181 | * and is paired with set_mb() in poll_schedule_timeout. | ||
182 | */ | ||
183 | smp_wmb(); | ||
184 | pwq->triggered = 1; | ||
185 | |||
186 | /* | ||
187 | * Perform the default wake up operation using a dummy | ||
188 | * waitqueue. | ||
189 | * | ||
190 | * TODO: This is hacky but there currently is no interface to | ||
191 | * pass in @sync. @sync is scheduled to be removed and once | ||
192 | * that happens, wake_up_process() can be used directly. | ||
193 | */ | ||
194 | return default_wake_function(&dummy_wait, mode, sync, key); | ||
195 | } | ||
196 | |||
174 | /* Add a new entry */ | 197 | /* Add a new entry */ |
175 | static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, | 198 | static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, |
176 | poll_table *p) | 199 | poll_table *p) |
177 | { | 200 | { |
178 | struct poll_table_entry *entry = poll_get_entry(p); | 201 | struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); |
202 | struct poll_table_entry *entry = poll_get_entry(pwq); | ||
179 | if (!entry) | 203 | if (!entry) |
180 | return; | 204 | return; |
181 | get_file(filp); | 205 | get_file(filp); |
182 | entry->filp = filp; | 206 | entry->filp = filp; |
183 | entry->wait_address = wait_address; | 207 | entry->wait_address = wait_address; |
184 | init_waitqueue_entry(&entry->wait, current); | 208 | init_waitqueue_func_entry(&entry->wait, pollwake); |
209 | entry->wait.private = pwq; | ||
185 | add_wait_queue(wait_address, &entry->wait); | 210 | add_wait_queue(wait_address, &entry->wait); |
186 | } | 211 | } |
187 | 212 | ||
213 | int poll_schedule_timeout(struct poll_wqueues *pwq, int state, | ||
214 | ktime_t *expires, unsigned long slack) | ||
215 | { | ||
216 | int rc = -EINTR; | ||
217 | |||
218 | set_current_state(state); | ||
219 | if (!pwq->triggered) | ||
220 | rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); | ||
221 | __set_current_state(TASK_RUNNING); | ||
222 | |||
223 | /* | ||
224 | * Prepare for the next iteration. | ||
225 | * | ||
226 | * The following set_mb() serves two purposes. First, it's | ||
227 | * the counterpart rmb of the wmb in pollwake() such that data | ||
228 | * written before wake up is always visible after wake up. | ||
229 | * Second, the full barrier guarantees that triggered clearing | ||
230 | * doesn't pass event check of the next iteration. Note that | ||
231 | * this problem doesn't exist for the first iteration as | ||
232 | * add_wait_queue() has full barrier semantics. | ||
233 | */ | ||
234 | set_mb(pwq->triggered, 0); | ||
235 | |||
236 | return rc; | ||
237 | } | ||
238 | EXPORT_SYMBOL(poll_schedule_timeout); | ||
239 | |||
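The new wakeup scheme replaces the per-iteration set_current_state(TASK_INTERRUPTIBLE) dance: pollwake() publishes pwq->triggered (behind smp_wmb()) and wakes pwq->polling_task, while poll_schedule_timeout() sleeps only if triggered is still clear and then resets it with set_mb() for the next pass, closing the wakeup-versus-event-check race without keeping the task state set across the whole fd scan. The callers below reduce to this loop shape (a sketch; scan_fds() is a stand-in for the per-fd polling done in do_select()/do_poll()):

/* condensed shape of the reworked wait loop */
static int wait_loop(struct poll_wqueues *table, ktime_t *to,
		     unsigned long slack)
{
	int count = 0, timed_out = 0;

	for (;;) {
		count = scan_fds(table);	/* registers waiters via __pollwait() */
		if (count || timed_out)
			break;
		if (!poll_schedule_timeout(table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;		/* timer expired, no pollwake() */
	}
	return count;
}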
188 | /** | 240 | /** |
189 | * poll_select_set_timeout - helper function to setup the timeout value | 241 | * poll_select_set_timeout - helper function to setup the timeout value |
190 | * @to: pointer to timespec variable for the final timeout | 242 | * @to: pointer to timespec variable for the final timeout |
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) | |||
340 | for (;;) { | 392 | for (;;) { |
341 | unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; | 393 | unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; |
342 | 394 | ||
343 | set_current_state(TASK_INTERRUPTIBLE); | ||
344 | |||
345 | inp = fds->in; outp = fds->out; exp = fds->ex; | 395 | inp = fds->in; outp = fds->out; exp = fds->ex; |
346 | rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; | 396 | rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; |
347 | 397 | ||
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) | |||
411 | to = &expire; | 461 | to = &expire; |
412 | } | 462 | } |
413 | 463 | ||
414 | if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) | 464 | if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, |
465 | to, slack)) | ||
415 | timed_out = 1; | 466 | timed_out = 1; |
416 | } | 467 | } |
417 | __set_current_state(TASK_RUNNING); | ||
418 | 468 | ||
419 | poll_freewait(&table); | 469 | poll_freewait(&table); |
420 | 470 | ||
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list, | |||
666 | for (;;) { | 716 | for (;;) { |
667 | struct poll_list *walk; | 717 | struct poll_list *walk; |
668 | 718 | ||
669 | set_current_state(TASK_INTERRUPTIBLE); | ||
670 | for (walk = list; walk != NULL; walk = walk->next) { | 719 | for (walk = list; walk != NULL; walk = walk->next) { |
671 | struct pollfd * pfd, * pfd_end; | 720 | struct pollfd * pfd, * pfd_end; |
672 | 721 | ||
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, | |||
709 | to = &expire; | 758 | to = &expire; |
710 | } | 759 | } |
711 | 760 | ||
712 | if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) | 761 | if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) |
713 | timed_out = 1; | 762 | timed_out = 1; |
714 | } | 763 | } |
715 | __set_current_state(TASK_RUNNING); | ||
716 | return count; | 764 | return count; |
717 | } | 765 | } |
718 | 766 | ||
diff --git a/fs/splice.c b/fs/splice.c index 1abab5cee4ba..a54b3e3f10a7 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/splice.h> | 23 | #include <linux/splice.h> |
24 | #include <linux/memcontrol.h> | ||
24 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
25 | #include <linux/swap.h> | 26 | #include <linux/swap.h> |
26 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile new file mode 100644 index 000000000000..8258cf9a0317 --- /dev/null +++ b/fs/squashfs/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # | ||
2 | # Makefile for the linux squashfs routines. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_SQUASHFS) += squashfs.o | ||
6 | squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o | ||
7 | squashfs-y += namei.o super.o symlink.o | ||
8 | #squashfs-y += squashfs2_0.o | ||
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c new file mode 100644 index 000000000000..c837dfc2b3c6 --- /dev/null +++ b/fs/squashfs/block.c | |||
@@ -0,0 +1,274 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * block.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements the low-level routines to read and decompress | ||
26 | * datablocks and metadata blocks. | ||
27 | */ | ||
28 | |||
29 | #include <linux/fs.h> | ||
30 | #include <linux/vfs.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/mutex.h> | ||
33 | #include <linux/string.h> | ||
34 | #include <linux/buffer_head.h> | ||
35 | #include <linux/zlib.h> | ||
36 | |||
37 | #include "squashfs_fs.h" | ||
38 | #include "squashfs_fs_sb.h" | ||
39 | #include "squashfs_fs_i.h" | ||
40 | #include "squashfs.h" | ||
41 | |||
42 | /* | ||
43 | * Read the metadata block length, this is stored in the first two | ||
44 | * bytes of the metadata block. | ||
45 | */ | ||
46 | static struct buffer_head *get_block_length(struct super_block *sb, | ||
47 | u64 *cur_index, int *offset, int *length) | ||
48 | { | ||
49 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
50 | struct buffer_head *bh; | ||
51 | |||
52 | bh = sb_bread(sb, *cur_index); | ||
53 | if (bh == NULL) | ||
54 | return NULL; | ||
55 | |||
56 | if (msblk->devblksize - *offset == 1) { | ||
57 | *length = (unsigned char) bh->b_data[*offset]; | ||
58 | put_bh(bh); | ||
59 | bh = sb_bread(sb, ++(*cur_index)); | ||
60 | if (bh == NULL) | ||
61 | return NULL; | ||
62 | *length |= (unsigned char) bh->b_data[0] << 8; | ||
63 | *offset = 1; | ||
64 | } else { | ||
65 | *length = (unsigned char) bh->b_data[*offset] | | ||
66 | (unsigned char) bh->b_data[*offset + 1] << 8; | ||
67 | *offset += 2; | ||
68 | } | ||
69 | |||
70 | return bh; | ||
71 | } | ||
72 | |||
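Metadata blocks are prefixed by a two-byte little-endian length word whose top bit, by squashfs convention, marks the block as stored uncompressed; get_block_length() also copes with the two bytes straddling a device block boundary, which is why it may read a second buffer_head. Decoding the word in isolation (byte values hypothetical; the real code uses the SQUASHFS_COMPRESSED* macros):

/* decode a squashfs metadata length prefix */
unsigned char b0 = 0x34, b1 = 0x92;	/* hypothetical on-disk bytes */
int word = b0 | (b1 << 8);		/* little endian: 0x9234 */
int uncompressed = word & 0x8000;	/* top bit set: stored raw */
int length = word & 0x7fff;		/* payload length: 0x1234 */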
73 | |||
74 | /* | ||
75 | * Read and decompress a metadata block or datablock. Length is non-zero | ||
76 | * if a datablock is being read (the size is stored elsewhere in the | ||
77 | * filesystem), otherwise the length is obtained from the first two bytes of | ||
78 | * the metadata block. A bit in the length field indicates if the block | ||
79 | * is stored uncompressed in the filesystem (usually because compression | ||
80 | * generated a larger block - this does occasionally happen with zlib). | ||
81 | */ | ||
82 | int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, | ||
83 | int length, u64 *next_index, int srclength) | ||
84 | { | ||
85 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
86 | struct buffer_head **bh; | ||
87 | int offset = index & ((1 << msblk->devblksize_log2) - 1); | ||
88 | u64 cur_index = index >> msblk->devblksize_log2; | ||
89 | int bytes, compressed, b = 0, k = 0, page = 0, avail; | ||
90 | |||
91 | |||
92 | bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1, | ||
93 | sizeof(*bh), GFP_KERNEL); | ||
94 | if (bh == NULL) | ||
95 | return -ENOMEM; | ||
96 | |||
97 | if (length) { | ||
98 | /* | ||
99 | * Datablock. | ||
100 | */ | ||
101 | bytes = -offset; | ||
102 | compressed = SQUASHFS_COMPRESSED_BLOCK(length); | ||
103 | length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); | ||
104 | if (next_index) | ||
105 | *next_index = index + length; | ||
106 | |||
107 | TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", | ||
108 | index, compressed ? "" : "un", length, srclength); | ||
109 | |||
110 | if (length < 0 || length > srclength || | ||
111 | (index + length) > msblk->bytes_used) | ||
112 | goto read_failure; | ||
113 | |||
114 | for (b = 0; bytes < length; b++, cur_index++) { | ||
115 | bh[b] = sb_getblk(sb, cur_index); | ||
116 | if (bh[b] == NULL) | ||
117 | goto block_release; | ||
118 | bytes += msblk->devblksize; | ||
119 | } | ||
120 | ll_rw_block(READ, b, bh); | ||
121 | } else { | ||
122 | /* | ||
123 | * Metadata block. | ||
124 | */ | ||
125 | if ((index + 2) > msblk->bytes_used) | ||
126 | goto read_failure; | ||
127 | |||
128 | bh[0] = get_block_length(sb, &cur_index, &offset, &length); | ||
129 | if (bh[0] == NULL) | ||
130 | goto read_failure; | ||
131 | b = 1; | ||
132 | |||
133 | bytes = msblk->devblksize - offset; | ||
134 | compressed = SQUASHFS_COMPRESSED(length); | ||
135 | length = SQUASHFS_COMPRESSED_SIZE(length); | ||
136 | if (next_index) | ||
137 | *next_index = index + length + 2; | ||
138 | |||
139 | TRACE("Block @ 0x%llx, %scompressed size %d\n", index, | ||
140 | compressed ? "" : "un", length); | ||
141 | |||
142 | if (length < 0 || length > srclength || | ||
143 | (index + length) > msblk->bytes_used) | ||
144 | goto block_release; | ||
145 | |||
146 | for (; bytes < length; b++) { | ||
147 | bh[b] = sb_getblk(sb, ++cur_index); | ||
148 | if (bh[b] == NULL) | ||
149 | goto block_release; | ||
150 | bytes += msblk->devblksize; | ||
151 | } | ||
152 | ll_rw_block(READ, b - 1, bh + 1); | ||
153 | } | ||
154 | |||
155 | if (compressed) { | ||
156 | int zlib_err = 0, zlib_init = 0; | ||
157 | |||
158 | /* | ||
159 | * Uncompress block. | ||
160 | */ | ||
161 | |||
162 | mutex_lock(&msblk->read_data_mutex); | ||
163 | |||
164 | msblk->stream.avail_out = 0; | ||
165 | msblk->stream.avail_in = 0; | ||
166 | |||
167 | bytes = length; | ||
168 | do { | ||
169 | if (msblk->stream.avail_in == 0 && k < b) { | ||
170 | avail = min(bytes, msblk->devblksize - offset); | ||
171 | bytes -= avail; | ||
172 | wait_on_buffer(bh[k]); | ||
173 | if (!buffer_uptodate(bh[k])) | ||
174 | goto release_mutex; | ||
175 | |||
176 | if (avail == 0) { | ||
177 | offset = 0; | ||
178 | put_bh(bh[k++]); | ||
179 | continue; | ||
180 | } | ||
181 | |||
182 | msblk->stream.next_in = bh[k]->b_data + offset; | ||
183 | msblk->stream.avail_in = avail; | ||
184 | offset = 0; | ||
185 | } | ||
186 | |||
187 | if (msblk->stream.avail_out == 0) { | ||
188 | msblk->stream.next_out = buffer[page++]; | ||
189 | msblk->stream.avail_out = PAGE_CACHE_SIZE; | ||
190 | } | ||
191 | |||
192 | if (!zlib_init) { | ||
193 | zlib_err = zlib_inflateInit(&msblk->stream); | ||
194 | if (zlib_err != Z_OK) { | ||
195 | ERROR("zlib_inflateInit returned" | ||
196 | " unexpected result 0x%x," | ||
197 | " srclength %d\n", zlib_err, | ||
198 | srclength); | ||
199 | goto release_mutex; | ||
200 | } | ||
201 | zlib_init = 1; | ||
202 | } | ||
203 | |||
204 | zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); | ||
205 | |||
206 | if (msblk->stream.avail_in == 0 && k < b) | ||
207 | put_bh(bh[k++]); | ||
208 | } while (zlib_err == Z_OK); | ||
209 | |||
210 | if (zlib_err != Z_STREAM_END) { | ||
211 | ERROR("zlib_inflate returned unexpected result" | ||
212 | " 0x%x, srclength %d, avail_in %d," | ||
213 | " avail_out %d\n", zlib_err, srclength, | ||
214 | msblk->stream.avail_in, | ||
215 | msblk->stream.avail_out); | ||
216 | goto release_mutex; | ||
217 | } | ||
218 | |||
219 | zlib_err = zlib_inflateEnd(&msblk->stream); | ||
220 | if (zlib_err != Z_OK) { | ||
221 | ERROR("zlib_inflateEnd returned unexpected result 0x%x," | ||
222 | " srclength %d\n", zlib_err, srclength); | ||
223 | goto release_mutex; | ||
224 | } | ||
225 | length = msblk->stream.total_out; | ||
226 | mutex_unlock(&msblk->read_data_mutex); | ||
227 | } else { | ||
228 | /* | ||
229 | * Block is uncompressed. | ||
230 | */ | ||
231 | int i, in, pg_offset = 0; | ||
232 | |||
233 | for (i = 0; i < b; i++) { | ||
234 | wait_on_buffer(bh[i]); | ||
235 | if (!buffer_uptodate(bh[i])) | ||
236 | goto block_release; | ||
237 | } | ||
238 | |||
239 | for (bytes = length; k < b; k++) { | ||
240 | in = min(bytes, msblk->devblksize - offset); | ||
241 | bytes -= in; | ||
242 | while (in) { | ||
243 | if (pg_offset == PAGE_CACHE_SIZE) { | ||
244 | page++; | ||
245 | pg_offset = 0; | ||
246 | } | ||
247 | avail = min_t(int, in, PAGE_CACHE_SIZE - | ||
248 | pg_offset); | ||
249 | memcpy(buffer[page] + pg_offset, | ||
250 | bh[k]->b_data + offset, avail); | ||
251 | in -= avail; | ||
252 | pg_offset += avail; | ||
253 | offset += avail; | ||
254 | } | ||
255 | offset = 0; | ||
256 | put_bh(bh[k]); | ||
257 | } | ||
258 | } | ||
259 | |||
260 | kfree(bh); | ||
261 | return length; | ||
262 | |||
263 | release_mutex: | ||
264 | mutex_unlock(&msblk->read_data_mutex); | ||
265 | |||
266 | block_release: | ||
267 | for (; k < b; k++) | ||
268 | put_bh(bh[k]); | ||
269 | |||
270 | read_failure: | ||
271 | ERROR("sb_bread failed reading block 0x%llx\n", cur_index); | ||
272 | kfree(bh); | ||
273 | return -EIO; | ||
274 | } | ||
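/*
 * Sketch of the length-field encoding described in the comment above
 * squashfs_read_data() (not part of the patch; the flag-bit positions
 * are assumptions based on the SQUASHFS_COMPRESSED* macro names).  A
 * single bit marks a block as stored uncompressed; the remaining bits
 * hold the size:
 */
#define SQFS_META_UNCOMPRESSED	(1 << 15)	/* assumed: metadata length words */
#define SQFS_DATA_UNCOMPRESSED	(1 << 24)	/* assumed: datablock lengths */

static inline void sqfs_decode_meta(int word, int *compressed, int *size)
{
	*compressed = !(word & SQFS_META_UNCOMPRESSED);
	*size = word & (SQFS_META_UNCOMPRESSED - 1);
}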
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c new file mode 100644 index 000000000000..f29eda16d25e --- /dev/null +++ b/fs/squashfs/cache.c | |||
@@ -0,0 +1,412 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * cache.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * Blocks in Squashfs are compressed. To avoid repeatedly decompressing | ||
26 | * recently accessed data Squashfs uses two small metadata and fragment caches. | ||
27 | * | ||
28 | * This file implements a generic cache implementation used for both caches, | ||
29 | * plus functions layered on top of the generic cache implementation to | ||
30 | * access the metadata and fragment caches. | ||
31 | * | ||
32 | * To avoid out of memory and fragmentation issues with vmalloc the cache | ||
33 | * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. | ||
34 | * | ||
35 | * It should be noted that the cache is not used for file datablocks; these | ||
36 | * are decompressed and cached in the page-cache in the normal way. The | ||
37 | * cache is only used to temporarily cache fragment and metadata blocks | ||
38 | * which have been read as a result of a metadata (i.e. inode or | ||
39 | * directory) or fragment access. Because metadata and fragments are packed | ||
40 | * together into blocks (to gain greater compression) the read of a particular | ||
41 | * piece of metadata or fragment will retrieve other metadata/fragments which | ||
42 | * have been packed with it; because of locality-of-reference these may be | ||
43 | * read in the near future. Temporarily caching them ensures they are | ||
44 | * available for near-future access without an additional read and decompress. | ||
45 | */ | ||
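/*
 * Typical use of the cache interface implemented below (a sketch with
 * error handling trimmed): look a block up, copy data out of the
 * returned entry, then release it so the slot can be reclaimed.
 *
 *	entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
 *	if (!entry->error)
 *		squashfs_copy_data(buffer, entry, offset, length);
 *	squashfs_cache_put(entry);
 */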
46 | |||
47 | #include <linux/fs.h> | ||
48 | #include <linux/vfs.h> | ||
49 | #include <linux/slab.h> | ||
50 | #include <linux/vmalloc.h> | ||
51 | #include <linux/sched.h> | ||
52 | #include <linux/spinlock.h> | ||
53 | #include <linux/wait.h> | ||
54 | #include <linux/zlib.h> | ||
55 | #include <linux/pagemap.h> | ||
56 | |||
57 | #include "squashfs_fs.h" | ||
58 | #include "squashfs_fs_sb.h" | ||
59 | #include "squashfs_fs_i.h" | ||
60 | #include "squashfs.h" | ||
61 | |||
62 | /* | ||
63 | * Look-up block in cache, and increment usage count. If not in cache, read | ||
64 | * and decompress it from disk. | ||
65 | */ | ||
66 | struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, | ||
67 | struct squashfs_cache *cache, u64 block, int length) | ||
68 | { | ||
69 | int i, n; | ||
70 | struct squashfs_cache_entry *entry; | ||
71 | |||
72 | spin_lock(&cache->lock); | ||
73 | |||
74 | while (1) { | ||
75 | for (i = 0; i < cache->entries; i++) | ||
76 | if (cache->entry[i].block == block) | ||
77 | break; | ||
78 | |||
79 | if (i == cache->entries) { | ||
80 | /* | ||
81 | * Block not in cache; if all cache entries are used, | ||
82 | * go to sleep waiting for one to become available. | ||
83 | */ | ||
84 | if (cache->unused == 0) { | ||
85 | cache->num_waiters++; | ||
86 | spin_unlock(&cache->lock); | ||
87 | wait_event(cache->wait_queue, cache->unused); | ||
88 | spin_lock(&cache->lock); | ||
89 | cache->num_waiters--; | ||
90 | continue; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * At least one unused cache entry. A simple | ||
95 | * round-robin strategy is used to choose the entry to | ||
96 | * be evicted from the cache. | ||
97 | */ | ||
98 | i = cache->next_blk; | ||
99 | for (n = 0; n < cache->entries; n++) { | ||
100 | if (cache->entry[i].refcount == 0) | ||
101 | break; | ||
102 | i = (i + 1) % cache->entries; | ||
103 | } | ||
104 | |||
105 | cache->next_blk = (i + 1) % cache->entries; | ||
106 | entry = &cache->entry[i]; | ||
107 | |||
108 | /* | ||
109 | * Initialise the chosen cache entry, and fill it in from | ||
110 | * disk. | ||
111 | */ | ||
112 | cache->unused--; | ||
113 | entry->block = block; | ||
114 | entry->refcount = 1; | ||
115 | entry->pending = 1; | ||
116 | entry->num_waiters = 0; | ||
117 | entry->error = 0; | ||
118 | spin_unlock(&cache->lock); | ||
119 | |||
120 | entry->length = squashfs_read_data(sb, entry->data, | ||
121 | block, length, &entry->next_index, | ||
122 | cache->block_size); | ||
123 | |||
124 | spin_lock(&cache->lock); | ||
125 | |||
126 | if (entry->length < 0) | ||
127 | entry->error = entry->length; | ||
128 | |||
129 | entry->pending = 0; | ||
130 | |||
131 | /* | ||
132 | * While this entry was being filled, one or more other | ||
133 | * processes may have looked it up in the cache and slept | ||
134 | * waiting for it to become available; wake them up. | ||
135 | */ | ||
136 | if (entry->num_waiters) { | ||
137 | spin_unlock(&cache->lock); | ||
138 | wake_up_all(&entry->wait_queue); | ||
139 | } else | ||
140 | spin_unlock(&cache->lock); | ||
141 | |||
142 | goto out; | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Block already in cache. Increment refcount so it doesn't | ||
147 | * get reused until we're finished with it. If it was | ||
148 | * previously unused there's one less cache entry available | ||
149 | * for reuse. | ||
150 | */ | ||
151 | entry = &cache->entry[i]; | ||
152 | if (entry->refcount == 0) | ||
153 | cache->unused--; | ||
154 | entry->refcount++; | ||
155 | |||
156 | /* | ||
157 | * If the entry is currently being filled in by another process | ||
158 | * go to sleep waiting for it to become available. | ||
159 | */ | ||
160 | if (entry->pending) { | ||
161 | entry->num_waiters++; | ||
162 | spin_unlock(&cache->lock); | ||
163 | wait_event(entry->wait_queue, !entry->pending); | ||
164 | } else | ||
165 | spin_unlock(&cache->lock); | ||
166 | |||
167 | goto out; | ||
168 | } | ||
169 | |||
170 | out: | ||
171 | TRACE("Got %s %d, start block %lld, refcount %d, error %d\n", | ||
172 | cache->name, i, entry->block, entry->refcount, entry->error); | ||
173 | |||
174 | if (entry->error) | ||
175 | ERROR("Unable to read %s cache entry [%llx]\n", cache->name, | ||
176 | block); | ||
177 | return entry; | ||
178 | } | ||
179 | |||
180 | |||
181 | /* | ||
182 | * Release a cache entry; once its usage count is zero it can be reused. | ||
183 | */ | ||
184 | void squashfs_cache_put(struct squashfs_cache_entry *entry) | ||
185 | { | ||
186 | struct squashfs_cache *cache = entry->cache; | ||
187 | |||
188 | spin_lock(&cache->lock); | ||
189 | entry->refcount--; | ||
190 | if (entry->refcount == 0) { | ||
191 | cache->unused++; | ||
192 | /* | ||
193 | * If there are any processes waiting for a block to become | ||
194 | * available, wake one up. | ||
195 | */ | ||
196 | if (cache->num_waiters) { | ||
197 | spin_unlock(&cache->lock); | ||
198 | wake_up(&cache->wait_queue); | ||
199 | return; | ||
200 | } | ||
201 | } | ||
202 | spin_unlock(&cache->lock); | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * Delete the cache, reclaiming all kmalloced buffers. | ||
207 | */ | ||
208 | void squashfs_cache_delete(struct squashfs_cache *cache) | ||
209 | { | ||
210 | int i, j; | ||
211 | |||
212 | if (cache == NULL) | ||
213 | return; | ||
214 | |||
215 | for (i = 0; i < cache->entries; i++) { | ||
216 | if (cache->entry[i].data) { | ||
217 | for (j = 0; j < cache->pages; j++) | ||
218 | kfree(cache->entry[i].data[j]); | ||
219 | kfree(cache->entry[i].data); | ||
220 | } | ||
221 | } | ||
222 | |||
223 | kfree(cache->entry); | ||
224 | kfree(cache); | ||
225 | } | ||
226 | |||
227 | |||
228 | /* | ||
229 | * Initialise cache allocating the specified number of entries, each of | ||
230 | * size block_size. To avoid vmalloc fragmentation issues each entry | ||
231 | * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers. | ||
232 | */ | ||
233 | struct squashfs_cache *squashfs_cache_init(char *name, int entries, | ||
234 | int block_size) | ||
235 | { | ||
236 | int i, j; | ||
237 | struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); | ||
238 | |||
239 | if (cache == NULL) { | ||
240 | ERROR("Failed to allocate %s cache\n", name); | ||
241 | return NULL; | ||
242 | } | ||
243 | |||
244 | cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); | ||
245 | if (cache->entry == NULL) { | ||
246 | ERROR("Failed to allocate %s cache\n", name); | ||
247 | goto cleanup; | ||
248 | } | ||
249 | |||
250 | cache->next_blk = 0; | ||
251 | cache->unused = entries; | ||
252 | cache->entries = entries; | ||
253 | cache->block_size = block_size; | ||
254 | cache->pages = block_size >> PAGE_CACHE_SHIFT; | ||
255 | cache->name = name; | ||
256 | cache->num_waiters = 0; | ||
257 | spin_lock_init(&cache->lock); | ||
258 | init_waitqueue_head(&cache->wait_queue); | ||
259 | |||
260 | for (i = 0; i < entries; i++) { | ||
261 | struct squashfs_cache_entry *entry = &cache->entry[i]; | ||
262 | |||
263 | init_waitqueue_head(&cache->entry[i].wait_queue); | ||
264 | entry->cache = cache; | ||
265 | entry->block = SQUASHFS_INVALID_BLK; | ||
266 | entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); | ||
267 | if (entry->data == NULL) { | ||
268 | ERROR("Failed to allocate %s cache entry\n", name); | ||
269 | goto cleanup; | ||
270 | } | ||
271 | |||
272 | for (j = 0; j < cache->pages; j++) { | ||
273 | entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); | ||
274 | if (entry->data[j] == NULL) { | ||
275 | ERROR("Failed to allocate %s buffer\n", name); | ||
276 | goto cleanup; | ||
277 | } | ||
278 | } | ||
279 | } | ||
280 | |||
281 | return cache; | ||
282 | |||
283 | cleanup: | ||
284 | squashfs_cache_delete(cache); | ||
285 | return NULL; | ||
286 | } | ||
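/*
 * Sketch of how a mount path might create the two caches described in
 * the header comment (illustrative only; the real calls live in the
 * superblock code, and the constant names are assumptions):
 *
 *	msblk->block_cache = squashfs_cache_init("metadata",
 *		SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
 *	msblk->fragment_cache = squashfs_cache_init("fragment",
 *		SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
 */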
287 | |||
288 | |||
289 | /* | ||
290 | * Copy up to length bytes from the cache entry to buffer, starting at offset | ||
291 | * bytes into the cache entry. If fewer than length bytes are available, copy | ||
292 | * the number of bytes available. In all cases return the number of bytes copied. | ||
293 | */ | ||
294 | int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, | ||
295 | int offset, int length) | ||
296 | { | ||
297 | int remaining = length; | ||
298 | |||
299 | if (length == 0) | ||
300 | return 0; | ||
301 | else if (buffer == NULL) | ||
302 | return min(length, entry->length - offset); | ||
303 | |||
304 | while (offset < entry->length) { | ||
305 | void *buff = entry->data[offset / PAGE_CACHE_SIZE] | ||
306 | + (offset % PAGE_CACHE_SIZE); | ||
307 | int bytes = min_t(int, entry->length - offset, | ||
308 | PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); | ||
309 | |||
310 | if (bytes >= remaining) { | ||
311 | memcpy(buffer, buff, remaining); | ||
312 | remaining = 0; | ||
313 | break; | ||
314 | } | ||
315 | |||
316 | memcpy(buffer, buff, bytes); | ||
317 | buffer += bytes; | ||
318 | remaining -= bytes; | ||
319 | offset += bytes; | ||
320 | } | ||
321 | |||
322 | return length - remaining; | ||
323 | } | ||
324 | |||
325 | |||
326 | /* | ||
327 | * Read length bytes from metadata position <block, offset> (block is the | ||
328 | * start of the compressed block on disk, and offset is the offset into | ||
329 | * the block once decompressed). Data is packed into consecutive blocks, | ||
330 | * and length bytes may require reading more than one block. | ||
331 | */ | ||
332 | int squashfs_read_metadata(struct super_block *sb, void *buffer, | ||
333 | u64 *block, int *offset, int length) | ||
334 | { | ||
335 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
336 | int bytes, copied = length; | ||
337 | struct squashfs_cache_entry *entry; | ||
338 | |||
339 | TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); | ||
340 | |||
341 | while (length) { | ||
342 | entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); | ||
343 | if (entry->error) | ||
344 | return entry->error; | ||
345 | else if (*offset >= entry->length) | ||
346 | return -EIO; | ||
347 | |||
348 | bytes = squashfs_copy_data(buffer, entry, *offset, length); | ||
349 | if (buffer) | ||
350 | buffer += bytes; | ||
351 | length -= bytes; | ||
352 | *offset += bytes; | ||
353 | |||
354 | if (*offset == entry->length) { | ||
355 | *block = entry->next_index; | ||
356 | *offset = 0; | ||
357 | } | ||
358 | |||
359 | squashfs_cache_put(entry); | ||
360 | } | ||
361 | |||
362 | return copied; | ||
363 | } | ||
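/*
 * Example use (a sketch): because <block, offset> are updated in place,
 * consecutive reads walk naturally across metadata block boundaries.
 * Here start is an illustrative position within the metadata:
 *
 *	u64 block = start;
 *	int offset = 0;
 *	struct squashfs_dir_header dirh;
 *
 *	err = squashfs_read_metadata(sb, &dirh, &block, &offset,
 *			sizeof(dirh));
 */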
364 | |||
365 | |||
366 | /* | ||
367 | * Look up, in the fragment cache, the fragment located at <start_block> in | ||
368 | * the filesystem. If necessary read and decompress it from disk. | ||
369 | */ | ||
370 | struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb, | ||
371 | u64 start_block, int length) | ||
372 | { | ||
373 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
374 | |||
375 | return squashfs_cache_get(sb, msblk->fragment_cache, start_block, | ||
376 | length); | ||
377 | } | ||
378 | |||
379 | |||
380 | /* | ||
381 | * Read and decompress the datablock located at <start_block> in the | ||
382 | * filesystem. The cache is used here to avoid duplicating locking and | ||
383 | * read/decompress code. | ||
384 | */ | ||
385 | struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, | ||
386 | u64 start_block, int length) | ||
387 | { | ||
388 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
389 | |||
390 | return squashfs_cache_get(sb, msblk->read_page, start_block, length); | ||
391 | } | ||
392 | |||
393 | |||
394 | /* | ||
395 | * Read a filesystem table (uncompressed sequence of bytes) from disk | ||
396 | */ | ||
397 | int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, | ||
398 | int length) | ||
399 | { | ||
400 | int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
401 | int i, res; | ||
402 | void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL); | ||
403 | if (data == NULL) | ||
404 | return -ENOMEM; | ||
405 | |||
406 | for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) | ||
407 | data[i] = buffer; | ||
408 | res = squashfs_read_data(sb, data, block, length | | ||
409 | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); | ||
410 | kfree(data); | ||
411 | return res; | ||
412 | } | ||
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c new file mode 100644 index 000000000000..566b0eaed868 --- /dev/null +++ b/fs/squashfs/dir.c | |||
@@ -0,0 +1,235 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * dir.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to read directories from disk. | ||
26 | * | ||
27 | * See namei.c for a description of directory organisation on disk. | ||
28 | */ | ||
29 | |||
30 | #include <linux/fs.h> | ||
31 | #include <linux/vfs.h> | ||
32 | #include <linux/slab.h> | ||
33 | #include <linux/zlib.h> | ||
34 | |||
35 | #include "squashfs_fs.h" | ||
36 | #include "squashfs_fs_sb.h" | ||
37 | #include "squashfs_fs_i.h" | ||
38 | #include "squashfs.h" | ||
39 | |||
40 | static const unsigned char squashfs_filetype_table[] = { | ||
41 | DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK | ||
42 | }; | ||
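/*
 * Sketch (not part of the patch): the table above translates the
 * on-disk directory entry type (the index; the numbering is assumed
 * from the table ordering, e.g. 1 = directory, 2 = regular file) into
 * the DT_* value handed to filldir:
 *
 *	unsigned char dtype = squashfs_filetype_table[type];
 */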
43 | |||
44 | /* | ||
45 | * Lookup offset (f_pos) in the directory index, returning the | ||
46 | * metadata block containing it. | ||
47 | * | ||
48 | * If we get an error reading the index then return the part of the index | ||
49 | * (if any) we have managed to read - the index isn't essential, just | ||
50 | * quicker. | ||
51 | */ | ||
52 | static int get_dir_index_using_offset(struct super_block *sb, | ||
53 | u64 *next_block, int *next_offset, u64 index_start, int index_offset, | ||
54 | int i_count, u64 f_pos) | ||
55 | { | ||
56 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
57 | int err, i, index, length = 0; | ||
58 | struct squashfs_dir_index dir_index; | ||
59 | |||
60 | TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", | ||
61 | i_count, f_pos); | ||
62 | |||
63 | /* | ||
64 | * Translate from external f_pos to the internal f_pos. This | ||
65 | * is offset by 3 because we invent "." and ".." entries which are | ||
66 | * not actually stored in the directory. | ||
67 | */ | ||
68 | if (f_pos < 3) | ||
69 | return f_pos; | ||
70 | f_pos -= 3; | ||
71 | |||
72 | for (i = 0; i < i_count; i++) { | ||
73 | err = squashfs_read_metadata(sb, &dir_index, &index_start, | ||
74 | &index_offset, sizeof(dir_index)); | ||
75 | if (err < 0) | ||
76 | break; | ||
77 | |||
78 | index = le32_to_cpu(dir_index.index); | ||
79 | if (index > f_pos) | ||
80 | /* | ||
81 | * Found the index we're looking for. | ||
82 | */ | ||
83 | break; | ||
84 | |||
85 | err = squashfs_read_metadata(sb, NULL, &index_start, | ||
86 | &index_offset, le32_to_cpu(dir_index.size) + 1); | ||
87 | if (err < 0) | ||
88 | break; | ||
89 | |||
90 | length = index; | ||
91 | *next_block = le32_to_cpu(dir_index.start_block) + | ||
92 | msblk->directory_table; | ||
93 | } | ||
94 | |||
95 | *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; | ||
96 | |||
97 | /* | ||
98 | * Translate back from internal f_pos to external f_pos. | ||
99 | */ | ||
100 | return length + 3; | ||
101 | } | ||
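/*
 * Worked example (a sketch): an external f_pos of 3 corresponds to
 * internal position 0, i.e. the first entry actually stored on disk,
 * because external positions 0..2 are consumed by the invented "." and
 * ".." entries (see squashfs_readdir() below).
 */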
102 | |||
103 | |||
104 | static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) | ||
105 | { | ||
106 | struct inode *inode = file->f_dentry->d_inode; | ||
107 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; | ||
108 | u64 block = squashfs_i(inode)->start + msblk->directory_table; | ||
109 | int offset = squashfs_i(inode)->offset, length = 0, dir_count, size, | ||
110 | type, err; | ||
111 | unsigned int inode_number; | ||
112 | struct squashfs_dir_header dirh; | ||
113 | struct squashfs_dir_entry *dire; | ||
114 | |||
115 | TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset); | ||
116 | |||
117 | dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); | ||
118 | if (dire == NULL) { | ||
119 | ERROR("Failed to allocate squashfs_dir_entry\n"); | ||
120 | goto finish; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Return "." and ".." entries as the first two filenames in the | ||
125 | * directory. To maximise compression these two entries are not | ||
126 | * stored in the directory, and so we invent them here. | ||
127 | * | ||
128 | * It also means that the external f_pos is offset by 3 from the | ||
129 | * on-disk directory f_pos. | ||
130 | */ | ||
131 | while (file->f_pos < 3) { | ||
132 | char *name; | ||
133 | int i_ino; | ||
134 | |||
135 | if (file->f_pos == 0) { | ||
136 | name = "."; | ||
137 | size = 1; | ||
138 | i_ino = inode->i_ino; | ||
139 | } else { | ||
140 | name = ".."; | ||
141 | size = 2; | ||
142 | i_ino = squashfs_i(inode)->parent; | ||
143 | } | ||
144 | |||
145 | TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", | ||
146 | dirent, name, size, file->f_pos, i_ino, | ||
147 | squashfs_filetype_table[1]); | ||
148 | |||
149 | if (filldir(dirent, name, size, file->f_pos, i_ino, | ||
150 | squashfs_filetype_table[1]) < 0) { | ||
151 | TRACE("Filldir returned less than 0\n"); | ||
152 | goto finish; | ||
153 | } | ||
154 | |||
155 | file->f_pos += size; | ||
156 | } | ||
157 | |||
158 | length = get_dir_index_using_offset(inode->i_sb, &block, &offset, | ||
159 | squashfs_i(inode)->dir_idx_start, | ||
160 | squashfs_i(inode)->dir_idx_offset, | ||
161 | squashfs_i(inode)->dir_idx_cnt, | ||
162 | file->f_pos); | ||
163 | |||
164 | while (length < i_size_read(inode)) { | ||
165 | /* | ||
166 | * Read directory header | ||
167 | */ | ||
168 | err = squashfs_read_metadata(inode->i_sb, &dirh, &block, | ||
169 | &offset, sizeof(dirh)); | ||
170 | if (err < 0) | ||
171 | goto failed_read; | ||
172 | |||
173 | length += sizeof(dirh); | ||
174 | |||
175 | dir_count = le32_to_cpu(dirh.count) + 1; | ||
176 | while (dir_count--) { | ||
177 | /* | ||
178 | * Read directory entry. | ||
179 | */ | ||
180 | err = squashfs_read_metadata(inode->i_sb, dire, &block, | ||
181 | &offset, sizeof(*dire)); | ||
182 | if (err < 0) | ||
183 | goto failed_read; | ||
184 | |||
185 | size = le16_to_cpu(dire->size) + 1; | ||
186 | |||
187 | err = squashfs_read_metadata(inode->i_sb, dire->name, | ||
188 | &block, &offset, size); | ||
189 | if (err < 0) | ||
190 | goto failed_read; | ||
191 | |||
192 | length += sizeof(*dire) + size; | ||
193 | |||
194 | if (file->f_pos >= length) | ||
195 | continue; | ||
196 | |||
197 | dire->name[size] = '\0'; | ||
198 | inode_number = le32_to_cpu(dirh.inode_number) + | ||
199 | ((short) le16_to_cpu(dire->inode_number)); | ||
200 | type = le16_to_cpu(dire->type); | ||
201 | |||
202 | TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" | ||
203 | "\n", dirent, dire->name, size, | ||
204 | file->f_pos, | ||
205 | le32_to_cpu(dirh.start_block), | ||
206 | le16_to_cpu(dire->offset), | ||
207 | inode_number, | ||
208 | squashfs_filetype_table[type]); | ||
209 | |||
210 | if (filldir(dirent, dire->name, size, file->f_pos, | ||
211 | inode_number, | ||
212 | squashfs_filetype_table[type]) < 0) { | ||
213 | TRACE("Filldir returned less than 0\n"); | ||
214 | goto finish; | ||
215 | } | ||
216 | |||
217 | file->f_pos = length; | ||
218 | } | ||
219 | } | ||
220 | |||
221 | finish: | ||
222 | kfree(dire); | ||
223 | return 0; | ||
224 | |||
225 | failed_read: | ||
226 | ERROR("Unable to read directory block [%llx:%x]\n", block, offset); | ||
227 | kfree(dire); | ||
228 | return 0; | ||
229 | } | ||
230 | |||
231 | |||
232 | const struct file_operations squashfs_dir_ops = { | ||
233 | .read = generic_read_dir, | ||
234 | .readdir = squashfs_readdir | ||
235 | }; | ||
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c new file mode 100644 index 000000000000..69e971d5ddc1 --- /dev/null +++ b/fs/squashfs/export.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * export.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to make Squashfs filesystems exportable (NFS etc.) | ||
26 | * | ||
27 | * The export code uses an inode lookup table to map inode numbers passed in | ||
28 | * filehandles to an inode location on disk. This table is stored compressed | ||
29 | * into metadata blocks. A second index table is used to locate these. For | ||
30 | * speed of access (and because it is small) this second index table is read | ||
31 | * at mount time and cached in memory. | ||
32 | * | ||
33 | * The inode lookup table is used only by the export code; inode disk | ||
34 | * locations are directly encoded in directories, enabling direct access | ||
35 | * without an intermediate lookup for all operations except the export ops. | ||
36 | */ | ||
37 | |||
38 | #include <linux/fs.h> | ||
39 | #include <linux/vfs.h> | ||
40 | #include <linux/dcache.h> | ||
41 | #include <linux/exportfs.h> | ||
42 | #include <linux/zlib.h> | ||
43 | |||
44 | #include "squashfs_fs.h" | ||
45 | #include "squashfs_fs_sb.h" | ||
46 | #include "squashfs_fs_i.h" | ||
47 | #include "squashfs.h" | ||
48 | |||
49 | /* | ||
50 | * Look-up inode number (ino) in table, returning the inode location. | ||
51 | */ | ||
52 | static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) | ||
53 | { | ||
54 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
55 | int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); | ||
56 | int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); | ||
57 | u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); | ||
58 | __le64 ino; | ||
59 | int err; | ||
60 | |||
61 | TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); | ||
62 | |||
63 | err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); | ||
64 | if (err < 0) | ||
65 | return err; | ||
66 | |||
67 | TRACE("squashfs_inode_lookup, inode = 0x%llx\n", | ||
68 | (u64) le64_to_cpu(ino)); | ||
69 | |||
70 | return le64_to_cpu(ino); | ||
71 | } | ||
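/*
 * Sketch of the index arithmetic the SQUASHFS_LOOKUP_BLOCK*() macros
 * are assumed to perform (not part of the patch): inode numbers index
 * an array of little-endian u64 disk locations packed into
 * metadata-sized blocks.
 */
static void sqfs_lookup_position(int ino_num, int *blk, int *offset)
{
	int bytes = (ino_num - 1) * sizeof(__le64);

	*blk = bytes / SQUASHFS_METADATA_SIZE;		/* index table entry */
	*offset = bytes % SQUASHFS_METADATA_SIZE;	/* offset in that block */
}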
72 | |||
73 | |||
74 | static struct dentry *squashfs_export_iget(struct super_block *sb, | ||
75 | unsigned int ino_num) | ||
76 | { | ||
77 | long long ino; | ||
78 | struct dentry *dentry = ERR_PTR(-ENOENT); | ||
79 | |||
80 | TRACE("Entered squashfs_export_iget\n"); | ||
81 | |||
82 | ino = squashfs_inode_lookup(sb, ino_num); | ||
83 | if (ino >= 0) | ||
84 | dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); | ||
85 | |||
86 | return dentry; | ||
87 | } | ||
88 | |||
89 | |||
90 | static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, | ||
91 | struct fid *fid, int fh_len, int fh_type) | ||
92 | { | ||
93 | if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) | ||
94 | || fh_len < 2) | ||
95 | return NULL; | ||
96 | |||
97 | return squashfs_export_iget(sb, fid->i32.ino); | ||
98 | } | ||
99 | |||
100 | |||
101 | static struct dentry *squashfs_fh_to_parent(struct super_block *sb, | ||
102 | struct fid *fid, int fh_len, int fh_type) | ||
103 | { | ||
104 | if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) | ||
105 | return NULL; | ||
106 | |||
107 | return squashfs_export_iget(sb, fid->i32.parent_ino); | ||
108 | } | ||
109 | |||
110 | |||
111 | static struct dentry *squashfs_get_parent(struct dentry *child) | ||
112 | { | ||
113 | struct inode *inode = child->d_inode; | ||
114 | unsigned int parent_ino = squashfs_i(inode)->parent; | ||
115 | |||
116 | return squashfs_export_iget(inode->i_sb, parent_ino); | ||
117 | } | ||
118 | |||
119 | |||
120 | /* | ||
121 | * Read uncompressed inode lookup table indexes off disk into memory | ||
122 | */ | ||
123 | __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, | ||
124 | u64 lookup_table_start, unsigned int inodes) | ||
125 | { | ||
126 | unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); | ||
127 | __le64 *inode_lookup_table; | ||
128 | int err; | ||
129 | |||
130 | TRACE("In read_inode_lookup_table, length %d\n", length); | ||
131 | |||
132 | /* Allocate inode lookup table indexes */ | ||
133 | inode_lookup_table = kmalloc(length, GFP_KERNEL); | ||
134 | if (inode_lookup_table == NULL) { | ||
135 | ERROR("Failed to allocate inode lookup table\n"); | ||
136 | return ERR_PTR(-ENOMEM); | ||
137 | } | ||
138 | |||
139 | err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start, | ||
140 | length); | ||
141 | if (err < 0) { | ||
142 | ERROR("unable to read inode lookup table\n"); | ||
143 | kfree(inode_lookup_table); | ||
144 | return ERR_PTR(err); | ||
145 | } | ||
146 | |||
147 | return inode_lookup_table; | ||
148 | } | ||
149 | |||
150 | |||
151 | const struct export_operations squashfs_export_ops = { | ||
152 | .fh_to_dentry = squashfs_fh_to_dentry, | ||
153 | .fh_to_parent = squashfs_fh_to_parent, | ||
154 | .get_parent = squashfs_get_parent | ||
155 | }; | ||
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c new file mode 100644 index 000000000000..717767d831df --- /dev/null +++ b/fs/squashfs/file.c | |||
@@ -0,0 +1,502 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * file.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file contains code for handling regular files. A regular file | ||
26 | * consists of a sequence of contiguous compressed blocks, and/or a | ||
27 | * compressed fragment block (tail-end packed block). The compressed size | ||
28 | * of each datablock is stored in a block list contained within the | ||
29 | * file inode (itself stored in one or more compressed metadata blocks). | ||
30 | * | ||
31 | * To speed up access to datablocks when reading 'large' files (256 Mbytes or | ||
32 | * larger), the code implements an index cache that caches the mapping from | ||
33 | * block index to datablock location on disk. | ||
34 | * | ||
35 | * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while | ||
36 | * retaining a simple and space-efficient block list on disk. The cache | ||
37 | * is split into slots, caching up to eight 224 GiB files (128 KiB blocks). | ||
38 | * Larger files use multiple slots, with 1.75 TiB files using all 8 slots. | ||
39 | * The index cache is designed to be memory efficient, and by default uses | ||
40 | * 16 KiB. | ||
41 | */ | ||
42 | |||
43 | #include <linux/fs.h> | ||
44 | #include <linux/vfs.h> | ||
45 | #include <linux/kernel.h> | ||
46 | #include <linux/slab.h> | ||
47 | #include <linux/string.h> | ||
48 | #include <linux/pagemap.h> | ||
49 | #include <linux/mutex.h> | ||
50 | #include <linux/zlib.h> | ||
51 | |||
52 | #include "squashfs_fs.h" | ||
53 | #include "squashfs_fs_sb.h" | ||
54 | #include "squashfs_fs_i.h" | ||
55 | #include "squashfs.h" | ||
56 | |||
57 | /* | ||
58 | * Locate cache slot in range [offset, index] for specified inode. If | ||
59 | * there's more than one, return the slot closest to index. | ||
60 | */ | ||
61 | static struct meta_index *locate_meta_index(struct inode *inode, int offset, | ||
62 | int index) | ||
63 | { | ||
64 | struct meta_index *meta = NULL; | ||
65 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; | ||
66 | int i; | ||
67 | |||
68 | mutex_lock(&msblk->meta_index_mutex); | ||
69 | |||
70 | TRACE("locate_meta_index: index %d, offset %d\n", index, offset); | ||
71 | |||
72 | if (msblk->meta_index == NULL) | ||
73 | goto not_allocated; | ||
74 | |||
75 | for (i = 0; i < SQUASHFS_META_SLOTS; i++) { | ||
76 | if (msblk->meta_index[i].inode_number == inode->i_ino && | ||
77 | msblk->meta_index[i].offset >= offset && | ||
78 | msblk->meta_index[i].offset <= index && | ||
79 | msblk->meta_index[i].locked == 0) { | ||
80 | TRACE("locate_meta_index: entry %d, offset %d\n", i, | ||
81 | msblk->meta_index[i].offset); | ||
82 | meta = &msblk->meta_index[i]; | ||
83 | offset = meta->offset; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | if (meta) | ||
88 | meta->locked = 1; | ||
89 | |||
90 | not_allocated: | ||
91 | mutex_unlock(&msblk->meta_index_mutex); | ||
92 | |||
93 | return meta; | ||
94 | } | ||
95 | |||
96 | |||
97 | /* | ||
98 | * Find and initialise an empty cache slot for index offset. | ||
99 | */ | ||
100 | static struct meta_index *empty_meta_index(struct inode *inode, int offset, | ||
101 | int skip) | ||
102 | { | ||
103 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; | ||
104 | struct meta_index *meta = NULL; | ||
105 | int i; | ||
106 | |||
107 | mutex_lock(&msblk->meta_index_mutex); | ||
108 | |||
109 | TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip); | ||
110 | |||
111 | if (msblk->meta_index == NULL) { | ||
112 | /* | ||
113 | * First time the cache index has been used; allocate and | ||
114 | * initialise it. The cache index could be allocated at | ||
115 | * mount time but doing it here means it is allocated only | ||
116 | * if a 'large' file is read. | ||
117 | */ | ||
118 | msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS, | ||
119 | sizeof(*(msblk->meta_index)), GFP_KERNEL); | ||
120 | if (msblk->meta_index == NULL) { | ||
121 | ERROR("Failed to allocate meta_index\n"); | ||
122 | goto failed; | ||
123 | } | ||
124 | for (i = 0; i < SQUASHFS_META_SLOTS; i++) { | ||
125 | msblk->meta_index[i].inode_number = 0; | ||
126 | msblk->meta_index[i].locked = 0; | ||
127 | } | ||
128 | msblk->next_meta_index = 0; | ||
129 | } | ||
130 | |||
131 | for (i = SQUASHFS_META_SLOTS; i && | ||
132 | msblk->meta_index[msblk->next_meta_index].locked; i--) | ||
133 | msblk->next_meta_index = (msblk->next_meta_index + 1) % | ||
134 | SQUASHFS_META_SLOTS; | ||
135 | |||
136 | if (i == 0) { | ||
137 | TRACE("empty_meta_index: failed!\n"); | ||
138 | goto failed; | ||
139 | } | ||
140 | |||
141 | TRACE("empty_meta_index: returned meta entry %d, %p\n", | ||
142 | msblk->next_meta_index, | ||
143 | &msblk->meta_index[msblk->next_meta_index]); | ||
144 | |||
145 | meta = &msblk->meta_index[msblk->next_meta_index]; | ||
146 | msblk->next_meta_index = (msblk->next_meta_index + 1) % | ||
147 | SQUASHFS_META_SLOTS; | ||
148 | |||
149 | meta->inode_number = inode->i_ino; | ||
150 | meta->offset = offset; | ||
151 | meta->skip = skip; | ||
152 | meta->entries = 0; | ||
153 | meta->locked = 1; | ||
154 | |||
155 | failed: | ||
156 | mutex_unlock(&msblk->meta_index_mutex); | ||
157 | return meta; | ||
158 | } | ||
159 | |||
160 | |||
161 | static void release_meta_index(struct inode *inode, struct meta_index *meta) | ||
162 | { | ||
163 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; | ||
164 | mutex_lock(&msblk->meta_index_mutex); | ||
165 | meta->locked = 0; | ||
166 | mutex_unlock(&msblk->meta_index_mutex); | ||
167 | } | ||
168 | |||
169 | |||
170 | /* | ||
171 | * Read the next n blocks from the block list, starting from | ||
172 | * metadata block <start_block, offset>. | ||
173 | */ | ||
174 | static long long read_indexes(struct super_block *sb, int n, | ||
175 | u64 *start_block, int *offset) | ||
176 | { | ||
177 | int err, i; | ||
178 | long long block = 0; | ||
179 | __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); | ||
180 | |||
181 | if (blist == NULL) { | ||
182 | ERROR("read_indexes: Failed to allocate block_list\n"); | ||
183 | return -ENOMEM; | ||
184 | } | ||
185 | |||
186 | while (n) { | ||
187 | int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2); | ||
188 | |||
189 | err = squashfs_read_metadata(sb, blist, start_block, | ||
190 | offset, blocks << 2); | ||
191 | if (err < 0) { | ||
192 | ERROR("read_indexes: reading block [%llx:%x]\n", | ||
193 | *start_block, *offset); | ||
194 | goto failure; | ||
195 | } | ||
196 | |||
197 | for (i = 0; i < blocks; i++) { | ||
198 | int size = le32_to_cpu(blist[i]); | ||
199 | block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size); | ||
200 | } | ||
201 | n -= blocks; | ||
202 | } | ||
203 | |||
204 | kfree(blist); | ||
205 | return block; | ||
206 | |||
207 | failure: | ||
208 | kfree(blist); | ||
209 | return err; | ||
210 | } | ||
211 | |||
212 | |||
213 | /* | ||
214 | * Each cache index slot has SQUASHFS_META_ENTRIES, each of which | ||
215 | * can cache one index -> datablock/blocklist-block mapping. We wish | ||
216 | * to distribute these over the length of the file, entry[0] maps index x, | ||
217 | * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on. | ||
218 | * The larger the file, the greater the skip factor. The skip factor is | ||
219 | * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure | ||
220 | * the number of metadata blocks that need to be read fits into the cache. | ||
221 | * If the skip factor is limited in this way then the file will use multiple | ||
222 | * slots. | ||
223 | */ | ||
224 | static inline int calculate_skip(int blocks) | ||
225 | { | ||
226 | int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) | ||
227 | * SQUASHFS_META_INDEXES); | ||
228 | return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); | ||
229 | } | ||
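/*
 * Worked example (a sketch; the constant values are assumptions): with
 * SQUASHFS_META_ENTRIES = 127, SQUASHFS_META_INDEXES = 2048 and 128 KiB
 * blocks, a 1 GiB file has 8192 blocks, 8192 / (128 * 2048) = 0, so the
 * skip factor is 1 and each cached entry maps a 2048-block (256 MiB)
 * stride of the file.
 */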
230 | |||
231 | |||
232 | /* | ||
233 | * Search and grow the index cache for the specified inode, returning the | ||
234 | * on-disk locations of the datablock and block list metadata block | ||
235 | * <index_block, index_offset> for index (scaled to nearest cache index). | ||
236 | */ | ||
237 | static int fill_meta_index(struct inode *inode, int index, | ||
238 | u64 *index_block, int *index_offset, u64 *data_block) | ||
239 | { | ||
240 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; | ||
241 | int skip = calculate_skip(i_size_read(inode) >> msblk->block_log); | ||
242 | int offset = 0; | ||
243 | struct meta_index *meta; | ||
244 | struct meta_entry *meta_entry; | ||
245 | u64 cur_index_block = squashfs_i(inode)->block_list_start; | ||
246 | int cur_offset = squashfs_i(inode)->offset; | ||
247 | u64 cur_data_block = squashfs_i(inode)->start; | ||
248 | int err, i; | ||
249 | |||
250 | /* | ||
251 | * Scale index to cache index (cache slot entry) | ||
252 | */ | ||
253 | index /= SQUASHFS_META_INDEXES * skip; | ||
254 | |||
255 | while (offset < index) { | ||
256 | meta = locate_meta_index(inode, offset + 1, index); | ||
257 | |||
258 | if (meta == NULL) { | ||
259 | meta = empty_meta_index(inode, offset + 1, skip); | ||
260 | if (meta == NULL) | ||
261 | goto all_done; | ||
262 | } else { | ||
263 | offset = index < meta->offset + meta->entries ? index : | ||
264 | meta->offset + meta->entries - 1; | ||
265 | meta_entry = &meta->meta_entry[offset - meta->offset]; | ||
266 | cur_index_block = meta_entry->index_block + | ||
267 | msblk->inode_table; | ||
268 | cur_offset = meta_entry->offset; | ||
269 | cur_data_block = meta_entry->data_block; | ||
270 | TRACE("get_meta_index: offset %d, meta->offset %d, " | ||
271 | "meta->entries %d\n", offset, meta->offset, | ||
272 | meta->entries); | ||
273 | TRACE("get_meta_index: index_block 0x%llx, offset 0x%x" | ||
274 | " data_block 0x%llx\n", cur_index_block, | ||
275 | cur_offset, cur_data_block); | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * If necessary grow cache slot by reading block list. Cache | ||
280 | * slot is extended up to index or to the end of the slot, in | ||
281 | * which case further slots will be used. | ||
282 | */ | ||
283 | for (i = meta->offset + meta->entries; i <= index && | ||
284 | i < meta->offset + SQUASHFS_META_ENTRIES; i++) { | ||
285 | int blocks = skip * SQUASHFS_META_INDEXES; | ||
286 | long long res = read_indexes(inode->i_sb, blocks, | ||
287 | &cur_index_block, &cur_offset); | ||
288 | |||
289 | if (res < 0) { | ||
290 | if (meta->entries == 0) | ||
291 | /* | ||
292 | * Don't leave an empty slot on read | ||
293 | * error allocated to this inode... | ||
294 | */ | ||
295 | meta->inode_number = 0; | ||
296 | err = res; | ||
297 | goto failed; | ||
298 | } | ||
299 | |||
300 | cur_data_block += res; | ||
301 | meta_entry = &meta->meta_entry[i - meta->offset]; | ||
302 | meta_entry->index_block = cur_index_block - | ||
303 | msblk->inode_table; | ||
304 | meta_entry->offset = cur_offset; | ||
305 | meta_entry->data_block = cur_data_block; | ||
306 | meta->entries++; | ||
307 | offset++; | ||
308 | } | ||
309 | |||
310 | TRACE("get_meta_index: meta->offset %d, meta->entries %d\n", | ||
311 | meta->offset, meta->entries); | ||
312 | |||
313 | release_meta_index(inode, meta); | ||
314 | } | ||
315 | |||
316 | all_done: | ||
317 | *index_block = cur_index_block; | ||
318 | *index_offset = cur_offset; | ||
319 | *data_block = cur_data_block; | ||
320 | |||
321 | /* | ||
322 | * Scale cache index (cache slot entry) to index | ||
323 | */ | ||
324 | return offset * SQUASHFS_META_INDEXES * skip; | ||
325 | |||
326 | failed: | ||
327 | release_meta_index(inode, meta); | ||
328 | return err; | ||
329 | } | ||
330 | |||
331 | |||
332 | /* | ||
333 | * Get the on-disk location and compressed size of the datablock | ||
334 | * specified by index. Fill_meta_index() does most of the work. | ||
335 | */ | ||
336 | static int read_blocklist(struct inode *inode, int index, u64 *block) | ||
337 | { | ||
338 | u64 start; | ||
339 | long long blks; | ||
340 | int offset; | ||
341 | __le32 size; | ||
342 | int res = fill_meta_index(inode, index, &start, &offset, block); | ||
343 | |||
344 | TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" | ||
345 | " 0x%x, block 0x%llx\n", res, index, start, offset, | ||
346 | *block); | ||
347 | |||
348 | if (res < 0) | ||
349 | return res; | ||
350 | |||
351 | /* | ||
352 | * res contains the index of the mapping returned by fill_meta_index(); | ||
353 | * this will likely be less than the desired index (because the | ||
354 | * meta_index cache works at a higher granularity). Read any | ||
355 | * extra block indexes needed. | ||
356 | */ | ||
357 | if (res < index) { | ||
358 | blks = read_indexes(inode->i_sb, index - res, &start, &offset); | ||
359 | if (blks < 0) | ||
360 | return (int) blks; | ||
361 | *block += blks; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Read length of block specified by index. | ||
366 | */ | ||
367 | res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, | ||
368 | sizeof(size)); | ||
369 | if (res < 0) | ||
370 | return res; | ||
371 | return le32_to_cpu(size); | ||
372 | } | ||
373 | |||
374 | |||
375 | static int squashfs_readpage(struct file *file, struct page *page) | ||
376 | { | ||
377 | struct inode *inode = page->mapping->host; | ||
378 | struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; | ||
379 | int bytes, i, offset = 0, sparse = 0; | ||
380 | struct squashfs_cache_entry *buffer = NULL; | ||
381 | void *pageaddr; | ||
382 | |||
383 | int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; | ||
384 | int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); | ||
385 | int start_index = page->index & ~mask; | ||
386 | int end_index = start_index | mask; | ||
387 | int file_end = i_size_read(inode) >> msblk->block_log; | ||
388 | |||
389 | TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", | ||
390 | page->index, squashfs_i(inode)->start); | ||
391 | |||
392 | if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> | ||
393 | PAGE_CACHE_SHIFT)) | ||
394 | goto out; | ||
395 | |||
396 | if (index < file_end || squashfs_i(inode)->fragment_block == | ||
397 | SQUASHFS_INVALID_BLK) { | ||
398 | /* | ||
399 | * Reading a datablock from disk. Need to read block list | ||
400 | * to get location and block size. | ||
401 | */ | ||
402 | u64 block = 0; | ||
403 | int bsize = read_blocklist(inode, index, &block); | ||
404 | if (bsize < 0) | ||
405 | goto error_out; | ||
406 | |||
407 | if (bsize == 0) { /* hole */ | ||
408 | bytes = index == file_end ? | ||
409 | (i_size_read(inode) & (msblk->block_size - 1)) : | ||
410 | msblk->block_size; | ||
411 | sparse = 1; | ||
412 | } else { | ||
413 | /* | ||
414 | * Read and decompress datablock. | ||
415 | */ | ||
416 | buffer = squashfs_get_datablock(inode->i_sb, | ||
417 | block, bsize); | ||
418 | if (buffer->error) { | ||
419 | ERROR("Unable to read page, block %llx, size %x" | ||
420 | "\n", block, bsize); | ||
421 | squashfs_cache_put(buffer); | ||
422 | goto error_out; | ||
423 | } | ||
424 | bytes = buffer->length; | ||
425 | } | ||
426 | } else { | ||
427 | /* | ||
428 | * Datablock is stored inside a fragment (tail-end packed | ||
429 | * block). | ||
430 | */ | ||
431 | buffer = squashfs_get_fragment(inode->i_sb, | ||
432 | squashfs_i(inode)->fragment_block, | ||
433 | squashfs_i(inode)->fragment_size); | ||
434 | |||
435 | if (buffer->error) { | ||
436 | ERROR("Unable to read page, block %llx, size %x\n", | ||
437 | squashfs_i(inode)->fragment_block, | ||
438 | squashfs_i(inode)->fragment_size); | ||
439 | squashfs_cache_put(buffer); | ||
440 | goto error_out; | ||
441 | } | ||
442 | bytes = i_size_read(inode) & (msblk->block_size - 1); | ||
443 | offset = squashfs_i(inode)->fragment_offset; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Loop copying datablock into pages. As the datablock likely covers | ||
448 | * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly | ||
449 | * grab the pages from the page cache, except for the page that we've | ||
450 | * been called to fill. | ||
451 | */ | ||
452 | for (i = start_index; i <= end_index && bytes > 0; i++, | ||
453 | bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { | ||
454 | struct page *push_page; | ||
455 | int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); | ||
456 | |||
457 | TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); | ||
458 | |||
459 | push_page = (i == page->index) ? page : | ||
460 | grab_cache_page_nowait(page->mapping, i); | ||
461 | |||
462 | if (!push_page) | ||
463 | continue; | ||
464 | |||
465 | if (PageUptodate(push_page)) | ||
466 | goto skip_page; | ||
467 | |||
468 | pageaddr = kmap_atomic(push_page, KM_USER0); | ||
469 | squashfs_copy_data(pageaddr, buffer, offset, avail); | ||
470 | memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); | ||
471 | kunmap_atomic(pageaddr, KM_USER0); | ||
472 | flush_dcache_page(push_page); | ||
473 | SetPageUptodate(push_page); | ||
474 | skip_page: | ||
475 | unlock_page(push_page); | ||
476 | if (i != page->index) | ||
477 | page_cache_release(push_page); | ||
478 | } | ||
479 | |||
480 | if (!sparse) | ||
481 | squashfs_cache_put(buffer); | ||
482 | |||
483 | return 0; | ||
484 | |||
485 | error_out: | ||
486 | SetPageError(page); | ||
487 | out: | ||
488 | pageaddr = kmap_atomic(page, KM_USER0); | ||
489 | memset(pageaddr, 0, PAGE_CACHE_SIZE); | ||
490 | kunmap_atomic(pageaddr, KM_USER0); | ||
491 | flush_dcache_page(page); | ||
492 | if (!PageError(page)) | ||
493 | SetPageUptodate(page); | ||
494 | unlock_page(page); | ||
495 | |||
496 | return 0; | ||
497 | } | ||
498 | |||
499 | |||
500 | const struct address_space_operations squashfs_aops = { | ||
501 | .readpage = squashfs_readpage | ||
502 | }; | ||
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c new file mode 100644 index 000000000000..b5a2c15bbbc7 --- /dev/null +++ b/fs/squashfs/fragment.c | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * fragment.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to handle compressed fragments (tail-end packed | ||
26 | * datablocks). | ||
27 | * | ||
28 | * Regular files contain a fragment index which is mapped to a fragment | ||
29 | * location on disk (and its compressed size) using a fragment lookup table. | ||
30 | * Like everything in Squashfs this fragment lookup table is itself stored | ||
31 | * compressed into metadata blocks. A second index table is used to locate | ||
32 | * these. For speed of access (and because it is small) this second index | ||
33 | * table is read at mount time and cached in memory. | ||
34 | */ | ||
35 | |||
36 | #include <linux/fs.h> | ||
37 | #include <linux/vfs.h> | ||
38 | #include <linux/slab.h> | ||
39 | #include <linux/zlib.h> | ||
40 | |||
41 | #include "squashfs_fs.h" | ||
42 | #include "squashfs_fs_sb.h" | ||
43 | #include "squashfs_fs_i.h" | ||
44 | #include "squashfs.h" | ||
45 | |||
46 | /* | ||
47 | * Look-up fragment using the fragment index table. Return the on disk | ||
48 | * location of the fragment and its compressed size | ||
49 | */ | ||
50 | int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, | ||
51 | u64 *fragment_block) | ||
52 | { | ||
53 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
54 | int block = SQUASHFS_FRAGMENT_INDEX(fragment); | ||
55 | int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); | ||
56 | u64 start_block = le64_to_cpu(msblk->fragment_index[block]); | ||
57 | struct squashfs_fragment_entry fragment_entry; | ||
58 | int size; | ||
59 | |||
60 | size = squashfs_read_metadata(sb, &fragment_entry, &start_block, | ||
61 | &offset, sizeof(fragment_entry)); | ||
62 | if (size < 0) | ||
63 | return size; | ||
64 | |||
65 | *fragment_block = le64_to_cpu(fragment_entry.start_block); | ||
66 | size = le32_to_cpu(fragment_entry.size); | ||
67 | |||
68 | return size; | ||
69 | } | ||
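/*
 * Sketch of the arithmetic the SQUASHFS_FRAGMENT_INDEX*() macros are
 * assumed to perform (not part of the patch): fragment numbers index an
 * array of fragment entries packed into metadata-sized blocks.
 */
static void sqfs_fragment_position(unsigned int fragment, int *blk, int *offset)
{
	int bytes = fragment * sizeof(struct squashfs_fragment_entry);

	*blk = bytes / SQUASHFS_METADATA_SIZE;
	*offset = bytes % SQUASHFS_METADATA_SIZE;
}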
70 | |||
71 | |||
72 | /* | ||
73 | * Read the uncompressed fragment lookup table indexes off disk into memory | ||
74 | */ | ||
75 | __le64 *squashfs_read_fragment_index_table(struct super_block *sb, | ||
76 | u64 fragment_table_start, unsigned int fragments) | ||
77 | { | ||
78 | unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); | ||
79 | __le64 *fragment_index; | ||
80 | int err; | ||
81 | |||
82 | /* Allocate fragment lookup table indexes */ | ||
83 | fragment_index = kmalloc(length, GFP_KERNEL); | ||
84 | if (fragment_index == NULL) { | ||
85 | ERROR("Failed to allocate fragment index table\n"); | ||
86 | return ERR_PTR(-ENOMEM); | ||
87 | } | ||
88 | |||
89 | err = squashfs_read_table(sb, fragment_index, fragment_table_start, | ||
90 | length); | ||
91 | if (err < 0) { | ||
92 | ERROR("unable to read fragment index table\n"); | ||
93 | kfree(fragment_index); | ||
94 | return ERR_PTR(err); | ||
95 | } | ||
96 | |||
97 | return fragment_index; | ||
98 | } | ||
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c new file mode 100644 index 000000000000..3795b837ba28 --- /dev/null +++ b/fs/squashfs/id.c | |||
@@ -0,0 +1,94 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * id.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to handle uids and gids. | ||
26 | * | ||
27 | * For space efficiency regular files store uid and gid indexes, which are | ||
28 | * converted to 32-bit uids/gids using an id lookup table. This table is | ||
29 | * stored compressed into metadata blocks. A second index table is used to | ||
30 | * locate these. For speed of access (and because it is small), this second | ||
31 | * index table is read at mount time and cached in memory. | ||
32 | */ | ||
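
As a worked example of the same two-level arithmetic (using the 8K SQUASHFS_METADATA_SIZE and the 4-byte on-disk ids from squashfs_fs.h): id index 3000 lives at byte 12000 of the id table, so SQUASHFS_ID_BLOCK gives block 1 (12000 / 8192) and SQUASHFS_ID_BLOCK_OFFSET gives offset 3808 (12000 % 8192); id_table[1], cached at mount time, then supplies the on-disk start of the metadata block from which the __le32 id is read.
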
33 | |||
34 | #include <linux/fs.h> | ||
35 | #include <linux/vfs.h> | ||
36 | #include <linux/slab.h> | ||
37 | #include <linux/zlib.h> | ||
38 | |||
39 | #include "squashfs_fs.h" | ||
40 | #include "squashfs_fs_sb.h" | ||
41 | #include "squashfs_fs_i.h" | ||
42 | #include "squashfs.h" | ||
43 | |||
44 | /* | ||
45 | * Map a uid/gid index into a real 32-bit uid/gid using the id lookup table | ||
46 | */ | ||
47 | int squashfs_get_id(struct super_block *sb, unsigned int index, | ||
48 | unsigned int *id) | ||
49 | { | ||
50 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
51 | int block = SQUASHFS_ID_BLOCK(index); | ||
52 | int offset = SQUASHFS_ID_BLOCK_OFFSET(index); | ||
53 | u64 start_block = le64_to_cpu(msblk->id_table[block]); | ||
54 | __le32 disk_id; | ||
55 | int err; | ||
56 | |||
57 | err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, | ||
58 | sizeof(disk_id)); | ||
59 | if (err < 0) | ||
60 | return err; | ||
61 | |||
62 | *id = le32_to_cpu(disk_id); | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | |||
67 | /* | ||
68 | * Read uncompressed id lookup table indexes from disk into memory | ||
69 | */ | ||
70 | __le64 *squashfs_read_id_index_table(struct super_block *sb, | ||
71 | u64 id_table_start, unsigned short no_ids) | ||
72 | { | ||
73 | unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); | ||
74 | __le64 *id_table; | ||
75 | int err; | ||
76 | |||
77 | TRACE("In read_id_index_table, length %d\n", length); | ||
78 | |||
79 | /* Allocate id lookup table indexes */ | ||
80 | id_table = kmalloc(length, GFP_KERNEL); | ||
81 | if (id_table == NULL) { | ||
82 | ERROR("Failed to allocate id index table\n"); | ||
83 | return ERR_PTR(-ENOMEM); | ||
84 | } | ||
85 | |||
86 | err = squashfs_read_table(sb, id_table, id_table_start, length); | ||
87 | if (err < 0) { | ||
88 | ERROR("unable to read id index table\n"); | ||
89 | kfree(id_table); | ||
90 | return ERR_PTR(err); | ||
91 | } | ||
92 | |||
93 | return id_table; | ||
94 | } | ||
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c new file mode 100644 index 000000000000..7a63398bb855 --- /dev/null +++ b/fs/squashfs/inode.c | |||
@@ -0,0 +1,346 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * inode.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to create and read inodes from disk. | ||
26 | * | ||
27 | * Inodes in Squashfs are identified by a 48-bit inode number which encodes the | ||
28 | * location of the compressed metadata block containing the inode, and the byte | ||
29 | * offset into that block where the inode is placed (<block, offset>). | ||
30 | * | ||
31 | * To maximise compression there are different inodes for each file type | ||
32 | * (regular file, directory, device, etc.), the inode contents and length | ||
33 | * varying with the type. | ||
34 | * | ||
35 | * To further maximise compression, two types of regular file inode and | ||
36 | * directory inode are defined: inodes optimised for frequently occurring | ||
37 | * regular files and directories, and extended types where extra | ||
38 | * information has to be stored. | ||
39 | */ | ||
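
A small stand-alone sketch of this <block, offset> packing (mirroring SQUASHFS_MKINODE, SQUASHFS_INODE_BLK and SQUASHFS_INODE_OFFSET from squashfs_fs.h; the values are arbitrary):

#include <stdio.h>

int main(void)
{
	/* Pack block 0x12345 and offset 0x678 into one inode reference,
	 * then unpack it again (offsets fit in 16 bits because metadata
	 * blocks are at most 8K uncompressed). */
	long long ino = ((long long) 0x12345 << 16) + 0x678;
	unsigned int blk = (unsigned int) (ino >> 16);
	unsigned int off = (unsigned int) (ino & 0xffff);

	printf("ino %llx -> block %x, offset %x\n", ino, blk, off);
	return 0;
}
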
40 | |||
41 | #include <linux/fs.h> | ||
42 | #include <linux/vfs.h> | ||
43 | #include <linux/zlib.h> | ||
44 | |||
45 | #include "squashfs_fs.h" | ||
46 | #include "squashfs_fs_sb.h" | ||
47 | #include "squashfs_fs_i.h" | ||
48 | #include "squashfs.h" | ||
49 | |||
50 | /* | ||
51 | * Initialise VFS inode with the base inode information common to all | ||
52 | * Squashfs inode types. Sqsh_ino contains the unswapped base inode | ||
53 | * off disk. | ||
54 | */ | ||
55 | static int squashfs_new_inode(struct super_block *sb, struct inode *inode, | ||
56 | struct squashfs_base_inode *sqsh_ino) | ||
57 | { | ||
58 | int err; | ||
59 | |||
60 | err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid); | ||
61 | if (err) | ||
62 | return err; | ||
63 | |||
64 | err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid); | ||
65 | if (err) | ||
66 | return err; | ||
67 | |||
68 | inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); | ||
69 | inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); | ||
70 | inode->i_atime.tv_sec = inode->i_mtime.tv_sec; | ||
71 | inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; | ||
72 | inode->i_mode = le16_to_cpu(sqsh_ino->mode); | ||
73 | inode->i_size = 0; | ||
74 | |||
75 | return err; | ||
76 | } | ||
77 | |||
78 | |||
79 | struct inode *squashfs_iget(struct super_block *sb, long long ino, | ||
80 | unsigned int ino_number) | ||
81 | { | ||
82 | struct inode *inode = iget_locked(sb, ino_number); | ||
83 | int err; | ||
84 | |||
85 | TRACE("Entered squashfs_iget\n"); | ||
86 | |||
87 | if (!inode) | ||
88 | return ERR_PTR(-ENOMEM); | ||
89 | if (!(inode->i_state & I_NEW)) | ||
90 | return inode; | ||
91 | |||
92 | err = squashfs_read_inode(inode, ino); | ||
93 | if (err) { | ||
94 | iget_failed(inode); | ||
95 | return ERR_PTR(err); | ||
96 | } | ||
97 | |||
98 | unlock_new_inode(inode); | ||
99 | return inode; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | * Initialise VFS inode by reading inode from inode table (compressed | ||
105 | * metadata). The format and amount of data read depends on type. | ||
106 | */ | ||
107 | int squashfs_read_inode(struct inode *inode, long long ino) | ||
108 | { | ||
109 | struct super_block *sb = inode->i_sb; | ||
110 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
111 | u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; | ||
112 | int err, type, offset = SQUASHFS_INODE_OFFSET(ino); | ||
113 | union squashfs_inode squashfs_ino; | ||
114 | struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; | ||
115 | |||
116 | TRACE("Entered squashfs_read_inode\n"); | ||
117 | |||
118 | /* | ||
119 | * Read inode base common to all inode types. | ||
120 | */ | ||
121 | err = squashfs_read_metadata(sb, sqshb_ino, &block, | ||
122 | &offset, sizeof(*sqshb_ino)); | ||
123 | if (err < 0) | ||
124 | goto failed_read; | ||
125 | |||
126 | err = squashfs_new_inode(sb, inode, sqshb_ino); | ||
127 | if (err) | ||
128 | goto failed_read; | ||
129 | |||
130 | block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; | ||
131 | offset = SQUASHFS_INODE_OFFSET(ino); | ||
132 | |||
133 | type = le16_to_cpu(sqshb_ino->inode_type); | ||
134 | switch (type) { | ||
135 | case SQUASHFS_REG_TYPE: { | ||
136 | unsigned int frag_offset, frag; int frag_size; /* signed: may hold errors */ | ||
137 | u64 frag_blk; | ||
138 | struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; | ||
139 | |||
140 | err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, | ||
141 | sizeof(*sqsh_ino)); | ||
142 | if (err < 0) | ||
143 | goto failed_read; | ||
144 | |||
145 | frag = le32_to_cpu(sqsh_ino->fragment); | ||
146 | if (frag != SQUASHFS_INVALID_FRAG) { | ||
147 | frag_offset = le32_to_cpu(sqsh_ino->offset); | ||
148 | frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); | ||
149 | if (frag_size < 0) { | ||
150 | err = frag_size; | ||
151 | goto failed_read; | ||
152 | } | ||
153 | } else { | ||
154 | frag_blk = SQUASHFS_INVALID_BLK; | ||
155 | frag_size = 0; | ||
156 | frag_offset = 0; | ||
157 | } | ||
158 | |||
159 | inode->i_nlink = 1; | ||
160 | inode->i_size = le32_to_cpu(sqsh_ino->file_size); | ||
161 | inode->i_fop = &generic_ro_fops; | ||
162 | inode->i_mode |= S_IFREG; | ||
163 | inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; | ||
164 | squashfs_i(inode)->fragment_block = frag_blk; | ||
165 | squashfs_i(inode)->fragment_size = frag_size; | ||
166 | squashfs_i(inode)->fragment_offset = frag_offset; | ||
167 | squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); | ||
168 | squashfs_i(inode)->block_list_start = block; | ||
169 | squashfs_i(inode)->offset = offset; | ||
170 | inode->i_data.a_ops = &squashfs_aops; | ||
171 | |||
172 | TRACE("File inode %x:%x, start_block %llx, block_list_start " | ||
173 | "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), | ||
174 | offset, squashfs_i(inode)->start, block, offset); | ||
175 | break; | ||
176 | } | ||
177 | case SQUASHFS_LREG_TYPE: { | ||
178 | unsigned int frag_offset, frag; int frag_size; /* signed: may hold errors */ | ||
179 | u64 frag_blk; | ||
180 | struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; | ||
181 | |||
182 | err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, | ||
183 | sizeof(*sqsh_ino)); | ||
184 | if (err < 0) | ||
185 | goto failed_read; | ||
186 | |||
187 | frag = le32_to_cpu(sqsh_ino->fragment); | ||
188 | if (frag != SQUASHFS_INVALID_FRAG) { | ||
189 | frag_offset = le32_to_cpu(sqsh_ino->offset); | ||
190 | frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); | ||
191 | if (frag_size < 0) { | ||
192 | err = frag_size; | ||
193 | goto failed_read; | ||
194 | } | ||
195 | } else { | ||
196 | frag_blk = SQUASHFS_INVALID_BLK; | ||
197 | frag_size = 0; | ||
198 | frag_offset = 0; | ||
199 | } | ||
200 | |||
201 | inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); | ||
202 | inode->i_size = le64_to_cpu(sqsh_ino->file_size); | ||
203 | inode->i_fop = &generic_ro_fops; | ||
204 | inode->i_mode |= S_IFREG; | ||
205 | inode->i_blocks = ((inode->i_size - | ||
206 | le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; | ||
207 | |||
208 | squashfs_i(inode)->fragment_block = frag_blk; | ||
209 | squashfs_i(inode)->fragment_size = frag_size; | ||
210 | squashfs_i(inode)->fragment_offset = frag_offset; | ||
211 | squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); | ||
212 | squashfs_i(inode)->block_list_start = block; | ||
213 | squashfs_i(inode)->offset = offset; | ||
214 | inode->i_data.a_ops = &squashfs_aops; | ||
215 | |||
216 | TRACE("File inode %x:%x, start_block %llx, block_list_start " | ||
217 | "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), | ||
218 | offset, squashfs_i(inode)->start, block, offset); | ||
219 | break; | ||
220 | } | ||
221 | case SQUASHFS_DIR_TYPE: { | ||
222 | struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; | ||
223 | |||
224 | err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, | ||
225 | sizeof(*sqsh_ino)); | ||
226 | if (err < 0) | ||
227 | goto failed_read; | ||
228 | |||
229 | inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); | ||
230 | inode->i_size = le16_to_cpu(sqsh_ino->file_size); | ||
231 | inode->i_op = &squashfs_dir_inode_ops; | ||
232 | inode->i_fop = &squashfs_dir_ops; | ||
233 | inode->i_mode |= S_IFDIR; | ||
234 | squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); | ||
235 | squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); | ||
236 | squashfs_i(inode)->dir_idx_cnt = 0; | ||
237 | squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); | ||
238 | |||
239 | TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", | ||
240 | SQUASHFS_INODE_BLK(ino), offset, | ||
241 | squashfs_i(inode)->start, | ||
242 | le16_to_cpu(sqsh_ino->offset)); | ||
243 | break; | ||
244 | } | ||
245 | case SQUASHFS_LDIR_TYPE: { | ||
246 | struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; | ||
247 | |||
248 | err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, | ||
249 | sizeof(*sqsh_ino)); | ||
250 | if (err < 0) | ||
251 | goto failed_read; | ||
252 | |||
253 | inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); | ||
254 | inode->i_size = le32_to_cpu(sqsh_ino->file_size); | ||
255 | inode->i_op = &squashfs_dir_inode_ops; | ||
256 | inode->i_fop = &squashfs_dir_ops; | ||
257 | inode->i_mode |= S_IFDIR; | ||
258 | squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); | ||
259 | squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); | ||
260 | squashfs_i(inode)->dir_idx_start = block; | ||
261 | squashfs_i(inode)->dir_idx_offset = offset; | ||
262 | squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); | ||
263 | squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); | ||
264 | |||
265 | TRACE("Long directory inode %x:%x, start_block %llx, offset " | ||
266 | "%x\n", SQUASHFS_INODE_BLK(ino), offset, | ||
267 | squashfs_i(inode)->start, | ||
268 | le16_to_cpu(sqsh_ino->offset)); | ||
269 | break; | ||
270 | } | ||
271 | case SQUASHFS_SYMLINK_TYPE: | ||
272 | case SQUASHFS_LSYMLINK_TYPE: { | ||
273 | struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; | ||
274 | |||
275 | err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, | ||
276 | sizeof(*sqsh_ino)); | ||
277 | if (err < 0) | ||
278 | goto failed_read; | ||
279 | |||
280 | inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); | ||
281 | inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); | ||
282 | inode->i_op = &page_symlink_inode_operations; | ||
283 | inode->i_data.a_ops = &squashfs_symlink_aops; | ||
284 | inode->i_mode |= S_IFLNK; | ||
285 | squashfs_i(inode)->start = block; | ||
286 | squashfs_i(inode)->offset = offset; | ||
287 | |||
288 | TRACE("Symbolic link inode %x:%x, start_block %llx, offset " | ||
289 | "%x\n", SQUASHFS_INODE_BLK(ino), offset, | ||
290 | block, offset); | ||
291 | break; | ||
292 | } | ||
293 | case SQUASHFS_BLKDEV_TYPE: | ||
294 | case SQUASHFS_CHRDEV_TYPE: | ||
295 | case SQUASHFS_LBLKDEV_TYPE: | ||
296 | case SQUASHFS_LCHRDEV_TYPE: { | ||
297 | struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; | ||
298 | unsigned int rdev; | ||
299 | |||
300 | err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, | ||
301 | sizeof(*sqsh_ino)); | ||
302 | if (err < 0) | ||
303 | goto failed_read; | ||
304 | |||
305 | if (type == SQUASHFS_CHRDEV_TYPE) | ||
306 | inode->i_mode |= S_IFCHR; | ||
307 | else | ||
308 | inode->i_mode |= S_IFBLK; | ||
309 | inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); | ||
310 | rdev = le32_to_cpu(sqsh_ino->rdev); | ||
311 | init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); | ||
312 | |||
313 | TRACE("Device inode %x:%x, rdev %x\n", | ||
314 | SQUASHFS_INODE_BLK(ino), offset, rdev); | ||
315 | break; | ||
316 | } | ||
317 | case SQUASHFS_FIFO_TYPE: | ||
318 | case SQUASHFS_SOCKET_TYPE: | ||
319 | case SQUASHFS_LFIFO_TYPE: | ||
320 | case SQUASHFS_LSOCKET_TYPE: { | ||
321 | struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; | ||
322 | |||
323 | err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, | ||
324 | sizeof(*sqsh_ino)); | ||
325 | if (err < 0) | ||
326 | goto failed_read; | ||
327 | |||
328 | if (type == SQUASHFS_FIFO_TYPE) | ||
329 | inode->i_mode |= S_IFIFO; | ||
330 | else | ||
331 | inode->i_mode |= S_IFSOCK; | ||
332 | inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); | ||
333 | init_special_inode(inode, inode->i_mode, 0); | ||
334 | break; | ||
335 | } | ||
336 | default: | ||
337 | ERROR("Unknown inode type %d in squashfs_iget!\n", type); | ||
338 | return -EINVAL; | ||
339 | } | ||
340 | |||
341 | return 0; | ||
342 | |||
343 | failed_read: | ||
344 | ERROR("Unable to read inode 0x%llx\n", ino); | ||
345 | return err; | ||
346 | } | ||
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c new file mode 100644 index 000000000000..9e398653b22b --- /dev/null +++ b/fs/squashfs/namei.c | |||
@@ -0,0 +1,242 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * namei.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to do filename lookup in directories. | ||
26 | * | ||
27 | * Like inodes, directories are packed into compressed metadata blocks, stored | ||
28 | * in a directory table. Directories are accessed using the start address of | ||
29 | * the metablock containing the directory and the offset into the | ||
30 | * decompressed block (<block, offset>). | ||
31 | * | ||
32 | * Directories are organised in a slightly complex way, and are not simply | ||
33 | * a list of file names. The organisation takes advantage of the | ||
34 | * fact that (in most cases) the inodes of the files will be in the same | ||
35 | * compressed metadata block, and can therefore share the start block. | ||
36 | * Directories are therefore organised as a two-level list: a directory | ||
37 | * header containing the shared start block value, followed by a sequence | ||
38 | * of directory entries, each of which shares that start block. A new | ||
39 | * directory header is written whenever the inode start block changes. The | ||
40 | * directory header/directory entry list is repeated as many times as necessary. | ||
41 | * | ||
42 | * Directories are sorted, and can contain a directory index to speed up | ||
43 | * file lookup. Directory indexes store one entry per metablock, each entry | ||
44 | * storing the index/filename mapping to the first directory header | ||
45 | * in each metadata block. Directories are sorted in alphabetical order, | ||
46 | * and at lookup the index is scanned linearly looking for the first filename | ||
47 | * alphabetically larger than the filename being looked up. At this point the | ||
48 | * location of the metadata block the filename is in has been found. | ||
49 | * The general idea of the index is to ensure only one metadata block needs | ||
50 | * to be decompressed to do a lookup, irrespective of the length of the directory. | ||
51 | * This scheme has the advantage that it doesn't require extra memory overhead | ||
52 | * and doesn't require much extra storage on disk. | ||
53 | */ | ||
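
As a sketch of the index scan described above (this is not the kernel routine below, which streams each entry off disk with squashfs_read_metadata; here the index is an in-memory array for clarity):

#include <string.h>

struct idx {
	const char *name;	/* first name in the metadata block */
	int start_block;	/* metadata block the entry points at */
};

/* Walk the sorted index and remember the last entry whose name is not
 * alphabetically larger than the target; that entry identifies the one
 * metadata block that needs to be decompressed for the lookup. */
int scan_dir_index(int first_block, const struct idx *index,
		   int count, const char *name)
{
	int i, block = first_block;

	for (i = 0; i < count; i++) {
		if (strcmp(index[i].name, name) > 0)
			break;
		block = index[i].start_block;
	}
	return block;
}
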
54 | |||
55 | #include <linux/fs.h> | ||
56 | #include <linux/vfs.h> | ||
57 | #include <linux/slab.h> | ||
58 | #include <linux/string.h> | ||
59 | #include <linux/dcache.h> | ||
60 | #include <linux/zlib.h> | ||
61 | |||
62 | #include "squashfs_fs.h" | ||
63 | #include "squashfs_fs_sb.h" | ||
64 | #include "squashfs_fs_i.h" | ||
65 | #include "squashfs.h" | ||
66 | |||
67 | /* | ||
68 | * Lookup name in the directory index, returning the location of the metadata | ||
69 | * block containing it, and the directory index this represents. | ||
70 | * | ||
71 | * If we get an error reading the index then return the part of the index | ||
72 | * (if any) we have managed to read - the index isn't essential, just | ||
73 | * quicker. | ||
74 | */ | ||
75 | static int get_dir_index_using_name(struct super_block *sb, | ||
76 | u64 *next_block, int *next_offset, u64 index_start, | ||
77 | int index_offset, int i_count, const char *name, | ||
78 | int len) | ||
79 | { | ||
80 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
81 | int i, size, length = 0, err; | ||
82 | struct squashfs_dir_index *index; | ||
83 | char *str; | ||
84 | |||
85 | TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); | ||
86 | |||
87 | index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); | ||
88 | if (index == NULL) { | ||
89 | ERROR("Failed to allocate squashfs_dir_index\n"); | ||
90 | goto out; | ||
91 | } | ||
92 | |||
93 | str = &index->name[SQUASHFS_NAME_LEN + 1]; | ||
94 | strncpy(str, name, len); | ||
95 | str[len] = '\0'; | ||
96 | |||
97 | for (i = 0; i < i_count; i++) { | ||
98 | err = squashfs_read_metadata(sb, index, &index_start, | ||
99 | &index_offset, sizeof(*index)); | ||
100 | if (err < 0) | ||
101 | break; | ||
102 | |||
103 | |||
104 | size = le32_to_cpu(index->size) + 1; | ||
105 | |||
106 | err = squashfs_read_metadata(sb, index->name, &index_start, | ||
107 | &index_offset, size); | ||
108 | if (err < 0) | ||
109 | break; | ||
110 | |||
111 | index->name[size] = '\0'; | ||
112 | |||
113 | if (strcmp(index->name, str) > 0) | ||
114 | break; | ||
115 | |||
116 | length = le32_to_cpu(index->index); | ||
117 | *next_block = le32_to_cpu(index->start_block) + | ||
118 | msblk->directory_table; | ||
119 | } | ||
120 | |||
121 | *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; | ||
122 | kfree(index); | ||
123 | |||
124 | out: | ||
125 | /* | ||
126 | * Return index (f_pos) of the looked up metadata block. Translate | ||
127 | * from internal f_pos to external f_pos which is offset by 3 because | ||
128 | * we invent "." and ".." entries which are not actually stored in the | ||
129 | * directory. | ||
130 | */ | ||
131 | return length + 3; | ||
132 | } | ||
133 | |||
134 | |||
135 | static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, | ||
136 | struct nameidata *nd) | ||
137 | { | ||
138 | const unsigned char *name = dentry->d_name.name; | ||
139 | int len = dentry->d_name.len; | ||
140 | struct inode *inode = NULL; | ||
141 | struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info; | ||
142 | struct squashfs_dir_header dirh; | ||
143 | struct squashfs_dir_entry *dire; | ||
144 | u64 block = squashfs_i(dir)->start + msblk->directory_table; | ||
145 | int offset = squashfs_i(dir)->offset; | ||
146 | int err, length = 0, dir_count, size; | ||
147 | |||
148 | TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); | ||
149 | |||
150 | dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); | ||
151 | if (dire == NULL) { | ||
152 | ERROR("Failed to allocate squashfs_dir_entry\n"); | ||
153 | return ERR_PTR(-ENOMEM); | ||
154 | } | ||
155 | |||
156 | if (len > SQUASHFS_NAME_LEN) { | ||
157 | err = -ENAMETOOLONG; | ||
158 | goto failed; | ||
159 | } | ||
160 | |||
161 | length = get_dir_index_using_name(dir->i_sb, &block, &offset, | ||
162 | squashfs_i(dir)->dir_idx_start, | ||
163 | squashfs_i(dir)->dir_idx_offset, | ||
164 | squashfs_i(dir)->dir_idx_cnt, name, len); | ||
165 | |||
166 | while (length < i_size_read(dir)) { | ||
167 | /* | ||
168 | * Read directory header. | ||
169 | */ | ||
170 | err = squashfs_read_metadata(dir->i_sb, &dirh, &block, | ||
171 | &offset, sizeof(dirh)); | ||
172 | if (err < 0) | ||
173 | goto read_failure; | ||
174 | |||
175 | length += sizeof(dirh); | ||
176 | |||
177 | dir_count = le32_to_cpu(dirh.count) + 1; | ||
178 | while (dir_count--) { | ||
179 | /* | ||
180 | * Read directory entry. | ||
181 | */ | ||
182 | err = squashfs_read_metadata(dir->i_sb, dire, &block, | ||
183 | &offset, sizeof(*dire)); | ||
184 | if (err < 0) | ||
185 | goto read_failure; | ||
186 | |||
187 | size = le16_to_cpu(dire->size) + 1; | ||
188 | |||
189 | err = squashfs_read_metadata(dir->i_sb, dire->name, | ||
190 | &block, &offset, size); | ||
191 | if (err < 0) | ||
192 | goto read_failure; | ||
193 | |||
194 | length += sizeof(*dire) + size; | ||
195 | |||
196 | if (name[0] < dire->name[0]) | ||
197 | goto exit_lookup; | ||
198 | |||
199 | if (len == size && !strncmp(name, dire->name, len)) { | ||
200 | unsigned int blk, off, ino_num; | ||
201 | long long ino; | ||
202 | blk = le32_to_cpu(dirh.start_block); | ||
203 | off = le16_to_cpu(dire->offset); | ||
204 | ino_num = le32_to_cpu(dirh.inode_number) + | ||
205 | (short) le16_to_cpu(dire->inode_number); | ||
206 | ino = SQUASHFS_MKINODE(blk, off); | ||
207 | |||
208 | TRACE("calling squashfs_iget for directory " | ||
209 | "entry %s, inode %x:%x, %d\n", name, | ||
210 | blk, off, ino_num); | ||
211 | |||
212 | inode = squashfs_iget(dir->i_sb, ino, ino_num); | ||
213 | if (IS_ERR(inode)) { | ||
214 | err = PTR_ERR(inode); | ||
215 | goto failed; | ||
216 | } | ||
217 | |||
218 | goto exit_lookup; | ||
219 | } | ||
220 | } | ||
221 | } | ||
222 | |||
223 | exit_lookup: | ||
224 | kfree(dire); | ||
225 | if (inode) | ||
226 | return d_splice_alias(inode, dentry); | ||
227 | d_add(dentry, inode); | ||
228 | return ERR_PTR(0); | ||
229 | |||
230 | read_failure: | ||
231 | ERROR("Unable to read directory block [%llx:%x]\n", | ||
232 | squashfs_i(dir)->start + msblk->directory_table, | ||
233 | squashfs_i(dir)->offset); | ||
234 | failed: | ||
235 | kfree(dire); | ||
236 | return ERR_PTR(err); | ||
237 | } | ||
238 | |||
239 | |||
240 | const struct inode_operations squashfs_dir_inode_ops = { | ||
241 | .lookup = squashfs_lookup | ||
242 | }; | ||
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h new file mode 100644 index 000000000000..6b2515d027d5 --- /dev/null +++ b/fs/squashfs/squashfs.h | |||
@@ -0,0 +1,90 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * squashfs.h | ||
22 | */ | ||
23 | |||
24 | #define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args) | ||
25 | |||
26 | #define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) | ||
27 | |||
28 | #define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) | ||
29 | |||
30 | static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) | ||
31 | { | ||
32 | return list_entry(inode, struct squashfs_inode_info, vfs_inode); | ||
33 | } | ||
34 | |||
35 | /* block.c */ | ||
36 | extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, | ||
37 | int); | ||
38 | |||
39 | /* cache.c */ | ||
40 | extern struct squashfs_cache *squashfs_cache_init(char *, int, int); | ||
41 | extern void squashfs_cache_delete(struct squashfs_cache *); | ||
42 | extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *, | ||
43 | struct squashfs_cache *, u64, int); | ||
44 | extern void squashfs_cache_put(struct squashfs_cache_entry *); | ||
45 | extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int); | ||
46 | extern int squashfs_read_metadata(struct super_block *, void *, u64 *, | ||
47 | int *, int); | ||
48 | extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *, | ||
49 | u64, int); | ||
50 | extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, | ||
51 | u64, int); | ||
52 | extern int squashfs_read_table(struct super_block *, void *, u64, int); | ||
53 | |||
54 | /* export.c */ | ||
55 | extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, | ||
56 | unsigned int); | ||
57 | |||
58 | /* fragment.c */ | ||
59 | extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); | ||
60 | extern __le64 *squashfs_read_fragment_index_table(struct super_block *, | ||
61 | u64, unsigned int); | ||
62 | |||
63 | /* id.c */ | ||
64 | extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); | ||
65 | extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, | ||
66 | unsigned short); | ||
67 | |||
68 | /* inode.c */ | ||
69 | extern struct inode *squashfs_iget(struct super_block *, long long, | ||
70 | unsigned int); | ||
71 | extern int squashfs_read_inode(struct inode *, long long); | ||
72 | |||
73 | /* | ||
74 | * Inodes and files operations | ||
75 | */ | ||
76 | |||
77 | /* dir.c */ | ||
78 | extern const struct file_operations squashfs_dir_ops; | ||
79 | |||
80 | /* export.c */ | ||
81 | extern const struct export_operations squashfs_export_ops; | ||
82 | |||
83 | /* file.c */ | ||
84 | extern const struct address_space_operations squashfs_aops; | ||
85 | |||
86 | /* namei.c */ | ||
87 | extern const struct inode_operations squashfs_dir_inode_ops; | ||
88 | |||
89 | /* symlink.c */ | ||
90 | extern const struct address_space_operations squashfs_symlink_aops; | ||
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h new file mode 100644 index 000000000000..6840da1bf21e --- /dev/null +++ b/fs/squashfs/squashfs_fs.h | |||
@@ -0,0 +1,381 @@ | |||
1 | #ifndef SQUASHFS_FS | ||
2 | #define SQUASHFS_FS | ||
3 | /* | ||
4 | * Squashfs | ||
5 | * | ||
6 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
7 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version 2, | ||
12 | * or (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with this program; if not, write to the Free Software | ||
21 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
22 | * | ||
23 | * squashfs_fs.h | ||
24 | */ | ||
25 | |||
26 | #define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE | ||
27 | #define SQUASHFS_MAJOR 4 | ||
28 | #define SQUASHFS_MINOR 0 | ||
29 | #define SQUASHFS_MAGIC 0x73717368 | ||
30 | #define SQUASHFS_START 0 | ||
31 | |||
32 | /* size of metadata (inode and directory) blocks */ | ||
33 | #define SQUASHFS_METADATA_SIZE 8192 | ||
34 | #define SQUASHFS_METADATA_LOG 13 | ||
35 | |||
36 | /* default size of data blocks */ | ||
37 | #define SQUASHFS_FILE_SIZE 131072 | ||
38 | #define SQUASHFS_FILE_LOG 17 | ||
39 | |||
40 | #define SQUASHFS_FILE_MAX_SIZE 1048576 | ||
41 | #define SQUASHFS_FILE_MAX_LOG 20 | ||
42 | |||
43 | /* Max number of uids and gids */ | ||
44 | #define SQUASHFS_IDS 65536 | ||
45 | |||
46 | /* Max length of filename (not 255) */ | ||
47 | #define SQUASHFS_NAME_LEN 256 | ||
48 | |||
49 | #define SQUASHFS_INVALID_FRAG (0xffffffffU) | ||
50 | #define SQUASHFS_INVALID_BLK (-1LL) | ||
51 | |||
52 | /* Filesystem flags */ | ||
53 | #define SQUASHFS_NOI 0 | ||
54 | #define SQUASHFS_NOD 1 | ||
55 | #define SQUASHFS_NOF 3 | ||
56 | #define SQUASHFS_NO_FRAG 4 | ||
57 | #define SQUASHFS_ALWAYS_FRAG 5 | ||
58 | #define SQUASHFS_DUPLICATE 6 | ||
59 | #define SQUASHFS_EXPORT 7 | ||
60 | |||
61 | #define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) | ||
62 | |||
63 | #define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \ | ||
64 | SQUASHFS_NOI) | ||
65 | |||
66 | #define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \ | ||
67 | SQUASHFS_NOD) | ||
68 | |||
69 | #define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ | ||
70 | SQUASHFS_NOF) | ||
71 | |||
72 | #define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ | ||
73 | SQUASHFS_NO_FRAG) | ||
74 | |||
75 | #define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ | ||
76 | SQUASHFS_ALWAYS_FRAG) | ||
77 | |||
78 | #define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \ | ||
79 | SQUASHFS_DUPLICATE) | ||
80 | |||
81 | #define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ | ||
82 | SQUASHFS_EXPORT) | ||
83 | |||
84 | /* Inode types, including extended types */ | ||
85 | #define SQUASHFS_DIR_TYPE 1 | ||
86 | #define SQUASHFS_REG_TYPE 2 | ||
87 | #define SQUASHFS_SYMLINK_TYPE 3 | ||
88 | #define SQUASHFS_BLKDEV_TYPE 4 | ||
89 | #define SQUASHFS_CHRDEV_TYPE 5 | ||
90 | #define SQUASHFS_FIFO_TYPE 6 | ||
91 | #define SQUASHFS_SOCKET_TYPE 7 | ||
92 | #define SQUASHFS_LDIR_TYPE 8 | ||
93 | #define SQUASHFS_LREG_TYPE 9 | ||
94 | #define SQUASHFS_LSYMLINK_TYPE 10 | ||
95 | #define SQUASHFS_LBLKDEV_TYPE 11 | ||
96 | #define SQUASHFS_LCHRDEV_TYPE 12 | ||
97 | #define SQUASHFS_LFIFO_TYPE 13 | ||
98 | #define SQUASHFS_LSOCKET_TYPE 14 | ||
99 | |||
100 | /* Flag whether block is compressed or uncompressed, bit is set if block is | ||
101 | * uncompressed */ | ||
102 | #define SQUASHFS_COMPRESSED_BIT (1 << 15) | ||
103 | |||
104 | #define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \ | ||
105 | (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT) | ||
106 | |||
107 | #define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT)) | ||
108 | |||
109 | #define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24) | ||
110 | |||
111 | #define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \ | ||
112 | ~SQUASHFS_COMPRESSED_BIT_BLOCK) | ||
113 | |||
114 | #define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK)) | ||
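
As a worked example of these macros: a metadata length field of 0x8010 has the uncompressed bit (bit 15) set, so SQUASHFS_COMPRESSED(0x8010) is false and SQUASHFS_COMPRESSED_SIZE(0x8010) is 0x10, i.e. 16 bytes stored uncompressed; a field of 0x0010 describes a 16-byte compressed block. Data blocks use the same convention with bit 24 instead of bit 15, leaving room for block sizes up to SQUASHFS_FILE_MAX_SIZE.
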
115 | |||
116 | /* | ||
117 | * Inode number ops. Inodes consist of a compressed block number, and an | ||
118 | * uncompressed offset within that block | ||
119 | */ | ||
120 | #define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16)) | ||
121 | |||
122 | #define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff)) | ||
123 | |||
124 | #define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\ | ||
125 | << 16) + (B))) | ||
126 | |||
127 | /* Translate between VFS mode and squashfs mode */ | ||
128 | #define SQUASHFS_MODE(A) ((A) & 0xfff) | ||
129 | |||
130 | /* fragment and fragment table defines */ | ||
131 | #define SQUASHFS_FRAGMENT_BYTES(A) \ | ||
132 | ((A) * sizeof(struct squashfs_fragment_entry)) | ||
133 | |||
134 | #define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \ | ||
135 | SQUASHFS_METADATA_SIZE) | ||
136 | |||
137 | #define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \ | ||
138 | SQUASHFS_METADATA_SIZE) | ||
139 | |||
140 | #define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \ | ||
141 | SQUASHFS_METADATA_SIZE - 1) / \ | ||
142 | SQUASHFS_METADATA_SIZE) | ||
143 | |||
144 | #define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\ | ||
145 | sizeof(u64)) | ||
146 | |||
147 | /* inode lookup table defines */ | ||
148 | #define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64)) | ||
149 | |||
150 | #define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \ | ||
151 | SQUASHFS_METADATA_SIZE) | ||
152 | |||
153 | #define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \ | ||
154 | SQUASHFS_METADATA_SIZE) | ||
155 | |||
156 | #define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \ | ||
157 | SQUASHFS_METADATA_SIZE - 1) / \ | ||
158 | SQUASHFS_METADATA_SIZE) | ||
159 | |||
160 | #define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\ | ||
161 | sizeof(u64)) | ||
162 | |||
163 | /* uid/gid lookup table defines */ | ||
164 | #define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int)) | ||
165 | |||
166 | #define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \ | ||
167 | SQUASHFS_METADATA_SIZE) | ||
168 | |||
169 | #define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \ | ||
170 | SQUASHFS_METADATA_SIZE) | ||
171 | |||
172 | #define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \ | ||
173 | SQUASHFS_METADATA_SIZE - 1) / \ | ||
174 | SQUASHFS_METADATA_SIZE) | ||
175 | |||
176 | #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ | ||
177 | sizeof(u64)) | ||
178 | |||
179 | /* cached data constants for filesystem */ | ||
180 | #define SQUASHFS_CACHED_BLKS 8 | ||
181 | |||
182 | #define SQUASHFS_MAX_FILE_SIZE_LOG 64 | ||
183 | |||
184 | #define SQUASHFS_MAX_FILE_SIZE (1LL << \ | ||
185 | (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) | ||
186 | |||
187 | #define SQUASHFS_MARKER_BYTE 0xff | ||
188 | |||
189 | /* meta index cache */ | ||
190 | #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) | ||
191 | #define SQUASHFS_META_ENTRIES 127 | ||
192 | #define SQUASHFS_META_SLOTS 8 | ||
193 | |||
194 | struct meta_entry { | ||
195 | u64 data_block; | ||
196 | unsigned int index_block; | ||
197 | unsigned short offset; | ||
198 | unsigned short pad; | ||
199 | }; | ||
200 | |||
201 | struct meta_index { | ||
202 | unsigned int inode_number; | ||
203 | unsigned int offset; | ||
204 | unsigned short entries; | ||
205 | unsigned short skip; | ||
206 | unsigned short locked; | ||
207 | unsigned short pad; | ||
208 | struct meta_entry meta_entry[SQUASHFS_META_ENTRIES]; | ||
209 | }; | ||
210 | |||
211 | |||
212 | /* | ||
213 | * definitions for structures on disk | ||
214 | */ | ||
215 | #define ZLIB_COMPRESSION 1 | ||
216 | |||
217 | struct squashfs_super_block { | ||
218 | __le32 s_magic; | ||
219 | __le32 inodes; | ||
220 | __le32 mkfs_time; | ||
221 | __le32 block_size; | ||
222 | __le32 fragments; | ||
223 | __le16 compression; | ||
224 | __le16 block_log; | ||
225 | __le16 flags; | ||
226 | __le16 no_ids; | ||
227 | __le16 s_major; | ||
228 | __le16 s_minor; | ||
229 | __le64 root_inode; | ||
230 | __le64 bytes_used; | ||
231 | __le64 id_table_start; | ||
232 | __le64 xattr_table_start; | ||
233 | __le64 inode_table_start; | ||
234 | __le64 directory_table_start; | ||
235 | __le64 fragment_table_start; | ||
236 | __le64 lookup_table_start; | ||
237 | }; | ||
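
All the fields above are little-endian on disk. A minimal user-space sketch of reading the first few of them from the start of an image ("image.squashfs" is a placeholder, error handling is elided, and a portable reader would byte-swap with le32_to_cpu equivalents rather than assume a little-endian host as this does):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* First 32 bytes of the superblock, matching the layout above. */
	struct {
		uint32_t s_magic, inodes, mkfs_time, block_size, fragments;
		uint16_t compression, block_log, flags, no_ids;
		uint16_t s_major, s_minor;
	} sb;
	FILE *f = fopen("image.squashfs", "rb");

	if (f == NULL || fread(&sb, sizeof(sb), 1, f) != 1)
		return 1;

	/* A valid image starts with 0x73717368 ("hsqs") */
	printf("magic %x, version %u.%u, block size %u\n",
		sb.s_magic, sb.s_major, sb.s_minor, sb.block_size);
	fclose(f);
	return 0;
}
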
238 | |||
239 | struct squashfs_dir_index { | ||
240 | __le32 index; | ||
241 | __le32 start_block; | ||
242 | __le32 size; | ||
243 | unsigned char name[0]; | ||
244 | }; | ||
245 | |||
246 | struct squashfs_base_inode { | ||
247 | __le16 inode_type; | ||
248 | __le16 mode; | ||
249 | __le16 uid; | ||
250 | __le16 guid; | ||
251 | __le32 mtime; | ||
252 | __le32 inode_number; | ||
253 | }; | ||
254 | |||
255 | struct squashfs_ipc_inode { | ||
256 | __le16 inode_type; | ||
257 | __le16 mode; | ||
258 | __le16 uid; | ||
259 | __le16 guid; | ||
260 | __le32 mtime; | ||
261 | __le32 inode_number; | ||
262 | __le32 nlink; | ||
263 | }; | ||
264 | |||
265 | struct squashfs_dev_inode { | ||
266 | __le16 inode_type; | ||
267 | __le16 mode; | ||
268 | __le16 uid; | ||
269 | __le16 guid; | ||
270 | __le32 mtime; | ||
271 | __le32 inode_number; | ||
272 | __le32 nlink; | ||
273 | __le32 rdev; | ||
274 | }; | ||
275 | |||
276 | struct squashfs_symlink_inode { | ||
277 | __le16 inode_type; | ||
278 | __le16 mode; | ||
279 | __le16 uid; | ||
280 | __le16 guid; | ||
281 | __le32 mtime; | ||
282 | __le32 inode_number; | ||
283 | __le32 nlink; | ||
284 | __le32 symlink_size; | ||
285 | char symlink[0]; | ||
286 | }; | ||
287 | |||
288 | struct squashfs_reg_inode { | ||
289 | __le16 inode_type; | ||
290 | __le16 mode; | ||
291 | __le16 uid; | ||
292 | __le16 guid; | ||
293 | __le32 mtime; | ||
294 | __le32 inode_number; | ||
295 | __le32 start_block; | ||
296 | __le32 fragment; | ||
297 | __le32 offset; | ||
298 | __le32 file_size; | ||
299 | __le16 block_list[0]; | ||
300 | }; | ||
301 | |||
302 | struct squashfs_lreg_inode { | ||
303 | __le16 inode_type; | ||
304 | __le16 mode; | ||
305 | __le16 uid; | ||
306 | __le16 guid; | ||
307 | __le32 mtime; | ||
308 | __le32 inode_number; | ||
309 | __le64 start_block; | ||
310 | __le64 file_size; | ||
311 | __le64 sparse; | ||
312 | __le32 nlink; | ||
313 | __le32 fragment; | ||
314 | __le32 offset; | ||
315 | __le32 xattr; | ||
316 | __le16 block_list[0]; | ||
317 | }; | ||
318 | |||
319 | struct squashfs_dir_inode { | ||
320 | __le16 inode_type; | ||
321 | __le16 mode; | ||
322 | __le16 uid; | ||
323 | __le16 guid; | ||
324 | __le32 mtime; | ||
325 | __le32 inode_number; | ||
326 | __le32 start_block; | ||
327 | __le32 nlink; | ||
328 | __le16 file_size; | ||
329 | __le16 offset; | ||
330 | __le32 parent_inode; | ||
331 | }; | ||
332 | |||
333 | struct squashfs_ldir_inode { | ||
334 | __le16 inode_type; | ||
335 | __le16 mode; | ||
336 | __le16 uid; | ||
337 | __le16 guid; | ||
338 | __le32 mtime; | ||
339 | __le32 inode_number; | ||
340 | __le32 nlink; | ||
341 | __le32 file_size; | ||
342 | __le32 start_block; | ||
343 | __le32 parent_inode; | ||
344 | __le16 i_count; | ||
345 | __le16 offset; | ||
346 | __le32 xattr; | ||
347 | struct squashfs_dir_index index[0]; | ||
348 | }; | ||
349 | |||
350 | union squashfs_inode { | ||
351 | struct squashfs_base_inode base; | ||
352 | struct squashfs_dev_inode dev; | ||
353 | struct squashfs_symlink_inode symlink; | ||
354 | struct squashfs_reg_inode reg; | ||
355 | struct squashfs_lreg_inode lreg; | ||
356 | struct squashfs_dir_inode dir; | ||
357 | struct squashfs_ldir_inode ldir; | ||
358 | struct squashfs_ipc_inode ipc; | ||
359 | }; | ||
360 | |||
361 | struct squashfs_dir_entry { | ||
362 | __le16 offset; | ||
363 | __le16 inode_number; | ||
364 | __le16 type; | ||
365 | __le16 size; | ||
366 | char name[0]; | ||
367 | }; | ||
368 | |||
369 | struct squashfs_dir_header { | ||
370 | __le32 count; | ||
371 | __le32 start_block; | ||
372 | __le32 inode_number; | ||
373 | }; | ||
374 | |||
375 | struct squashfs_fragment_entry { | ||
376 | __le64 start_block; | ||
377 | __le32 size; | ||
378 | unsigned int unused; | ||
379 | }; | ||
380 | |||
381 | #endif | ||
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h new file mode 100644 index 000000000000..fbfca30c0c68 --- /dev/null +++ b/fs/squashfs/squashfs_fs_i.h | |||
@@ -0,0 +1,45 @@ | |||
1 | #ifndef SQUASHFS_FS_I | ||
2 | #define SQUASHFS_FS_I | ||
3 | /* | ||
4 | * Squashfs | ||
5 | * | ||
6 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
7 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version 2, | ||
12 | * or (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with this program; if not, write to the Free Software | ||
21 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
22 | * | ||
23 | * squashfs_fs_i.h | ||
24 | */ | ||
25 | |||
26 | struct squashfs_inode_info { | ||
27 | u64 start; | ||
28 | int offset; | ||
29 | union { | ||
30 | struct { | ||
31 | u64 fragment_block; | ||
32 | int fragment_size; | ||
33 | int fragment_offset; | ||
34 | u64 block_list_start; | ||
35 | }; | ||
36 | struct { | ||
37 | u64 dir_idx_start; | ||
38 | int dir_idx_offset; | ||
39 | int dir_idx_cnt; | ||
40 | int parent; | ||
41 | }; | ||
42 | }; | ||
43 | struct inode vfs_inode; | ||
44 | }; | ||
45 | #endif | ||
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h new file mode 100644 index 000000000000..c8c65614dd1c --- /dev/null +++ b/fs/squashfs/squashfs_fs_sb.h | |||
@@ -0,0 +1,76 @@ | |||
1 | #ifndef SQUASHFS_FS_SB | ||
2 | #define SQUASHFS_FS_SB | ||
3 | /* | ||
4 | * Squashfs | ||
5 | * | ||
6 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
7 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version 2, | ||
12 | * or (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with this program; if not, write to the Free Software | ||
21 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
22 | * | ||
23 | * squashfs_fs_sb.h | ||
24 | */ | ||
25 | |||
26 | #include "squashfs_fs.h" | ||
27 | |||
28 | struct squashfs_cache { | ||
29 | char *name; | ||
30 | int entries; | ||
31 | int next_blk; | ||
32 | int num_waiters; | ||
33 | int unused; | ||
34 | int block_size; | ||
35 | int pages; | ||
36 | spinlock_t lock; | ||
37 | wait_queue_head_t wait_queue; | ||
38 | struct squashfs_cache_entry *entry; | ||
39 | }; | ||
40 | |||
41 | struct squashfs_cache_entry { | ||
42 | u64 block; | ||
43 | int length; | ||
44 | int refcount; | ||
45 | u64 next_index; | ||
46 | int pending; | ||
47 | int error; | ||
48 | int num_waiters; | ||
49 | wait_queue_head_t wait_queue; | ||
50 | struct squashfs_cache *cache; | ||
51 | void **data; | ||
52 | }; | ||
53 | |||
54 | struct squashfs_sb_info { | ||
55 | int devblksize; | ||
56 | int devblksize_log2; | ||
57 | struct squashfs_cache *block_cache; | ||
58 | struct squashfs_cache *fragment_cache; | ||
59 | struct squashfs_cache *read_page; | ||
60 | int next_meta_index; | ||
61 | __le64 *id_table; | ||
62 | __le64 *fragment_index; | ||
63 | unsigned int *fragment_index_2; | ||
64 | struct mutex read_data_mutex; | ||
65 | struct mutex meta_index_mutex; | ||
66 | struct meta_index *meta_index; | ||
67 | z_stream stream; | ||
68 | __le64 *inode_lookup_table; | ||
69 | u64 inode_table; | ||
70 | u64 directory_table; | ||
71 | unsigned int block_size; | ||
72 | unsigned short block_log; | ||
73 | long long bytes_used; | ||
74 | unsigned int inodes; | ||
75 | }; | ||
76 | #endif | ||
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c new file mode 100644 index 000000000000..a0466d7467b2 --- /dev/null +++ b/fs/squashfs/super.c | |||
@@ -0,0 +1,440 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * super.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to read the superblock, read and initialise | ||
26 | * in-memory structures at mount time, and all the VFS glue code to register | ||
27 | * the filesystem. | ||
28 | */ | ||
29 | |||
30 | #include <linux/fs.h> | ||
31 | #include <linux/vfs.h> | ||
32 | #include <linux/slab.h> | ||
33 | #include <linux/mutex.h> | ||
34 | #include <linux/pagemap.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/zlib.h> | ||
38 | |||
39 | #include "squashfs_fs.h" | ||
40 | #include "squashfs_fs_sb.h" | ||
41 | #include "squashfs_fs_i.h" | ||
42 | #include "squashfs.h" | ||
43 | |||
44 | static struct file_system_type squashfs_fs_type; | ||
45 | static struct super_operations squashfs_super_ops; | ||
46 | |||
47 | static int supported_squashfs_filesystem(short major, short minor, short comp) | ||
48 | { | ||
49 | if (major < SQUASHFS_MAJOR) { | ||
50 | ERROR("Major/Minor mismatch, older Squashfs %d.%d " | ||
51 | "filesystems are unsupported\n", major, minor); | ||
52 | return -EINVAL; | ||
53 | } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { | ||
54 | ERROR("Major/Minor mismatch, trying to mount newer " | ||
55 | "%d.%d filesystem\n", major, minor); | ||
56 | ERROR("Please update your kernel\n"); | ||
57 | return -EINVAL; | ||
58 | } | ||
59 | |||
60 | if (comp != ZLIB_COMPRESSION) | ||
61 | return -EINVAL; | ||
62 | |||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | |||
67 | static int squashfs_fill_super(struct super_block *sb, void *data, int silent) | ||
68 | { | ||
69 | struct squashfs_sb_info *msblk; | ||
70 | struct squashfs_super_block *sblk = NULL; | ||
71 | char b[BDEVNAME_SIZE]; | ||
72 | struct inode *root; | ||
73 | long long root_inode; | ||
74 | unsigned short flags; | ||
75 | unsigned int fragments; | ||
76 | u64 lookup_table_start; | ||
77 | int err; | ||
78 | |||
79 | TRACE("Entered squashfs_fill_superblock\n"); | ||
80 | |||
81 | sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); | ||
82 | if (sb->s_fs_info == NULL) { | ||
83 | ERROR("Failed to allocate squashfs_sb_info\n"); | ||
84 | return -ENOMEM; | ||
85 | } | ||
86 | msblk = sb->s_fs_info; | ||
87 | |||
88 | msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(), | ||
89 | GFP_KERNEL); | ||
90 | if (msblk->stream.workspace == NULL) { | ||
91 | ERROR("Failed to allocate zlib workspace\n"); | ||
92 | goto failure; | ||
93 | } | ||
94 | |||
95 | sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); | ||
96 | if (sblk == NULL) { | ||
97 | ERROR("Failed to allocate squashfs_super_block\n"); | ||
98 | goto failure; | ||
99 | } | ||
100 | |||
101 | msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); | ||
102 | msblk->devblksize_log2 = ffz(~msblk->devblksize); | ||
103 | |||
104 | mutex_init(&msblk->read_data_mutex); | ||
105 | mutex_init(&msblk->meta_index_mutex); | ||
106 | |||
107 | /* | ||
108 | * msblk->bytes_used is checked in squashfs_read_table to ensure reads | ||
109 | * are not beyond filesystem end. But as we're using | ||
110 | * squashfs_read_table here to read the superblock (including the value | ||
111 | * of bytes_used) we need to set it to an initial sensible dummy value | ||
112 | */ | ||
113 | msblk->bytes_used = sizeof(*sblk); | ||
114 | err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk)); | ||
115 | |||
116 | if (err < 0) { | ||
117 | ERROR("unable to read squashfs_super_block\n"); | ||
118 | goto failed_mount; | ||
119 | } | ||
120 | |||
121 | /* Check it is a SQUASHFS superblock */ | ||
122 | sb->s_magic = le32_to_cpu(sblk->s_magic); | ||
123 | if (sb->s_magic != SQUASHFS_MAGIC) { | ||
124 | if (!silent) | ||
125 | ERROR("Can't find a SQUASHFS superblock on %s\n", | ||
126 | bdevname(sb->s_bdev, b)); | ||
127 | err = -EINVAL; | ||
128 | goto failed_mount; | ||
129 | } | ||
130 | |||
131 | /* Check the MAJOR & MINOR versions and compression type */ | ||
132 | err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), | ||
133 | le16_to_cpu(sblk->s_minor), | ||
134 | le16_to_cpu(sblk->compression)); | ||
135 | if (err < 0) | ||
136 | goto failed_mount; | ||
137 | |||
138 | err = -EINVAL; | ||
139 | |||
140 | /* | ||
141 | * Check if there are xattrs in the filesystem. These are not | ||
142 | * supported in this version, so warn that they will be ignored. | ||
143 | */ | ||
144 | if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK) | ||
145 | ERROR("Xattrs in filesystem, these will be ignored\n"); | ||
146 | |||
147 | /* Check the filesystem does not extend beyond the end of the | ||
148 | * block device */ | ||
149 | msblk->bytes_used = le64_to_cpu(sblk->bytes_used); | ||
150 | if (msblk->bytes_used < 0 || msblk->bytes_used > | ||
151 | i_size_read(sb->s_bdev->bd_inode)) | ||
152 | goto failed_mount; | ||
153 | |||
154 | /* Check block size for sanity */ | ||
155 | msblk->block_size = le32_to_cpu(sblk->block_size); | ||
156 | if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) | ||
157 | goto failed_mount; | ||
158 | |||
159 | msblk->block_log = le16_to_cpu(sblk->block_log); | ||
160 | if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) | ||
161 | goto failed_mount; | ||
162 | |||
163 | /* Check the root inode for sanity */ | ||
164 | root_inode = le64_to_cpu(sblk->root_inode); | ||
165 | if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) | ||
166 | goto failed_mount; | ||
167 | |||
168 | msblk->inode_table = le64_to_cpu(sblk->inode_table_start); | ||
169 | msblk->directory_table = le64_to_cpu(sblk->directory_table_start); | ||
170 | msblk->inodes = le32_to_cpu(sblk->inodes); | ||
171 | flags = le16_to_cpu(sblk->flags); | ||
172 | |||
173 | TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); | ||
174 | TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) | ||
175 | ? "un" : ""); | ||
176 | TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) | ||
177 | ? "un" : ""); | ||
178 | TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); | ||
179 | TRACE("Block size %d\n", msblk->block_size); | ||
180 | TRACE("Number of inodes %d\n", msblk->inodes); | ||
181 | TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments)); | ||
182 | TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); | ||
183 | TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); | ||
184 | TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); | ||
185 | TRACE("sblk->fragment_table_start %llx\n", | ||
186 | (u64) le64_to_cpu(sblk->fragment_table_start)); | ||
187 | TRACE("sblk->id_table_start %llx\n", | ||
188 | (u64) le64_to_cpu(sblk->id_table_start)); | ||
189 | |||
190 | sb->s_maxbytes = MAX_LFS_FILESIZE; | ||
191 | sb->s_flags |= MS_RDONLY; | ||
192 | sb->s_op = &squashfs_super_ops; | ||
193 | |||
194 | err = -ENOMEM; | ||
195 | |||
196 | msblk->block_cache = squashfs_cache_init("metadata", | ||
197 | SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); | ||
198 | if (msblk->block_cache == NULL) | ||
199 | goto failed_mount; | ||
200 | |||
201 | /* Allocate read_page block */ | ||
202 | msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); | ||
203 | if (msblk->read_page == NULL) { | ||
204 | ERROR("Failed to allocate read_page block\n"); | ||
205 | goto failed_mount; | ||
206 | } | ||
207 | |||
208 | /* Allocate and read id index table */ | ||
209 | msblk->id_table = squashfs_read_id_index_table(sb, | ||
210 | le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); | ||
211 | if (IS_ERR(msblk->id_table)) { | ||
212 | err = PTR_ERR(msblk->id_table); | ||
213 | msblk->id_table = NULL; | ||
214 | goto failed_mount; | ||
215 | } | ||
216 | |||
217 | fragments = le32_to_cpu(sblk->fragments); | ||
218 | if (fragments == 0) | ||
219 | goto allocate_lookup_table; | ||
220 | |||
221 | msblk->fragment_cache = squashfs_cache_init("fragment", | ||
222 | SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); | ||
223 | if (msblk->fragment_cache == NULL) { | ||
224 | err = -ENOMEM; | ||
225 | goto failed_mount; | ||
226 | } | ||
227 | |||
228 | /* Allocate and read fragment index table */ | ||
229 | msblk->fragment_index = squashfs_read_fragment_index_table(sb, | ||
230 | le64_to_cpu(sblk->fragment_table_start), fragments); | ||
231 | if (IS_ERR(msblk->fragment_index)) { | ||
232 | err = PTR_ERR(msblk->fragment_index); | ||
233 | msblk->fragment_index = NULL; | ||
234 | goto failed_mount; | ||
235 | } | ||
236 | |||
237 | allocate_lookup_table: | ||
238 | lookup_table_start = le64_to_cpu(sblk->lookup_table_start); | ||
239 | if (lookup_table_start == SQUASHFS_INVALID_BLK) | ||
240 | goto allocate_root; | ||
241 | |||
242 | /* Allocate and read inode lookup table */ | ||
243 | msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, | ||
244 | lookup_table_start, msblk->inodes); | ||
245 | if (IS_ERR(msblk->inode_lookup_table)) { | ||
246 | err = PTR_ERR(msblk->inode_lookup_table); | ||
247 | msblk->inode_lookup_table = NULL; | ||
248 | goto failed_mount; | ||
249 | } | ||
250 | |||
251 | sb->s_export_op = &squashfs_export_ops; | ||
252 | |||
253 | allocate_root: | ||
254 | root = new_inode(sb); | ||
255 | if (!root) { | ||
256 | err = -ENOMEM; | ||
257 | goto failed_mount; | ||
258 | } | ||
259 | |||
260 | err = squashfs_read_inode(root, root_inode); | ||
261 | if (err) { | ||
262 | iget_failed(root); | ||
263 | goto failed_mount; | ||
264 | } | ||
265 | insert_inode_hash(root); | ||
266 | |||
267 | sb->s_root = d_alloc_root(root); | ||
268 | if (sb->s_root == NULL) { | ||
269 | ERROR("Root inode create failed\n"); | ||
270 | err = -ENOMEM; | ||
271 | iput(root); | ||
272 | goto failed_mount; | ||
273 | } | ||
274 | |||
275 | TRACE("Leaving squashfs_fill_super\n"); | ||
276 | kfree(sblk); | ||
277 | return 0; | ||
278 | |||
279 | failed_mount: | ||
280 | squashfs_cache_delete(msblk->block_cache); | ||
281 | squashfs_cache_delete(msblk->fragment_cache); | ||
282 | squashfs_cache_delete(msblk->read_page); | ||
283 | kfree(msblk->inode_lookup_table); | ||
284 | kfree(msblk->fragment_index); | ||
285 | kfree(msblk->id_table); | ||
286 | kfree(msblk->stream.workspace); | ||
287 | kfree(sb->s_fs_info); | ||
288 | sb->s_fs_info = NULL; | ||
289 | kfree(sblk); | ||
290 | return err; | ||
291 | |||
292 | failure: | ||
293 | kfree(msblk->stream.workspace); | ||
294 | kfree(sb->s_fs_info); | ||
295 | sb->s_fs_info = NULL; | ||
296 | return -ENOMEM; | ||
297 | } | ||
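The bytes_used bootstrap near the top of squashfs_fill_super() deserves a second look: squashfs_read_table() refuses reads past the recorded end of the filesystem, but that end is itself a field of the superblock being read, so a provisional limit of sizeof(*sblk) is installed first and replaced once the real value is known. A minimal userspace sketch of the same pattern (hypothetical names, endianness handling omitted; this is not the kernel API):

    #include <stdint.h>
    #include <stdio.h>

    struct sb { uint64_t bytes_used; /* ... */ };

    static int64_t fs_limit;    /* plays the role of msblk->bytes_used */

    /* refuse reads beyond the recorded end of the image */
    static int read_table(FILE *f, void *buf, long off, size_t len)
    {
        if (off < 0 || (int64_t)off + (int64_t)len > fs_limit)
            return -1;
        if (fseek(f, off, SEEK_SET) != 0)
            return -1;
        return fread(buf, 1, len, f) == len ? 0 : -1;
    }

    int main(int argc, char **argv)
    {
        struct sb sblk;
        FILE *f;

        if (argc < 2 || !(f = fopen(argv[1], "rb")))
            return 1;
        fs_limit = sizeof(sblk);          /* bootstrap: superblock only */
        if (read_table(f, &sblk, 0, sizeof(sblk)) == 0)
            fs_limit = sblk.bytes_used;   /* now trust the on-disk value */
        fclose(f);
        return 0;
    }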
298 | |||
299 | |||
300 | static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
301 | { | ||
302 | struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; | ||
303 | |||
304 | TRACE("Entered squashfs_statfs\n"); | ||
305 | |||
306 | buf->f_type = SQUASHFS_MAGIC; | ||
307 | buf->f_bsize = msblk->block_size; | ||
308 | buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; | ||
309 | buf->f_bfree = buf->f_bavail = 0; | ||
310 | buf->f_files = msblk->inodes; | ||
311 | buf->f_ffree = 0; | ||
312 | buf->f_namelen = SQUASHFS_NAME_LEN; | ||
313 | |||
314 | return 0; | ||
315 | } | ||
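The f_blocks expression above is a branch-free ceiling division by the block size: ((bytes_used - 1) >> block_log) + 1 equals DIV_ROUND_UP(bytes_used, block_size) for any bytes_used >= 1. For example, with the default 128K blocks (block_log = 17) an image of 1,000,000 bytes reports ((1000000 - 1) >> 17) + 1 = 7 + 1 = 8 blocks, i.e. seven full blocks plus one partial block.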
316 | |||
317 | |||
318 | static int squashfs_remount(struct super_block *sb, int *flags, char *data) | ||
319 | { | ||
320 | *flags |= MS_RDONLY; | ||
321 | return 0; | ||
322 | } | ||
323 | |||
324 | |||
325 | static void squashfs_put_super(struct super_block *sb) | ||
326 | { | ||
327 | if (sb->s_fs_info) { | ||
328 | struct squashfs_sb_info *sbi = sb->s_fs_info; | ||
329 | squashfs_cache_delete(sbi->block_cache); | ||
330 | squashfs_cache_delete(sbi->fragment_cache); | ||
331 | squashfs_cache_delete(sbi->read_page); | ||
332 | kfree(sbi->id_table); | ||
333 | kfree(sbi->fragment_index); | ||
334 | kfree(sbi->meta_index); | ||
335 | kfree(sbi->stream.workspace); | ||
336 | kfree(sb->s_fs_info); | ||
337 | sb->s_fs_info = NULL; | ||
338 | } | ||
339 | } | ||
340 | |||
341 | |||
342 | static int squashfs_get_sb(struct file_system_type *fs_type, int flags, | ||
343 | const char *dev_name, void *data, | ||
344 | struct vfsmount *mnt) | ||
345 | { | ||
346 | return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, | ||
347 | mnt); | ||
348 | } | ||
349 | |||
350 | |||
351 | static struct kmem_cache *squashfs_inode_cachep; | ||
352 | |||
353 | |||
354 | static void init_once(void *foo) | ||
355 | { | ||
356 | struct squashfs_inode_info *ei = foo; | ||
357 | |||
358 | inode_init_once(&ei->vfs_inode); | ||
359 | } | ||
360 | |||
361 | |||
362 | static int __init init_inodecache(void) | ||
363 | { | ||
364 | squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", | ||
365 | sizeof(struct squashfs_inode_info), 0, | ||
366 | SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); | ||
367 | |||
368 | return squashfs_inode_cachep ? 0 : -ENOMEM; | ||
369 | } | ||
370 | |||
371 | |||
372 | static void destroy_inodecache(void) | ||
373 | { | ||
374 | kmem_cache_destroy(squashfs_inode_cachep); | ||
375 | } | ||
376 | |||
377 | |||
378 | static int __init init_squashfs_fs(void) | ||
379 | { | ||
380 | int err = init_inodecache(); | ||
381 | |||
382 | if (err) | ||
383 | return err; | ||
384 | |||
385 | err = register_filesystem(&squashfs_fs_type); | ||
386 | if (err) { | ||
387 | destroy_inodecache(); | ||
388 | return err; | ||
389 | } | ||
390 | |||
391 | printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " | ||
392 | "Phillip Lougher\n"); | ||
393 | |||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | |||
398 | static void __exit exit_squashfs_fs(void) | ||
399 | { | ||
400 | unregister_filesystem(&squashfs_fs_type); | ||
401 | destroy_inodecache(); | ||
402 | } | ||
403 | |||
404 | |||
405 | static struct inode *squashfs_alloc_inode(struct super_block *sb) | ||
406 | { | ||
407 | struct squashfs_inode_info *ei = | ||
408 | kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); | ||
409 | |||
410 | return ei ? &ei->vfs_inode : NULL; | ||
411 | } | ||
412 | |||
413 | |||
414 | static void squashfs_destroy_inode(struct inode *inode) | ||
415 | { | ||
416 | kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); | ||
417 | } | ||
418 | |||
419 | |||
420 | static struct file_system_type squashfs_fs_type = { | ||
421 | .owner = THIS_MODULE, | ||
422 | .name = "squashfs", | ||
423 | .get_sb = squashfs_get_sb, | ||
424 | .kill_sb = kill_block_super, | ||
425 | .fs_flags = FS_REQUIRES_DEV | ||
426 | }; | ||
427 | |||
428 | static struct super_operations squashfs_super_ops = { | ||
429 | .alloc_inode = squashfs_alloc_inode, | ||
430 | .destroy_inode = squashfs_destroy_inode, | ||
431 | .statfs = squashfs_statfs, | ||
432 | .put_super = squashfs_put_super, | ||
433 | .remount_fs = squashfs_remount | ||
434 | }; | ||
435 | |||
436 | module_init(init_squashfs_fs); | ||
437 | module_exit(exit_squashfs_fs); | ||
438 | MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); | ||
439 | MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>"); | ||
440 | MODULE_LICENSE("GPL"); | ||
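With squashfs_fs_type registered, a read-only image mounts through the normal block path; because fs_flags is FS_REQUIRES_DEV, a backing device (often a loop device) is mandatory. An illustrative userspace call, assuming an image already attached to /dev/loop0 and a /mnt/squash mount point:

    #include <sys/mount.h>

    /* mount a squashfs image read-only; the data string is unused */
    int mount_squashfs(void)
    {
        return mount("/dev/loop0", "/mnt/squash", "squashfs",
                     MS_RDONLY, NULL);
    }

Passing MS_RDONLY is not strictly necessary: squashfs_fill_super() forces the flag itself, and squashfs_remount() re-asserts it on every remount.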
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c new file mode 100644 index 000000000000..83d87880aac8 --- /dev/null +++ b/fs/squashfs/symlink.c | |||
@@ -0,0 +1,118 @@ | |||
1 | /* | ||
2 | * Squashfs - a compressed read only filesystem for Linux | ||
3 | * | ||
4 | * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 | ||
5 | * Phillip Lougher <phillip@lougher.demon.co.uk> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version 2, | ||
10 | * or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
20 | * | ||
21 | * symlink.c | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * This file implements code to handle symbolic links. | ||
26 | * | ||
27 | * The data contents of symbolic links are stored inside the symbolic | ||
28 | * link inode within the inode table. This allows the normally small symbolic | ||
29 | * link to be compressed as part of the inode table, achieving much greater | ||
30 | * compression than if the symbolic link was compressed individually. | ||
31 | */ | ||
32 | |||
33 | #include <linux/fs.h> | ||
34 | #include <linux/vfs.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/slab.h> | ||
37 | #include <linux/string.h> | ||
38 | #include <linux/pagemap.h> | ||
39 | #include <linux/zlib.h> | ||
40 | |||
41 | #include "squashfs_fs.h" | ||
42 | #include "squashfs_fs_sb.h" | ||
43 | #include "squashfs_fs_i.h" | ||
44 | #include "squashfs.h" | ||
45 | |||
46 | static int squashfs_symlink_readpage(struct file *file, struct page *page) | ||
47 | { | ||
48 | struct inode *inode = page->mapping->host; | ||
49 | struct super_block *sb = inode->i_sb; | ||
50 | struct squashfs_sb_info *msblk = sb->s_fs_info; | ||
51 | int index = page->index << PAGE_CACHE_SHIFT; | ||
52 | u64 block = squashfs_i(inode)->start; | ||
53 | int offset = squashfs_i(inode)->offset; | ||
54 | int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE); | ||
55 | int bytes, copied; | ||
56 | void *pageaddr; | ||
57 | struct squashfs_cache_entry *entry; | ||
58 | |||
59 | TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " | ||
60 | "%llx, offset %x\n", page->index, block, offset); | ||
61 | |||
62 | /* | ||
63 | * Skip index bytes into symlink metadata. | ||
64 | */ | ||
65 | if (index) { | ||
66 | bytes = squashfs_read_metadata(sb, NULL, &block, &offset, | ||
67 | index); | ||
68 | if (bytes < 0) { | ||
69 | ERROR("Unable to read symlink [%llx:%x]\n", | ||
70 | squashfs_i(inode)->start, | ||
71 | squashfs_i(inode)->offset); | ||
72 | goto error_out; | ||
73 | } | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Read length bytes from symlink metadata. Squashfs_read_metadata | ||
78 | * is not used here because it can sleep and we want to use | ||
79 | * kmap_atomic to map the page. Instead call the underlying | ||
80 | * squashfs_cache_get routine. As length bytes may overlap metadata | ||
81 | * blocks, we may need to call squashfs_cache_get multiple times. | ||
82 | */ | ||
83 | for (bytes = 0; bytes < length; offset = 0, bytes += copied) { | ||
84 | entry = squashfs_cache_get(sb, msblk->block_cache, block, 0); | ||
85 | if (entry->error) { | ||
86 | ERROR("Unable to read symlink [%llx:%x]\n", | ||
87 | squashfs_i(inode)->start, | ||
88 | squashfs_i(inode)->offset); | ||
89 | squashfs_cache_put(entry); | ||
90 | goto error_out; | ||
91 | } | ||
92 | |||
93 | pageaddr = kmap_atomic(page, KM_USER0); | ||
94 | copied = squashfs_copy_data(pageaddr + bytes, entry, offset, | ||
95 | length - bytes); | ||
96 | if (copied == length - bytes) | ||
97 | memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length); | ||
98 | else | ||
99 | block = entry->next_index; | ||
100 | kunmap_atomic(pageaddr, KM_USER0); | ||
101 | squashfs_cache_put(entry); | ||
102 | } | ||
103 | |||
104 | flush_dcache_page(page); | ||
105 | SetPageUptodate(page); | ||
106 | unlock_page(page); | ||
107 | return 0; | ||
108 | |||
109 | error_out: | ||
110 | SetPageError(page); | ||
111 | unlock_page(page); | ||
112 | return 0; | ||
113 | } | ||
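The loop above is the generic pattern for copying a byte range that may straddle fixed-size cached blocks: copy what the current block holds starting at (block, offset), then continue at offset 0 of the block named by entry->next_index. A freestanding sketch of the same walk over an in-memory array (hypothetical names, not the squashfs cache API):

    #include <string.h>

    #define BLK 8192    /* stand-in for SQUASHFS_METADATA_SIZE */

    /* copy `length` bytes starting at (block, offset) into dst,
     * crossing block boundaries as needed */
    static void copy_span(char *dst, char blocks[][BLK],
                          int block, int offset, int length)
    {
        int bytes, copied;

        for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
            copied = BLK - offset;
            if (copied > length - bytes)
                copied = length - bytes;    /* final, partial block */
            memcpy(dst + bytes, blocks[block] + offset, copied);
            block++;                        /* next block, offset 0 */
        }
    }

The kernel version additionally zero-fills the page tail once the last chunk has been copied, because a partially filled page cache page must not expose stale data.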
114 | |||
115 | |||
116 | const struct address_space_operations squashfs_symlink_aops = { | ||
117 | .readpage = squashfs_symlink_readpage | ||
118 | }; | ||
diff --git a/fs/super.c b/fs/super.c index ddba069d7a99..ed080c417167 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/kobject.h> | 38 | #include <linux/kobject.h> |
39 | #include <linux/mutex.h> | 39 | #include <linux/mutex.h> |
40 | #include <linux/file.h> | 40 | #include <linux/file.h> |
41 | #include <linux/async.h> | ||
41 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
42 | #include "internal.h" | 43 | #include "internal.h" |
43 | 44 | ||
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) | |||
71 | INIT_HLIST_HEAD(&s->s_anon); | 72 | INIT_HLIST_HEAD(&s->s_anon); |
72 | INIT_LIST_HEAD(&s->s_inodes); | 73 | INIT_LIST_HEAD(&s->s_inodes); |
73 | INIT_LIST_HEAD(&s->s_dentry_lru); | 74 | INIT_LIST_HEAD(&s->s_dentry_lru); |
75 | INIT_LIST_HEAD(&s->s_async_list); | ||
74 | init_rwsem(&s->s_umount); | 76 | init_rwsem(&s->s_umount); |
75 | mutex_init(&s->s_lock); | 77 | mutex_init(&s->s_lock); |
76 | lockdep_set_class(&s->s_umount, &type->s_umount_key); | 78 | lockdep_set_class(&s->s_umount, &type->s_umount_key); |
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb) | |||
289 | { | 291 | { |
290 | const struct super_operations *sop = sb->s_op; | 292 | const struct super_operations *sop = sb->s_op; |
291 | 293 | ||
294 | |||
292 | if (sb->s_root) { | 295 | if (sb->s_root) { |
293 | shrink_dcache_for_umount(sb); | 296 | shrink_dcache_for_umount(sb); |
294 | fsync_super(sb); | 297 | fsync_super(sb); |
295 | lock_super(sb); | 298 | lock_super(sb); |
296 | sb->s_flags &= ~MS_ACTIVE; | 299 | sb->s_flags &= ~MS_ACTIVE; |
300 | |||
301 | /* | ||
302 | * wait for asynchronous fs operations to finish before going further | ||
303 | */ | ||
304 | async_synchronize_full_special(&sb->s_async_list); | ||
305 | |||
297 | /* bad name - it should be evict_inodes() */ | 306 | /* bad name - it should be evict_inodes() */ |
298 | invalidate_inodes(sb); | 307 | invalidate_inodes(sb); |
299 | lock_kernel(); | 308 | lock_kernel(); |
@@ -461,6 +470,7 @@ restart: | |||
461 | sb->s_count++; | 470 | sb->s_count++; |
462 | spin_unlock(&sb_lock); | 471 | spin_unlock(&sb_lock); |
463 | down_read(&sb->s_umount); | 472 | down_read(&sb->s_umount); |
473 | async_synchronize_full_special(&sb->s_async_list); | ||
464 | if (sb->s_root && (wait || sb->s_dirt)) | 474 | if (sb->s_root && (wait || sb->s_dirt)) |
465 | sb->s_op->sync_fs(sb, wait); | 475 | sb->s_op->sync_fs(sb, wait); |
466 | up_read(&sb->s_umount); | 476 | up_read(&sb->s_umount); |
@@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type, | |||
800 | } | 810 | } |
801 | 811 | ||
802 | s->s_flags |= MS_ACTIVE; | 812 | s->s_flags |= MS_ACTIVE; |
813 | bdev->bd_super = s; | ||
803 | } | 814 | } |
804 | 815 | ||
805 | return simple_set_mnt(mnt, s); | 816 | return simple_set_mnt(mnt, s); |
@@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb) | |||
819 | struct block_device *bdev = sb->s_bdev; | 830 | struct block_device *bdev = sb->s_bdev; |
820 | fmode_t mode = sb->s_mode; | 831 | fmode_t mode = sb->s_mode; |
821 | 832 | ||
833 | bdev->bd_super = NULL; | ||
822 | generic_shutdown_super(sb); | 834 | generic_shutdown_super(sb); |
823 | sync_blockdev(bdev); | 835 | sync_blockdev(bdev); |
824 | close_bdev_exclusive(bdev, mode); | 836 | close_bdev_exclusive(bdev, mode); |
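The async_synchronize_full_special() calls added above act as barriers against the new per-superblock list sb->s_async_list: unmount and sync must not proceed while deferred work against the filesystem is still in flight. A filesystem that punts work to the async infrastructure would queue it on that list with the matching scheduling call, roughly as follows (a hedged sketch; async_schedule_special() is assumed from the same kernel/async.c infrastructure, and the helper names are hypothetical):

    #include <linux/async.h>
    #include <linux/fs.h>

    /* deferred teardown, executed on an async thread */
    static void example_async_iput(void *data, async_cookie_t cookie)
    {
        struct inode *inode = data;
        /* ... expensive, sleepable cleanup ... */
        iput(inode);
    }

    static void example_defer_iput(struct inode *inode)
    {
        /* queued on the per-super list so generic_shutdown_super()
         * and the sync path can wait for it via
         * async_synchronize_full_special(&sb->s_async_list) */
        async_schedule_special(example_async_iput, inode,
                               &inode->i_sb->s_async_list);
    }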
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -295,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, | |||
295 | 295 | ||
296 | if (flags & SYNC_FILE_RANGE_WRITE) { | 296 | if (flags & SYNC_FILE_RANGE_WRITE) { |
297 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | 297 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, |
298 | WB_SYNC_NONE); | 298 | WB_SYNC_ALL); |
299 | if (ret < 0) | 299 | if (ret < 0) |
300 | goto out; | 300 | goto out; |
301 | } | 301 | } |
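This one-word change means SYNC_FILE_RANGE_WRITE now starts writeback with WB_SYNC_ALL, so no dirty page in the range is silently skipped (WB_SYNC_NONE allows writeback to give up on congested queues). For reference, the syscall a user would issue to exercise this path looks like the following (illustrative parameters):

    #define _GNU_SOURCE
    #include <fcntl.h>

    /* write out the first 1 MiB of fd and wait for it to hit disk */
    static int flush_first_mib(int fd)
    {
        return sync_file_range(fd, 0, 1 << 20,
                               SYNC_FILE_RANGE_WAIT_BEFORE |
                               SYNC_FILE_RANGE_WRITE |
                               SYNC_FILE_RANGE_WAIT_AFTER);
    }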
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 91ceeda7e5bf..e35b54d5059d 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig | |||
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB | |||
40 | depends on UBIFS_FS | 40 | depends on UBIFS_FS |
41 | default y | 41 | default y |
42 | help | 42 | help |
43 | Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. | 43 | Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. |
44 | 44 | ||
45 | # Debugging-related stuff | 45 | # Debugging-related stuff |
46 | config UBIFS_FS_DEBUG | 46 | config UBIFS_FS_DEBUG |
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 0e5e54d82924..175f9c590b77 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c | |||
@@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c) | |||
142 | * | 142 | * |
143 | * This function is called when an operation cannot be budgeted because there | 143 | * This function is called when an operation cannot be budgeted because there |
144 | * is supposedly no free space. But in most cases there is some free space: | 144 | * is supposedly no free space. But in most cases there is some free space: |
145 | * o budgeting is pessimistic, so it always budgets more then it is actually | 145 | * o budgeting is pessimistic, so it always budgets more than it is actually |
146 | * needed, so shrinking the liability is one way to make free space - the | 146 | * needed, so shrinking the liability is one way to make free space - the |
147 | * cached data will take less space than it was budgeted for; | 147 | * cached data will take less space than it was budgeted for; |
148 | * o GC may turn some dark space into free space (budgeting treats dark space | 148 | * o GC may turn some dark space into free space (budgeting treats dark space |
@@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) | |||
606 | * @c: UBIFS file-system description object | 606 | * @c: UBIFS file-system description object |
607 | * | 607 | * |
608 | * This function converts budget which was allocated for a new page of data to | 608 | * This function converts budget which was allocated for a new page of data to |
609 | * the budget of changing an existing page of data. The latter is smaller then | 609 | * the budget of changing an existing page of data. The latter is smaller than |
610 | * the former, so this function only does simple re-calculation and does not | 610 | * the former, so this function only does simple re-calculation and does not |
611 | * involve any write-back. | 611 | * involve any write-back. |
612 | */ | 612 | */ |
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 0bef6501d58a..9832f9abe28e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c | |||
@@ -45,7 +45,7 @@ | |||
45 | #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ | 45 | #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ |
46 | 46 | ||
47 | /* | 47 | /* |
48 | * GC may need to move more then one LEB to make progress. The below constants | 48 | * GC may need to move more than one LEB to make progress. The below constants |
49 | * define "soft" and "hard" limits on the number of LEBs the garbage collector | 49 | * define "soft" and "hard" limits on the number of LEBs the garbage collector |
50 | * may move. | 50 | * may move. |
51 | */ | 51 | */ |
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 10ae25b7d1db..9b7c54e0cd2a 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c | |||
@@ -191,7 +191,7 @@ again: | |||
191 | if (wbuf->lnum != -1 && avail >= len) { | 191 | if (wbuf->lnum != -1 && avail >= len) { |
192 | /* | 192 | /* |
193 | * Someone else has switched the journal head and we have | 193 | * Someone else has switched the journal head and we have |
194 | * enough space now. This happens when more then one process is | 194 | * enough space now. This happens when more than one process is |
195 | * trying to write to the same journal head at the same time. | 195 | * trying to write to the same journal head at the same time. |
196 | */ | 196 | */ |
197 | dbg_jnl("return LEB %d back, already have LEB %d:%d", | 197 | dbg_jnl("return LEB %d back, already have LEB %d:%d", |
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f248533841a2..e7bab52a1410 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c | |||
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) | |||
151 | * @contention: if any contention, this is set to %1 | 151 | * @contention: if any contention, this is set to %1 |
152 | * | 152 | * |
153 | * This function walks the list of mounted UBIFS file-systems and frees clean | 153 | * This function walks the list of mounted UBIFS file-systems and frees clean |
154 | * znodes which are older then @age, until at least @nr znodes are freed. | 154 | * znodes which are older than @age, until at least @nr znodes are freed. |
155 | * Returns the number of freed znodes. | 155 | * Returns the number of freed znodes. |
156 | */ | 156 | */ |
157 | static int shrink_tnc_trees(int nr, int age, int *contention) | 157 | static int shrink_tnc_trees(int nr, int age, int *contention) |
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0d7564b95f8e..89556ee72518 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
@@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) | |||
432 | int i, err; | 432 | int i, err; |
433 | struct ubifs_info *c = sb->s_fs_info; | 433 | struct ubifs_info *c = sb->s_fs_info; |
434 | struct writeback_control wbc = { | 434 | struct writeback_control wbc = { |
435 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, | 435 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, |
436 | .range_start = 0, | 436 | .range_start = 0, |
437 | .range_end = LLONG_MAX, | 437 | .range_end = LLONG_MAX, |
438 | .nr_to_write = LONG_MAX, | 438 | .nr_to_write = LONG_MAX, |
439 | }; | 439 | }; |
440 | 440 | ||
441 | /* | ||
442 | * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an | ||
443 | * advisory thing to help the file system shove lots of data into the | ||
444 | * queues. If some gets missed then it'll be picked up on the second | ||
445 | * '->sync_fs()' call, with non-zero @wait. | ||
446 | */ | ||
447 | |||
441 | if (sb->s_flags & MS_RDONLY) | 448 | if (sb->s_flags & MS_RDONLY) |
442 | return 0; | 449 | return 0; |
443 | 450 | ||
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 67205f6198ba..e5be1e0be802 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c | |||
@@ -1546,21 +1546,6 @@ xfs_file_ioctl( | |||
1546 | return -error; | 1546 | return -error; |
1547 | } | 1547 | } |
1548 | 1548 | ||
1549 | case XFS_IOC_FREEZE: | ||
1550 | if (!capable(CAP_SYS_ADMIN)) | ||
1551 | return -EPERM; | ||
1552 | |||
1553 | if (inode->i_sb->s_frozen == SB_UNFROZEN) | ||
1554 | freeze_bdev(inode->i_sb->s_bdev); | ||
1555 | return 0; | ||
1556 | |||
1557 | case XFS_IOC_THAW: | ||
1558 | if (!capable(CAP_SYS_ADMIN)) | ||
1559 | return -EPERM; | ||
1560 | if (inode->i_sb->s_frozen != SB_UNFROZEN) | ||
1561 | thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); | ||
1562 | return 0; | ||
1563 | |||
1564 | case XFS_IOC_GOINGDOWN: { | 1549 | case XFS_IOC_GOINGDOWN: { |
1565 | __uint32_t in; | 1550 | __uint32_t in; |
1566 | 1551 | ||
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 0504cece9f66..50903ad3182e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c | |||
@@ -632,8 +632,6 @@ xfs_file_compat_ioctl( | |||
632 | case XFS_IOC_SET_RESBLKS: | 632 | case XFS_IOC_SET_RESBLKS: |
633 | case XFS_IOC_GET_RESBLKS: | 633 | case XFS_IOC_GET_RESBLKS: |
634 | case XFS_IOC_FSGROWFSLOG: | 634 | case XFS_IOC_FSGROWFSLOG: |
635 | case XFS_IOC_FREEZE: | ||
636 | case XFS_IOC_THAW: | ||
637 | case XFS_IOC_GOINGDOWN: | 635 | case XFS_IOC_GOINGDOWN: |
638 | case XFS_IOC_ERROR_INJECTION: | 636 | case XFS_IOC_ERROR_INJECTION: |
639 | case XFS_IOC_ERROR_CLEARALL: | 637 | case XFS_IOC_ERROR_CLEARALL: |
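Both the native and the compat ioctl tables drop XFS_IOC_FREEZE/XFS_IOC_THAW here because the VFS now implements FIFREEZE/FITHAW generically at the very same 'X'/119 and 'X'/120 numbers (see the fs/xfs/xfs_fs.h hunk below), so existing xfs_freeze binaries keep working unmodified. From userspace the generic path looks like this (illustrative; requires CAP_SYS_ADMIN):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>   /* FIFREEZE, FITHAW */

    /* freeze, then immediately thaw, the filesystem behind mntpoint */
    static int freeze_thaw(const char *mntpoint)
    {
        int err, fd = open(mntpoint, O_RDONLY);

        if (fd < 0)
            return -1;
        err = ioctl(fd, FIFREEZE, 0);
        if (err == 0)
            err = ioctl(fd, FITHAW, 0);
        close(fd);
        return err;
    }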
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 36f6cc703ef2..95a971080368 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
@@ -1269,14 +1269,14 @@ xfs_fs_remount( | |||
1269 | * need to take care of the metadata. Once that's done write a dummy | 1269 | * need to take care of the metadata. Once that's done write a dummy |
1270 | * record to dirty the log in case of a crash while frozen. | 1270 | * record to dirty the log in case of a crash while frozen. |
1271 | */ | 1271 | */ |
1272 | STATIC void | 1272 | STATIC int |
1273 | xfs_fs_lockfs( | 1273 | xfs_fs_freeze( |
1274 | struct super_block *sb) | 1274 | struct super_block *sb) |
1275 | { | 1275 | { |
1276 | struct xfs_mount *mp = XFS_M(sb); | 1276 | struct xfs_mount *mp = XFS_M(sb); |
1277 | 1277 | ||
1278 | xfs_quiesce_attr(mp); | 1278 | xfs_quiesce_attr(mp); |
1279 | xfs_fs_log_dummy(mp); | 1279 | return -xfs_fs_log_dummy(mp); |
1280 | } | 1280 | } |
1281 | 1281 | ||
1282 | STATIC int | 1282 | STATIC int |
@@ -1348,7 +1348,7 @@ xfs_finish_flags( | |||
1348 | { | 1348 | { |
1349 | int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); | 1349 | int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); |
1350 | 1350 | ||
1351 | /* Fail a mount where the logbuf is smaller then the log stripe */ | 1351 | /* Fail a mount where the logbuf is smaller than the log stripe */ |
1352 | if (xfs_sb_version_haslogv2(&mp->m_sb)) { | 1352 | if (xfs_sb_version_haslogv2(&mp->m_sb)) { |
1353 | if (mp->m_logbsize <= 0 && | 1353 | if (mp->m_logbsize <= 0 && |
1354 | mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { | 1354 | mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { |
@@ -1557,7 +1557,7 @@ static struct super_operations xfs_super_operations = { | |||
1557 | .put_super = xfs_fs_put_super, | 1557 | .put_super = xfs_fs_put_super, |
1558 | .write_super = xfs_fs_write_super, | 1558 | .write_super = xfs_fs_write_super, |
1559 | .sync_fs = xfs_fs_sync_super, | 1559 | .sync_fs = xfs_fs_sync_super, |
1560 | .write_super_lockfs = xfs_fs_lockfs, | 1560 | .freeze_fs = xfs_fs_freeze, |
1561 | .statfs = xfs_fs_statfs, | 1561 | .statfs = xfs_fs_statfs, |
1562 | .remount_fs = xfs_fs_remount, | 1562 | .remount_fs = xfs_fs_remount, |
1563 | .show_options = xfs_fs_show_options, | 1563 | .show_options = xfs_fs_show_options, |
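Renaming write_super_lockfs to freeze_fs also changes the contract from void to int: a filesystem can now refuse to freeze, which is exactly what xfs_fs_freeze() above exploits by propagating the dummy-log-record failure (the fs/xfs/xfs_fsops.c hunk below makes xfs_fs_log_dummy() return an error for this purpose). The shape any filesystem now implements, sketched with a hypothetical helper:

    static int examplefs_freeze(struct super_block *sb)
    {
        int err;

        err = examplefs_quiesce(sb);    /* hypothetical: flush metadata */
        if (err)
            return err;                 /* abort the freeze cleanly */
        return 0;
    }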
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 589c41c38446..f7c06fac8229 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h | |||
@@ -465,8 +465,8 @@ typedef struct xfs_handle { | |||
465 | #define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) | 465 | #define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) |
466 | #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) | 466 | #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) |
467 | /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ | 467 | /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ |
468 | #define XFS_IOC_FREEZE _IOWR('X', 119, int) | 468 | /* XFS_IOC_FREEZE -- FIFREEZE 119 */ |
469 | #define XFS_IOC_THAW _IOWR('X', 120, int) | 469 | /* XFS_IOC_THAW -- FITHAW 120 */ |
470 | #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) | 470 | #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) |
471 | #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) | 471 | #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) |
472 | #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) | 472 | #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) |
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 852b6d32e8d0..680d0e0ec932 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
@@ -595,17 +595,19 @@ out: | |||
595 | return 0; | 595 | return 0; |
596 | } | 596 | } |
597 | 597 | ||
598 | void | 598 | int |
599 | xfs_fs_log_dummy( | 599 | xfs_fs_log_dummy( |
600 | xfs_mount_t *mp) | 600 | xfs_mount_t *mp) |
601 | { | 601 | { |
602 | xfs_trans_t *tp; | 602 | xfs_trans_t *tp; |
603 | xfs_inode_t *ip; | 603 | xfs_inode_t *ip; |
604 | int error; | ||
604 | 605 | ||
605 | tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); | 606 | tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); |
606 | if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { | 607 | error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); |
608 | if (error) { | ||
607 | xfs_trans_cancel(tp, 0); | 609 | xfs_trans_cancel(tp, 0); |
608 | return; | 610 | return error; |
609 | } | 611 | } |
610 | 612 | ||
611 | ip = mp->m_rootip; | 613 | ip = mp->m_rootip; |
@@ -615,9 +617,10 @@ xfs_fs_log_dummy( | |||
615 | xfs_trans_ihold(tp, ip); | 617 | xfs_trans_ihold(tp, ip); |
616 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 618 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
617 | xfs_trans_set_sync(tp); | 619 | xfs_trans_set_sync(tp); |
618 | xfs_trans_commit(tp, 0); | 620 | error = xfs_trans_commit(tp, 0); |
619 | 621 | ||
620 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 622 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
623 | return error; | ||
621 | } | 624 | } |
622 | 625 | ||
623 | int | 626 | int |
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 300d0c9d61ad..88435e0a77c9 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h | |||
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); | |||
25 | extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, | 25 | extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, |
26 | xfs_fsop_resblks_t *outval); | 26 | xfs_fsop_resblks_t *outval); |
27 | extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); | 27 | extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); |
28 | extern void xfs_fs_log_dummy(xfs_mount_t *mp); | 28 | extern int xfs_fs_log_dummy(xfs_mount_t *mp); |
29 | 29 | ||
30 | #endif /* __XFS_FSOPS_H__ */ | 30 | #endif /* __XFS_FSOPS_H__ */ |