Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig            |    2
-rw-r--r--  fs/btrfs/Makefile           |    2
-rw-r--r--  fs/btrfs/acl.c              |   21
-rw-r--r--  fs/btrfs/btrfs_inode.h      |    5
-rw-r--r--  fs/btrfs/compression.c      |  371
-rw-r--r--  fs/btrfs/compression.h      |   72
-rw-r--r--  fs/btrfs/ctree.c            |  167
-rw-r--r--  fs/btrfs/ctree.h            |   90
-rw-r--r--  fs/btrfs/delayed-ref.c      |    6
-rw-r--r--  fs/btrfs/dir-item.c         |   45
-rw-r--r--  fs/btrfs/disk-io.c          |  640
-rw-r--r--  fs/btrfs/disk-io.h          |    1
-rw-r--r--  fs/btrfs/export.c           |   10
-rw-r--r--  fs/btrfs/extent-tree.c      |  566
-rw-r--r--  fs/btrfs/extent_io.c        |  313
-rw-r--r--  fs/btrfs/extent_io.h        |   22
-rw-r--r--  fs/btrfs/extent_map.c       |    8
-rw-r--r--  fs/btrfs/extent_map.h       |    3
-rw-r--r--  fs/btrfs/file-item.c        |   10
-rw-r--r--  fs/btrfs/file.c             |  602
-rw-r--r--  fs/btrfs/free-space-cache.c |  863
-rw-r--r--  fs/btrfs/free-space-cache.h |    2
-rw-r--r--  fs/btrfs/inode-map.c        |    3
-rw-r--r--  fs/btrfs/inode.c            |  936
-rw-r--r--  fs/btrfs/ioctl.c            |  362
-rw-r--r--  fs/btrfs/ioctl.h            |   12
-rw-r--r--  fs/btrfs/lzo.c              |  427
-rw-r--r--  fs/btrfs/ordered-data.c     |   28
-rw-r--r--  fs/btrfs/ordered-data.h     |    8
-rw-r--r--  fs/btrfs/print-tree.c       |    1
-rw-r--r--  fs/btrfs/relocation.c       |   53
-rw-r--r--  fs/btrfs/root-tree.c        |   24
-rw-r--r--  fs/btrfs/super.c            |  359
-rw-r--r--  fs/btrfs/transaction.c      |   80
-rw-r--r--  fs/btrfs/transaction.h      |    5
-rw-r--r--  fs/btrfs/tree-log.c         |   92
-rw-r--r--  fs/btrfs/volumes.c          |  895
-rw-r--r--  fs/btrfs/volumes.h          |   39
-rw-r--r--  fs/btrfs/xattr.c            |   59
-rw-r--r--  fs/btrfs/xattr.h            |    3
-rw-r--r--  fs/btrfs/zlib.c             |  372
41 files changed, 5444 insertions, 2135 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ae2c8cac9d5..5d505aaa72fb 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	char *value = NULL;
 	struct posix_acl *acl;
 
+	if (!IS_POSIXACL(inode))
+		return NULL;
+
 	acl = get_cached_acl(inode, type);
 	if (acl != ACL_NOT_CACHED)
 		return acl;
@@ -60,8 +63,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		size = __btrfs_getxattr(inode, name, value, size);
 		if (size > 0) {
 			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl))
+			if (IS_ERR(acl)) {
+				kfree(value);
 				return acl;
+			}
 			set_cached_acl(inode, type, acl);
 		}
 		kfree(value);
@@ -82,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
 	struct posix_acl *acl;
 	int ret = 0;
 
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	acl = btrfs_get_acl(dentry->d_inode, type);
 
 	if (IS_ERR(acl))
@@ -162,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	int ret;
 	struct posix_acl *acl = NULL;
 
-	if (!is_owner_or_cap(dentry->d_inode))
+	if (!inode_owner_or_capable(dentry->d_inode))
 		return -EPERM;
 
 	if (!IS_POSIXACL(dentry->d_inode))
@@ -170,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
-		if (acl == NULL) {
-			value = NULL;
-			size = 0;
+		if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto out;
 		} else if (IS_ERR(acl)) {
 			return PTR_ERR(acl);
 		}
 	}
 
 	ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-
+out:
 	posix_acl_release(acl);
 
 	return ret;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..57c3bb2884ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -136,9 +136,8 @@ struct btrfs_inode {
 	 * items we think we'll end up using, and reserved_extents is the number
 	 * of extent items we've reserved metadata for.
 	 */
-	spinlock_t accounting_lock;
 	atomic_t outstanding_extents;
-	int reserved_extents;
+	atomic_t reserved_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -157,7 +156,7 @@ struct btrfs_inode {
 	/*
 	 * always compress this one file
 	 */
-	unsigned force_compress:1;
+	unsigned force_compress:4;
 
 	struct inode vfs_inode;
 };
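The force_compress field grows from one bit to four because it now has to hold a btrfs_compression_type value (see the enum change in ctree.h below), not just a yes/no flag. A minimal illustration; the helper name is hypothetical and not part of this patch:

	/* Hypothetical helper: a 4-bit field holds any current
	 * btrfs_compression_type value (NONE=0, ZLIB=1, LZO=2). */
	static void demo_force_lzo(struct btrfs_inode *bi)
	{
		bi->force_compress = BTRFS_COMPRESS_LZO;
	}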
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b50bc4bd5c56..41d1d7c70e29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
 	/* number of bytes on disk */
 	unsigned long compressed_len;
 
+	/* the compression algorithm for this bio */
+	int compress_type;
+
 	/* number of compressed pages in the array */
 	unsigned long nr_pages;
 
@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
-					cb->start,
-					cb->orig_bio->bi_io_vec,
-					cb->orig_bio->bi_vcnt,
-					cb->compressed_len);
+	ret = btrfs_decompress_biovec(cb->compress_type,
+				      cb->compressed_pages,
+				      cb->start,
+				      cb->orig_bio->bi_io_vec,
+				      cb->orig_bio->bi_vcnt,
+				      cb->compressed_len);
 csum_failed:
 	if (ret)
 		cb->errors = 1;
@@ -336,6 +340,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+	if (!cb)
+		return -ENOMEM;
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
@@ -350,6 +356,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	if (!bio) {
+		kfree(cb);
+		return -ENOMEM;
+	}
 	bio->bi_private = cb;
 	bio->bi_end_io = end_compressed_bio_write;
 	atomic_inc(&cb->pending_bios);
@@ -558,7 +568,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_len;
 	u64 em_start;
 	struct extent_map *em;
-	int ret;
+	int ret = -ENOMEM;
 	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
@@ -573,6 +583,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+	if (!cb)
+		goto out;
+
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
@@ -588,17 +601,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	cb->len = uncompressed_len;
 	cb->compressed_len = compressed_len;
+	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
 	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
 				 PAGE_CACHE_SIZE;
-	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
 				       GFP_NOFS);
+	if (!cb->compressed_pages)
+		goto fail1;
+
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	for (page_index = 0; page_index < nr_pages; page_index++) {
 		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
 							      __GFP_HIGHMEM);
+		if (!cb->compressed_pages[page_index])
+			goto fail2;
 	}
 	cb->nr_pages = nr_pages;
 
@@ -609,6 +628,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->len = uncompressed_len;
 
 	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	if (!comp_bio)
+		goto fail2;
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
 	atomic_inc(&cb->pending_bios);
@@ -642,8 +663,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			atomic_inc(&cb->pending_bios);
 
 			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-				btrfs_lookup_bio_sums(root, inode, comp_bio,
-						      sums);
+				ret = btrfs_lookup_bio_sums(root, inode,
+							comp_bio, sums);
+				BUG_ON(ret);
 			}
 			sums += (comp_bio->bi_size + root->sectorsize - 1) /
 				root->sectorsize;
@@ -668,12 +690,339 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+		ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
 	BUG_ON(ret);
 
 	bio_put(comp_bio);
 	return 0;
+
+fail2:
+	for (page_index = 0; page_index < nr_pages; page_index++)
+		free_page((unsigned long)cb->compressed_pages[page_index]);
+
+	kfree(cb->compressed_pages);
+fail1:
+	kfree(cb);
+out:
+	free_extent_map(em);
+	return ret;
+}
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+	&btrfs_zlib_compress,
+	&btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		INIT_LIST_HEAD(&comp_idle_workspace[i]);
+		spin_lock_init(&comp_workspace_lock[i]);
+		atomic_set(&comp_alloc_workspace[i], 0);
+		init_waitqueue_head(&comp_workspace_wait[i]);
+	}
+	return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+	struct list_head *workspace;
+	int cpus = num_online_cpus();
+	int idx = type - 1;
+
+	struct list_head *idle_workspace = &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+	int *num_workspace = &comp_num_workspace[idx];
+again:
+	spin_lock(workspace_lock);
+	if (!list_empty(idle_workspace)) {
+		workspace = idle_workspace->next;
+		list_del(workspace);
+		(*num_workspace)--;
+		spin_unlock(workspace_lock);
+		return workspace;
+	}
+	if (atomic_read(alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+
+		spin_unlock(workspace_lock);
+		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+			schedule();
+		finish_wait(workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(alloc_workspace);
+	spin_unlock(workspace_lock);
+
+	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	if (IS_ERR(workspace)) {
+		atomic_dec(alloc_workspace);
+		wake_up(workspace_wait);
+	}
+	return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+	int idx = type - 1;
+	struct list_head *idle_workspace = &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+	int *num_workspace = &comp_num_workspace[idx];
+
+	spin_lock(workspace_lock);
+	if (*num_workspace < num_online_cpus()) {
+		list_add_tail(workspace, idle_workspace);
+		(*num_workspace)++;
+		spin_unlock(workspace_lock);
+		goto wake;
+	}
+	spin_unlock(workspace_lock);
+
+	btrfs_compress_op[idx]->free_workspace(workspace);
+	atomic_dec(alloc_workspace);
+wake:
+	if (waitqueue_active(workspace_wait))
+		wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+	struct list_head *workspace;
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		while (!list_empty(&comp_idle_workspace[i])) {
+			workspace = comp_idle_workspace[i].next;
+			list_del(workspace);
+			btrfs_compress_op[i]->free_workspace(workspace);
+			atomic_dec(&comp_alloc_workspace[i]);
+		}
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we crossed the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -1;
+
+	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+						      start, len, pages,
+						      nr_dest_pages, out_pages,
+						      total_in, total_out,
+						      max_out);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+							 disk_start,
+							 bvec, vcnt, srclen);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+						  dest_page, start_byte,
+						  srclen, destlen);
+
+	free_workspace(type, workspace);
+	return ret;
+}
+
+void btrfs_exit_compress(void)
+{
+	free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset of the start of our working buffer
+ * within the uncompressed data.
+ *
+ * total_out is the offset of the last byte of the working buffer's
+ * data in the uncompressed stream
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset)
+{
+	unsigned long buf_offset;
+	unsigned long current_buf_start;
+	unsigned long start_byte;
+	unsigned long working_bytes = total_out - buf_start;
+	unsigned long bytes;
+	char *kaddr;
+	struct page *page_out = bvec[*page_index].bv_page;
+
+	/*
+	 * start byte is the first byte of the page we're currently
+	 * copying into relative to the start of the compressed data.
+	 */
+	start_byte = page_offset(page_out) - disk_start;
+
+	/* we haven't yet hit data corresponding to this page */
+	if (total_out <= start_byte)
+		return 1;
+
+	/*
+	 * the start of the data we care about is offset into
+	 * the middle of our working buffer
+	 */
+	if (total_out > start_byte && buf_start < start_byte) {
+		buf_offset = start_byte - buf_start;
+		working_bytes -= buf_offset;
+	} else {
+		buf_offset = 0;
+	}
+	current_buf_start = buf_start;
+
+	/* copy bytes from the working buffer into the pages */
+	while (working_bytes > 0) {
+		bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, working_bytes);
+		kaddr = kmap_atomic(page_out, KM_USER0);
+		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(page_out);
+
+		*pg_offset += bytes;
+		buf_offset += bytes;
+		working_bytes -= bytes;
+		current_buf_start += bytes;
+
+		/* check if we need to pick another page */
+		if (*pg_offset == PAGE_CACHE_SIZE) {
+			(*page_index)++;
+			if (*page_index >= vcnt)
+				return 0;
+
+			page_out = bvec[*page_index].bv_page;
+			*pg_offset = 0;
+			start_byte = page_offset(page_out) - disk_start;
+
+			/*
+			 * make sure our new page is covered by this
+			 * working buffer
+			 */
+			if (total_out <= start_byte)
+				return 1;
+
+			/*
+			 * the next page in the biovec might not be adjacent
+			 * to the last page, but it might still be found
+			 * inside this working buffer.  bump our offset pointer
+			 */
+			if (total_out > start_byte &&
+			    current_buf_start < start_byte) {
+				buf_offset = start_byte - buf_start;
+				working_bytes = total_out - start_byte;
+				current_buf_start = buf_start + buf_offset;
+			}
+		}
+	}
+
+	return 1;
 }
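The workspace pool above bounds per-type allocations at num_online_cpus() and parks idle workspaces on a list, so the hot path rarely allocates scratch memory. A hedged sketch of what a caller of the new entry point looks like; the helper name and buffer setup are illustrative, not taken from this patch:

	/* Compress a page-aligned range of `mapping` into caller-provided
	 * pages[]; type picks the backend (BTRFS_COMPRESS_ZLIB or _LZO). */
	static int demo_compress_range(struct address_space *mapping,
				       u64 start, unsigned long len,
				       struct page **pages,
				       unsigned long nr_pages)
	{
		unsigned long out_pages, total_in, total_out;
		int ret;

		ret = btrfs_compress_pages(BTRFS_COMPRESS_LZO, mapping,
					   start, len, pages, nr_pages,
					   &out_pages, &total_in, &total_out,
					   nr_pages * PAGE_CACHE_SIZE);
		if (ret)
			return ret;	/* range did not compress well */
		/* total_out bytes now live in pages[0..out_pages-1] */
		return 0;
	}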
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
 
-int btrfs_zlib_decompress(unsigned char *data_in,
-			  struct page *dest_page,
-			  unsigned long start_byte,
-			  size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-			      u64 start, unsigned long len,
-			      struct page **pages,
-			      unsigned long nr_dest_pages,
-			      unsigned long *out_pages,
-			      unsigned long *total_in,
-			      unsigned long *total_out,
-			      unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-				 u64 disk_start,
-				 struct bio_vec *bvec,
-				 int vcnt,
-				 size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long len, u64 disk_start,
 				  unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+	struct list_head *(*alloc_workspace)(void);
+
+	void (*free_workspace)(struct list_head *workspace);
+
+	int (*compress_pages)(struct list_head *workspace,
+			      struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+
+	int (*decompress_biovec)(struct list_head *workspace,
+				 struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen);
+
+	int (*decompress)(struct list_head *workspace,
+			  unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
 #endif
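With this table in place, a new algorithm plugs in by exporting a filled-in btrfs_compress_op and adding it to the btrfs_compress_op[] array in compression.c, which is indexed by type - 1. The lzo.c added by this series (not shown in this excerpt) does exactly that; the skeleton below is a hedged sketch with placeholder bodies, not its actual code:

	static struct list_head *demo_alloc_workspace(void)
	{
		/* allocate scratch buffers; return &ws->list or ERR_PTR() */
		return ERR_PTR(-ENOMEM);
	}

	static void demo_free_workspace(struct list_head *ws)
	{
		/* undo whatever demo_alloc_workspace() allocated */
	}

	struct btrfs_compress_op demo_compress_op = {
		.alloc_workspace	= demo_alloc_workspace,
		.free_workspace		= demo_free_workspace,
		/* .compress_pages, .decompress_biovec and .decompress
		 * are wired up the same way */
	};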
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..84d7ca1fe0ba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+	if (!p)
+		return;
 	btrfs_release_path(NULL, p);
 	kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -145,10 +147,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
-	spin_lock(&root->node_lock);
-	eb = root->node;
+
+	rcu_read_lock();
+	eb = rcu_dereference(root->node);
 	extent_buffer_get(eb);
-	spin_unlock(&root->node_lock);
+	rcu_read_unlock();
 	return eb;
 }
 
@@ -163,14 +166,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 	while (1) {
 		eb = btrfs_root_node(root);
 		btrfs_tree_lock(eb);
-
-		spin_lock(&root->node_lock);
-		if (eb == root->node) {
-			spin_unlock(&root->node_lock);
+		if (eb == root->node)
 			break;
-		}
-		spin_unlock(&root->node_lock);
-
 		btrfs_tree_unlock(eb);
 		free_extent_buffer(eb);
 	}
@@ -456,10 +453,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	else
 		parent_start = 0;
 
-	spin_lock(&root->node_lock);
-	root->node = cow;
 	extent_buffer_get(cow);
-	spin_unlock(&root->node_lock);
+	rcu_assign_pointer(root->node, cow);
 
 	btrfs_free_tree_block(trans, root, buf, parent_start,
 			      last_ref);
@@ -540,6 +535,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
+
+	trace_btrfs_cow_block(root, buf, *cow_ret);
+
 	return ret;
 }
 
@@ -684,6 +682,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		if (!cur) {
 			cur = read_tree_block(root, blocknr,
 					      blocksize, gen);
+			if (!cur)
+				return -EIO;
 		} else if (!uptodate) {
 			btrfs_read_buffer(cur, gen);
 		}
@@ -730,122 +730,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
-		      int level)
-{
-	struct extent_buffer *parent = NULL;
-	struct extent_buffer *node = path->nodes[level];
-	struct btrfs_disk_key parent_key;
-	struct btrfs_disk_key node_key;
-	int parent_slot;
-	int slot;
-	struct btrfs_key cpukey;
-	u32 nritems = btrfs_header_nritems(node);
-
-	if (path->nodes[level + 1])
-		parent = path->nodes[level + 1];
-
-	slot = path->slots[level];
-	BUG_ON(nritems == 0);
-	if (parent) {
-		parent_slot = path->slots[level + 1];
-		btrfs_node_key(parent, &parent_key, parent_slot);
-		btrfs_node_key(node, &node_key, 0);
-		BUG_ON(memcmp(&parent_key, &node_key,
-			      sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(node));
-	}
-	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
-	if (slot != 0) {
-		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
-		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
-	}
-	if (slot < nritems - 1) {
-		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
-		btrfs_node_key(node, &node_key, slot);
-		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
-	}
-	return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
-		      int level)
-{
-	struct extent_buffer *leaf = path->nodes[level];
-	struct extent_buffer *parent = NULL;
-	int parent_slot;
-	struct btrfs_key cpukey;
-	struct btrfs_disk_key parent_key;
-	struct btrfs_disk_key leaf_key;
-	int slot = path->slots[0];
-
-	u32 nritems = btrfs_header_nritems(leaf);
-
-	if (path->nodes[level + 1])
-		parent = path->nodes[level + 1];
-
-	if (nritems == 0)
-		return 0;
-
-	if (parent) {
-		parent_slot = path->slots[level + 1];
-		btrfs_node_key(parent, &parent_key, parent_slot);
-		btrfs_item_key(leaf, &leaf_key, 0);
-
-		BUG_ON(memcmp(&parent_key, &leaf_key,
-		       sizeof(struct btrfs_disk_key)));
-		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_bytenr(leaf));
-	}
-	if (slot != 0 && slot < nritems - 1) {
-		btrfs_item_key(leaf, &leaf_key, slot);
-		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
-		if (comp_keys(&leaf_key, &cpukey) <= 0) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad key\n", slot);
-			BUG_ON(1);
-		}
-		if (btrfs_item_offset_nr(leaf, slot - 1) !=
-		    btrfs_item_end_nr(leaf, slot)) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
-		}
-	}
-	if (slot < nritems - 1) {
-		btrfs_item_key(leaf, &leaf_key, slot);
-		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
-		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
-		if (btrfs_item_offset_nr(leaf, slot) !=
-		    btrfs_item_end_nr(leaf, slot + 1)) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d offset bad\n", slot);
-			BUG_ON(1);
-		}
-	}
-	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
-	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
-	return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
-				struct btrfs_path *path, int level)
-{
-	return 0;
-	if (level == 0)
-		return check_leaf(root, path, level);
-	return check_node(root, path, level);
-}
-
 
 /*
  * search for key in the extent_buffer.  The items start at offset p,
@@ -1044,9 +928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		goto enospc;
 	}
 
-	spin_lock(&root->node_lock);
-	root->node = child;
-	spin_unlock(&root->node_lock);
+	rcu_assign_pointer(root->node, child);
 
 	add_root_to_dirty_list(root);
 	btrfs_tree_unlock(child);
@@ -1186,7 +1068,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		}
 	}
 	/* double check we haven't messed things up */
-	check_block(root, path, level);
 	if (orig_ptr !=
 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
 		BUG();
@@ -1796,12 +1677,6 @@ cow_done:
 		if (!cow)
 			btrfs_unlock_up_safe(p, level + 1);
 
-		ret = check_block(root, p, level);
-		if (ret) {
-			ret = -1;
-			goto done;
-		}
-
 		ret = bin_search(b, key, level, &slot);
 
 		if (level != 0) {
@@ -2128,10 +2003,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
 	btrfs_mark_buffer_dirty(c);
 
-	spin_lock(&root->node_lock);
 	old = root->node;
-	root->node = c;
-	spin_unlock(&root->node_lock);
+	rcu_assign_pointer(root->node, c);
 
 	/* the super has an extra ref to root->node */
 	free_extent_buffer(old);
@@ -2514,6 +2387,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);
 
 	right = read_node_slot(root, upper, slot + 1);
+	if (right == NULL)
+		return 1;
+
 	btrfs_tree_lock(right);
 	btrfs_set_lock_blocking(right);
 
@@ -2764,6 +2640,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
+	if (left == NULL)
+		return 1;
+
 	btrfs_tree_lock(left);
 	btrfs_set_lock_blocking(left);
 
@@ -3832,7 +3711,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	unsigned long ptr;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (!ret) {
 		leaf = path->nodes[0];
@@ -4209,6 +4089,7 @@ find_next_key:
 		}
 		btrfs_set_path_blocking(path);
 		cur = read_node_slot(root, cur, slot);
+		BUG_ON(!cur);
 
 		btrfs_tree_lock(cur);
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b875d445ea81..2e61fe1b6b8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,6 +28,7 @@
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/kobject.h>
+#include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -40,6 +41,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
+extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_BHRfS_M"
@@ -295,6 +297,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
+
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 
@@ -399,13 +409,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
-	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -552,9 +564,11 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));
 
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_LAST  = 2,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
 };
 
 struct btrfs_inode_item {
@@ -598,6 +612,8 @@ struct btrfs_dir_item {
 	u8 type;
 } __attribute__ ((__packed__));
 
+#define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -715,8 +731,19 @@ struct btrfs_space_info {
 	u64 disk_total;		/* total bytes on disk, takes mirrors into
 				   account */
 
-	int full;		/* indicates that we cannot allocate any more
+	/*
+	 * we bump reservation progress every time we decrement
+	 * bytes_reserved.  This way people waiting for reservations
+	 * know something good has happened and they can check
+	 * for progress.  The number here isn't to be trusted, it
+	 * just shows reclaim activity
+	 */
+	unsigned long reservation_progress;
+
+	int full:1;		/* indicates that we cannot allocate any more
 				   chunks for this space */
+	int chunk_alloc:1;	/* set if we are allocating a chunk */
+
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
 
@@ -759,9 +786,6 @@ struct btrfs_free_cluster {
 	/* first extent starting offset */
 	u64 window_start;
 
-	/* if this cluster simply points at a bitmap in the block group */
-	bool points_to_bitmap;
-
 	struct btrfs_block_group_cache *block_group;
 	/*
 	 * when a cluster is allocated from a block group, we put the
@@ -896,7 +920,8 @@ struct btrfs_fs_info {
 	 */
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
-	unsigned long mount_opt;
+	unsigned long mount_opt:20;
+	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1051,6 +1076,9 @@ struct btrfs_fs_info {
 	unsigned metadata_ratio;
 
 	void *bdev_holder;
+
+	/* filesystem state */
+	u64 fs_state;
 };
 
 /*
@@ -1236,6 +1264,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
 #define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
+#define BTRFS_MOUNT_ENOSPC_DEBUG	(1 << 15)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1255,6 +1284,9 @@ struct btrfs_root {
 #define BTRFS_INODE_NODUMP		(1 << 8)
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
+#define BTRFS_INODE_COMPRESS		(1 << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
 
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
@@ -1894,6 +1926,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
 
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2124,6 +2161,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				u64 num_bytes, int reserve, int sinfo);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2146,6 +2185,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2189,6 +2229,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+				   u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes, u64 *actual_bytes);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 type);
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2313,6 +2363,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 int btrfs_set_root_node(struct btrfs_root_item *item,
 			struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
@@ -2350,6 +2402,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 			    struct btrfs_path *path, u64 dir,
 			    const char *name, u16 name_len,
 			    int mod);
+int verify_dir_item(struct btrfs_root *root,
+		    struct extent_buffer *leaf,
+		    struct btrfs_dir_item *dir_item);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2486,7 +2541,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		       struct inode *inode);
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
-void btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
 void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
 			       struct btrfs_pending_snapshot *pending,
 			       u64 *bytes_to_reserve);
@@ -2494,7 +2549,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
 			       struct btrfs_pending_snapshot *pending);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root);
-int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
@@ -2523,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
+void btrfs_drop_pages(struct page **pages, size_t num_pages);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+		      struct page **pages, size_t num_pages,
+		      loff_t pos, size_t write_bytes,
+		      struct extent_state **cached);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -2542,6 +2602,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno)				\
+do {								\
+	if ((errno))						\
+		__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
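btrfs_std_error() gives callers a one-liner for fatal failures: it is a no-op when errno is zero and otherwise reports the failing function and line through __btrfs_std_error(), which (per the fs_state field added above) is expected to flag the filesystem as errored. A hedged usage sketch; demo_do_work() is a hypothetical helper, not a btrfs function:

	static void demo_handle_failure(struct btrfs_fs_info *fs_info)
	{
		int ret = demo_do_work(fs_info);	/* hypothetical */

		/* logs and sets the error state only when ret != 0 */
		btrfs_std_error(fs_info, ret);
	}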
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b857..bce28f653899 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -483,6 +483,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&head_ref->cluster);
 	mutex_init(&head_ref->mutex);
 
+	trace_btrfs_delayed_ref_head(ref, head_ref, action);
+
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
 	if (existing) {
@@ -537,6 +539,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	}
 	full_ref->level = level;
 
+	trace_btrfs_delayed_tree_ref(ref, full_ref, action);
+
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
 	if (existing) {
@@ -591,6 +595,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	full_ref->objectid = owner;
 	full_ref->offset = offset;
 
+	trace_btrfs_delayed_data_ref(ref, full_ref, action);
+
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
 	if (existing) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f0cad5ae5be7..c62f02f6ae69 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -151,7 +151,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		ret = PTR_ERR(dir_item);
 		if (ret == -EEXIST)
 			goto second_insert;
-		goto out;
+		goto out_free;
 	}
 
 	leaf = path->nodes[0];
@@ -170,7 +170,7 @@ second_insert:
 	/* FIXME, use some real flag for selecting the extra index */
 	if (root == root->fs_info->tree_root) {
 		ret = 0;
-		goto out;
+		goto out_free;
 	}
 	btrfs_release_path(root, path);
 
@@ -180,7 +180,7 @@ second_insert:
 					name, name_len);
 	if (IS_ERR(dir_item)) {
 		ret2 = PTR_ERR(dir_item);
-		goto out;
+		goto out_free;
 	}
 	leaf = path->nodes[0];
 	btrfs_cpu_key_to_disk(&disk_key, location);
@@ -192,7 +192,9 @@ second_insert:
 	name_ptr = (unsigned long)(dir_item + 1);
 	write_extent_buffer(leaf, name, name_ptr, name_len);
 	btrfs_mark_buffer_dirty(leaf);
-out:
+
+out_free:
+
 	btrfs_free_path(path);
 	if (ret)
 		return ret;
@@ -377,6 +379,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 
 	leaf = path->nodes[0];
 	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+	if (verify_dir_item(root, leaf, dir_item))
+		return NULL;
+
 	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
 	while (cur < total_len) {
 		this_len = sizeof(*dir_item) +
@@ -429,3 +434,35 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
429 } 434 }
430 return ret; 435 return ret;
431} 436}
437
438int verify_dir_item(struct btrfs_root *root,
439 struct extent_buffer *leaf,
440 struct btrfs_dir_item *dir_item)
441{
442 u16 namelen = BTRFS_NAME_LEN;
443 u8 type = btrfs_dir_type(leaf, dir_item);
444
445 if (type >= BTRFS_FT_MAX) {
446 printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
447 (int)type);
448 return 1;
449 }
450
451 if (type == BTRFS_FT_XATTR)
452 namelen = XATTR_NAME_MAX;
453
454 if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
455 printk(KERN_CRIT "btrfS: invalid dir item name len: %u\n",
456 (unsigned)btrfs_dir_data_len(leaf, dir_item));
457 return 1;
458 }
459
460 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
461 if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
462 printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
463 (unsigned)btrfs_dir_data_len(leaf, dir_item));
464 return 1;
465 }
466
467 return 0;
468}
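
The validator above is deliberately conservative: it rejects any dir item whose type byte is out of range or whose name length exceeds what the item kind allows, before callers trust name_len to read the name. A minimal userspace sketch of the same rules (the constants are copied here for illustration and the struct is a toy, not the on-disk format):

#include <stdio.h>

#define BTRFS_FT_MAX	9
#define BTRFS_FT_XATTR	8
#define BTRFS_NAME_LEN	255
#define XATTR_NAME_MAX	255

struct toy_dir_item {
	unsigned char type;
	unsigned int name_len;
};

static int toy_verify(const struct toy_dir_item *di)
{
	unsigned int limit = BTRFS_NAME_LEN;

	if (di->type >= BTRFS_FT_MAX)
		return 1;			/* unknown type byte */
	if (di->type == BTRFS_FT_XATTR)
		limit = XATTR_NAME_MAX;
	return di->name_len > limit;		/* 1 if the name length is bogus */
}

int main(void)
{
	struct toy_dir_item bad = { .type = 42, .name_len = 10 };
	struct toy_dir_item good = { .type = 1, .name_len = 10 };

	printf("bad=%d good=%d\n", toy_verify(&bad), toy_verify(&good));
	return 0;
}

Callers such as btrfs_match_dir_item_name() above bail out with NULL as soon as the check fires, so a corrupted leaf can no longer steer a name comparison past the end of the item.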
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34eb..68c84c8c24bd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,7 @@
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h> 31#include <linux/migrate.h>
32#include <asm/unaligned.h>
32#include "compat.h" 33#include "compat.h"
33#include "ctree.h" 34#include "ctree.h"
34#include "disk-io.h" 35#include "disk-io.h"
@@ -44,6 +45,20 @@
44static struct extent_io_ops btree_extent_io_ops; 45static struct extent_io_ops btree_extent_io_ops;
45static void end_workqueue_fn(struct btrfs_work *work); 46static void end_workqueue_fn(struct btrfs_work *work);
46static void free_fs_root(struct btrfs_root *root); 47static void free_fs_root(struct btrfs_root *root);
48static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
49 int read_only);
50static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
51static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
52static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
53 struct btrfs_root *root);
54static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
55static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
56static int btrfs_destroy_marked_extents(struct btrfs_root *root,
57 struct extent_io_tree *dirty_pages,
58 int mark);
59static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
60 struct extent_io_tree *pinned_extents);
61static int btrfs_cleanup_transaction(struct btrfs_root *root);
47 62
48/* 63/*
49 * end_io_wq structs are used to do processing in task context when an IO is 64 * end_io_wq structs are used to do processing in task context when an IO is
@@ -184,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
184 199
185void btrfs_csum_final(u32 crc, char *result) 200void btrfs_csum_final(u32 crc, char *result)
186{ 201{
187 *(__le32 *)result = ~cpu_to_le32(crc); 202 put_unaligned_le32(~crc, result);
188} 203}
189 204
190/* 205/*
@@ -309,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
309 int num_copies = 0; 324 int num_copies = 0;
310 int mirror_num = 0; 325 int mirror_num = 0;
311 326
327 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
312 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 328 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
313 while (1) { 329 while (1) {
314 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 330 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -317,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
317 !verify_parent_transid(io_tree, eb, parent_transid)) 333 !verify_parent_transid(io_tree, eb, parent_transid))
318 return ret; 334 return ret;
319 335
336 /*
337 * This buffer's crc is fine, but its contents are corrupted, so
338 * there is no reason to read the other copies, they won't be
339 * any less wrong.
340 */
341 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
342 return ret;
343
320 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 344 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
321 eb->start, eb->len); 345 eb->start, eb->len);
322 if (num_copies == 1) 346 if (num_copies == 1)
@@ -345,14 +369,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
345 369
346 tree = &BTRFS_I(page->mapping->host)->io_tree; 370 tree = &BTRFS_I(page->mapping->host)->io_tree;
347 371
348 if (page->private == EXTENT_PAGE_PRIVATE) 372 if (page->private == EXTENT_PAGE_PRIVATE) {
373 WARN_ON(1);
349 goto out; 374 goto out;
350 if (!page->private) 375 }
376 if (!page->private) {
377 WARN_ON(1);
351 goto out; 378 goto out;
379 }
352 len = page->private >> 2; 380 len = page->private >> 2;
353 WARN_ON(len == 0); 381 WARN_ON(len == 0);
354 382
355 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 383 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
384 if (eb == NULL) {
385 WARN_ON(1);
386 goto out;
387 }
356 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 388 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
357 btrfs_header_generation(eb)); 389 btrfs_header_generation(eb));
358 BUG_ON(ret); 390 BUG_ON(ret);
@@ -397,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
397 return ret; 429 return ret;
398} 430}
399 431
432#define CORRUPT(reason, eb, root, slot) \
433 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
434 "root=%llu, slot=%d\n", reason, \
435 (unsigned long long)btrfs_header_bytenr(eb), \
436 (unsigned long long)root->objectid, slot)
437
438static noinline int check_leaf(struct btrfs_root *root,
439 struct extent_buffer *leaf)
440{
441 struct btrfs_key key;
442 struct btrfs_key leaf_key;
443 u32 nritems = btrfs_header_nritems(leaf);
444 int slot;
445
446 if (nritems == 0)
447 return 0;
448
449 /* Check the 0 item */
450 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
451 BTRFS_LEAF_DATA_SIZE(root)) {
452 CORRUPT("invalid item offset size pair", leaf, root, 0);
453 return -EIO;
454 }
455
456 /*
 457	 * Check to make sure each item's keys are in the correct order and their
 458	 * offsets make sense. We only have to loop through nritems-1 because
 459	 * we check the current slot against the next slot, which verifies the
 460	 * next slot's offset+size makes sense and that the current slot's
 461	 * offset is correct.
462 */
463 for (slot = 0; slot < nritems - 1; slot++) {
464 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
465 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
466
467 /* Make sure the keys are in the right order */
468 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
469 CORRUPT("bad key order", leaf, root, slot);
470 return -EIO;
471 }
472
473 /*
474 * Make sure the offset and ends are right, remember that the
475 * item data starts at the end of the leaf and grows towards the
476 * front.
477 */
478 if (btrfs_item_offset_nr(leaf, slot) !=
479 btrfs_item_end_nr(leaf, slot + 1)) {
480 CORRUPT("slot offset bad", leaf, root, slot);
481 return -EIO;
482 }
483
484 /*
485 * Check to make sure that we don't point outside of the leaf,
 486	 * just in case all the items are consistent with each other, but
487 * all point outside of the leaf.
488 */
489 if (btrfs_item_end_nr(leaf, slot) >
490 BTRFS_LEAF_DATA_SIZE(root)) {
491 CORRUPT("slot end outside of leaf", leaf, root, slot);
492 return -EIO;
493 }
494 }
495
496 return 0;
497}
498
400#ifdef CONFIG_DEBUG_LOCK_ALLOC 499#ifdef CONFIG_DEBUG_LOCK_ALLOC
401void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) 500void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
402{ 501{
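
check_leaf() leans on the btrfs leaf geometry: item headers grow from the front of the leaf while item data grows backwards from the end, so slot 0's data must end exactly at BTRFS_LEAF_DATA_SIZE and each slot's data must start where the next slot's data ends. A self-contained sketch of those invariants with made-up numbers (the leaf size here is illustrative only):

#include <assert.h>
#include <stdio.h>

struct toy_item {
	unsigned int offset;	/* where the item's data starts */
	unsigned int size;	/* how many bytes of data it has */
};

int main(void)
{
	const unsigned int leaf_data_size = 3995;	/* illustrative value */
	const struct toy_item items[] = {
		{ 3895, 100 },	/* slot 0: data ends at 3995 */
		{ 3795, 100 },	/* slot 1: data ends at 3895 */
	};

	/* rule 1: slot 0's data must end exactly at the leaf data size */
	assert(items[0].offset + items[0].size == leaf_data_size);
	/* rule 2: each slot's offset equals the next slot's data end */
	assert(items[0].offset == items[1].offset + items[1].size);
	/* rule 3: no item's data end may exceed the leaf data size */
	assert(items[1].offset + items[1].size <= leaf_data_size);

	printf("leaf layout consistent\n");
	return 0;
}

Any slot violating one of these rules trips the corresponding CORRUPT() check above and the block is rejected with -EIO.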
@@ -427,6 +526,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
427 WARN_ON(len == 0); 526 WARN_ON(len == 0);
428 527
429 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 528 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
529 if (eb == NULL) {
530 ret = -EIO;
531 goto out;
532 }
430 533
431 found_start = btrfs_header_bytenr(eb); 534 found_start = btrfs_header_bytenr(eb);
432 if (found_start != start) { 535 if (found_start != start) {
@@ -459,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
459 btrfs_set_buffer_lockdep_class(eb, found_level); 562 btrfs_set_buffer_lockdep_class(eb, found_level);
460 563
461 ret = csum_tree_block(root, eb, 1); 564 ret = csum_tree_block(root, eb, 1);
462 if (ret) 565 if (ret) {
566 ret = -EIO;
567 goto err;
568 }
569
570 /*
571 * If this is a leaf block and it is corrupt, set the corrupt bit so
572 * that we don't try and read the other copies of this block, just
573 * return -EIO.
574 */
575 if (found_level == 0 && check_leaf(root, eb)) {
576 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
463 ret = -EIO; 577 ret = -EIO;
578 }
464 579
465 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 580 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
466 end = eb->start + end - 1; 581 end = eb->start + end - 1;
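
Taken together, the two hunks above change the read retry policy: a checksum failure still falls through to another mirror, but a block whose crc is fine and whose contents fail check_leaf() is marked EXTENT_BUFFER_CORRUPT and fails immediately, since every mirror was written from the same in-memory data. A toy model of that policy (the error code and mirror array are illustrative):

#include <stdio.h>

enum read_status { READ_OK, READ_BAD_CRC, READ_BAD_CONTENT };

/* returns 0 on success, -5 (EIO) when no usable copy remains */
static int read_with_mirrors(const enum read_status *mirror, int num_copies)
{
	int i;

	for (i = 0; i < num_copies; i++) {
		if (mirror[i] == READ_OK)
			return 0;
		if (mirror[i] == READ_BAD_CONTENT)
			return -5;	/* all mirrors hold the same bytes */
		/* READ_BAD_CRC: a different mirror may still be good */
	}
	return -5;
}

int main(void)
{
	enum read_status crc_bad[] = { READ_BAD_CRC, READ_OK };
	enum read_status content_bad[] = { READ_BAD_CONTENT, READ_OK };

	printf("crc_bad=%d content_bad=%d\n",
	       read_with_mirrors(crc_bad, 2),
	       read_with_mirrors(content_bad, 2));	/* 0 and -5 */
	return 0;
}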
@@ -821,7 +936,6 @@ static const struct address_space_operations btree_aops = {
821 .writepages = btree_writepages, 936 .writepages = btree_writepages,
822 .releasepage = btree_releasepage, 937 .releasepage = btree_releasepage,
823 .invalidatepage = btree_invalidatepage, 938 .invalidatepage = btree_invalidatepage,
824 .sync_page = block_sync_page,
825#ifdef CONFIG_MIGRATION 939#ifdef CONFIG_MIGRATION
826 .migratepage = btree_migratepage, 940 .migratepage = btree_migratepage,
827#endif 941#endif
@@ -1134,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1134 root, fs_info, location->objectid); 1248 root, fs_info, location->objectid);
1135 1249
1136 path = btrfs_alloc_path(); 1250 path = btrfs_alloc_path();
1137 BUG_ON(!path); 1251 if (!path) {
1252 kfree(root);
1253 return ERR_PTR(-ENOMEM);
1254 }
1138 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1255 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1139 if (ret == 0) { 1256 if (ret == 0) {
1140 l = path->nodes[0]; 1257 l = path->nodes[0];
@@ -1145,6 +1262,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1145 } 1262 }
1146 btrfs_free_path(path); 1263 btrfs_free_path(path);
1147 if (ret) { 1264 if (ret) {
1265 kfree(root);
1148 if (ret > 0) 1266 if (ret > 0)
1149 ret = -ENOENT; 1267 ret = -ENOENT;
1150 return ERR_PTR(ret); 1268 return ERR_PTR(ret);
@@ -1157,8 +1275,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1157 root->commit_root = btrfs_root_node(root); 1275 root->commit_root = btrfs_root_node(root);
1158 BUG_ON(!root->node); 1276 BUG_ON(!root->node);
1159out: 1277out:
1160 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) 1278 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1161 root->ref_cows = 1; 1279 root->ref_cows = 1;
1280 btrfs_check_and_init_root_item(&root->root_item);
1281 }
1162 1282
1163 return root; 1283 return root;
1164} 1284}
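
The two fixes above close leaks on the error paths of btrfs_read_fs_root_no_radix(): the freshly allocated root must be freed both when the path allocation fails and when the root item cannot be found. A userspace sketch of the ERR_PTR-style convention being followed (err_ptr()/is_err() are stand-ins for the kernel's ERR_PTR()/IS_ERR()):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* userspace stand-ins for the kernel's ERR_PTR()/IS_ERR() */
static inline void *err_ptr(long err)
{
	return (void *)err;
}

static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-4095;
}

struct toy_root { int node; };

static struct toy_root *toy_read_root(int path_alloc_fails)
{
	struct toy_root *root = malloc(sizeof(*root));

	if (!root)
		return err_ptr(-ENOMEM);
	if (path_alloc_fails) {	/* stands in for btrfs_alloc_path() failing */
		free(root);	/* free the half-built root, don't leak it */
		return err_ptr(-ENOMEM);
	}
	root->node = 1;
	return root;
}

int main(void)
{
	struct toy_root *root = toy_read_root(1);

	printf("is_err=%d\n", is_err(root));	/* prints 1 */
	if (!is_err(root))
		free(root);
	return 0;
}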
@@ -1304,82 +1424,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1304} 1424}
1305 1425
1306/* 1426/*
1307 * this unplugs every device on the box, and it is only used when page
1308 * is null
1309 */
1310static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1311{
1312 struct btrfs_device *device;
1313 struct btrfs_fs_info *info;
1314
1315 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1316 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1317 if (!device->bdev)
1318 continue;
1319
1320 bdi = blk_get_backing_dev_info(device->bdev);
1321 if (bdi->unplug_io_fn)
1322 bdi->unplug_io_fn(bdi, page);
1323 }
1324}
1325
1326static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1327{
1328 struct inode *inode;
1329 struct extent_map_tree *em_tree;
1330 struct extent_map *em;
1331 struct address_space *mapping;
1332 u64 offset;
1333
1334 /* the generic O_DIRECT read code does this */
1335 if (1 || !page) {
1336 __unplug_io_fn(bdi, page);
1337 return;
1338 }
1339
1340 /*
1341 * page->mapping may change at any time. Get a consistent copy
1342 * and use that for everything below
1343 */
1344 smp_mb();
1345 mapping = page->mapping;
1346 if (!mapping)
1347 return;
1348
1349 inode = mapping->host;
1350
1351 /*
1352 * don't do the expensive searching for a small number of
1353 * devices
1354 */
1355 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1356 __unplug_io_fn(bdi, page);
1357 return;
1358 }
1359
1360 offset = page_offset(page);
1361
1362 em_tree = &BTRFS_I(inode)->extent_tree;
1363 read_lock(&em_tree->lock);
1364 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1365 read_unlock(&em_tree->lock);
1366 if (!em) {
1367 __unplug_io_fn(bdi, page);
1368 return;
1369 }
1370
1371 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1372 free_extent_map(em);
1373 __unplug_io_fn(bdi, page);
1374 return;
1375 }
1376 offset = offset - em->start;
1377 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1378 em->block_start + offset, page);
1379 free_extent_map(em);
1380}
1381
1382/*
1383 * If this fails, caller must call bdi_destroy() to get rid of the 1427 * If this fails, caller must call bdi_destroy() to get rid of the
1384 * bdi again. 1428 * bdi again.
1385 */ 1429 */
@@ -1393,8 +1437,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1393 return err; 1437 return err;
1394 1438
1395 bdi->ra_pages = default_backing_dev_info.ra_pages; 1439 bdi->ra_pages = default_backing_dev_info.ra_pages;
1396 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1397 bdi->unplug_io_data = info;
1398 bdi->congested_fn = btrfs_congested_fn; 1440 bdi->congested_fn = btrfs_congested_fn;
1399 bdi->congested_data = info; 1441 bdi->congested_data = info;
1400 return 0; 1442 return 0;
@@ -1527,6 +1569,7 @@ static int transaction_kthread(void *arg)
1527 spin_unlock(&root->fs_info->new_trans_lock); 1569 spin_unlock(&root->fs_info->new_trans_lock);
1528 1570
1529 trans = btrfs_join_transaction(root, 1); 1571 trans = btrfs_join_transaction(root, 1);
1572 BUG_ON(IS_ERR(trans));
1530 if (transid == trans->transid) { 1573 if (transid == trans->transid) {
1531 ret = btrfs_commit_transaction(trans, root); 1574 ret = btrfs_commit_transaction(trans, root);
1532 BUG_ON(ret); 1575 BUG_ON(ret);
@@ -1604,6 +1647,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1604 goto fail_bdi; 1647 goto fail_bdi;
1605 } 1648 }
1606 1649
1650 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
1651
1607 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1652 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1608 INIT_LIST_HEAD(&fs_info->trans_list); 1653 INIT_LIST_HEAD(&fs_info->trans_list);
1609 INIT_LIST_HEAD(&fs_info->dead_roots); 1654 INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1713,8 +1758,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1713 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1758 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1714 1759
1715 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 1760 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1716 if (!bh) 1761 if (!bh) {
1762 err = -EINVAL;
1717 goto fail_iput; 1763 goto fail_iput;
1764 }
1718 1765
1719 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 1766 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1720 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 1767 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1774,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1727 if (!btrfs_super_root(disk_super)) 1774 if (!btrfs_super_root(disk_super))
1728 goto fail_iput; 1775 goto fail_iput;
1729 1776
 1777	/* check the FS state to see whether the FS is broken */
1778 fs_info->fs_state |= btrfs_super_flags(disk_super);
1779
1780 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1781
1782 /*
1783 * In the long term, we'll store the compression type in the super
1784 * block, and it'll be used for per file compression control.
1785 */
1786 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
1787
1730 ret = btrfs_parse_options(tree_root, options); 1788 ret = btrfs_parse_options(tree_root, options);
1731 if (ret) { 1789 if (ret) {
1732 err = ret; 1790 err = ret;
@@ -1744,10 +1802,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1744 } 1802 }
1745 1803
1746 features = btrfs_super_incompat_flags(disk_super); 1804 features = btrfs_super_incompat_flags(disk_super);
1747 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { 1805 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1748 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 1806 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
1749 btrfs_set_super_incompat_flags(disk_super, features); 1807 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1750 } 1808 btrfs_set_super_incompat_flags(disk_super, features);
1751 1809
1752 features = btrfs_super_compat_ro_flags(disk_super) & 1810 features = btrfs_super_compat_ro_flags(disk_super) &
1753 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1811 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
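
The incompat-flags hunk above now ORs MIXED_BACKREF in unconditionally and advertises COMPRESS_LZO only when the mount actually selected lzo, instead of writing the flags back only when MIXED_BACKREF was missing. A sketch of the same flag arithmetic (the bit positions are assumptions for illustration, not the on-disk values):

#include <stdio.h>

/* assumed bit positions, for illustration only */
#define TOY_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
#define TOY_INCOMPAT_COMPRESS_LZO	(1ULL << 3)

enum { TOY_COMPRESS_ZLIB = 1, TOY_COMPRESS_LZO = 2 };

int main(void)
{
	unsigned long long features = 0;
	int compress_type = TOY_COMPRESS_LZO;

	/* always advertised, matching the unconditional OR above */
	features |= TOY_INCOMPAT_MIXED_BACKREF;
	/* only advertised when the mount actually uses lzo */
	if (compress_type & TOY_COMPRESS_LZO)
		features |= TOY_INCOMPAT_COMPRESS_LZO;

	printf("incompat flags: %#llx\n", features);	/* prints 0x9 */
	return 0;
}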
@@ -1932,6 +1990,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1932 fs_info->metadata_alloc_profile = (u64)-1; 1990 fs_info->metadata_alloc_profile = (u64)-1;
1933 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1991 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1934 1992
1993 ret = btrfs_init_space_info(fs_info);
1994 if (ret) {
 1995	printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
1996 goto fail_block_groups;
1997 }
1998
1935 ret = btrfs_read_block_groups(extent_root); 1999 ret = btrfs_read_block_groups(extent_root);
1936 if (ret) { 2000 if (ret) {
1937 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2001 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -1957,7 +2021,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1957 btrfs_set_opt(fs_info->mount_opt, SSD); 2021 btrfs_set_opt(fs_info->mount_opt, SSD);
1958 } 2022 }
1959 2023
1960 if (btrfs_super_log_root(disk_super) != 0) { 2024 /* do not make disk changes in broken FS */
2025 if (btrfs_super_log_root(disk_super) != 0 &&
2026 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
1961 u64 bytenr = btrfs_super_log_root(disk_super); 2027 u64 bytenr = btrfs_super_log_root(disk_super);
1962 2028
1963 if (fs_devices->rw_devices == 0) { 2029 if (fs_devices->rw_devices == 0) {
@@ -2021,9 +2087,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2021 2087
2022 if (!(sb->s_flags & MS_RDONLY)) { 2088 if (!(sb->s_flags & MS_RDONLY)) {
2023 down_read(&fs_info->cleanup_work_sem); 2089 down_read(&fs_info->cleanup_work_sem);
2024 btrfs_orphan_cleanup(fs_info->fs_root); 2090 err = btrfs_orphan_cleanup(fs_info->fs_root);
2025 btrfs_orphan_cleanup(fs_info->tree_root); 2091 if (!err)
2092 err = btrfs_orphan_cleanup(fs_info->tree_root);
2026 up_read(&fs_info->cleanup_work_sem); 2093 up_read(&fs_info->cleanup_work_sem);
2094 if (err) {
2095 close_ctree(tree_root);
2096 return ERR_PTR(err);
2097 }
2027 } 2098 }
2028 2099
2029 return tree_root; 2100 return tree_root;
@@ -2398,8 +2469,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2398 2469
2399 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2470 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2400 for (i = 0; i < ret; i++) { 2471 for (i = 0; i < ret; i++) {
2472 int err;
2473
2401 root_objectid = gang[i]->root_key.objectid; 2474 root_objectid = gang[i]->root_key.objectid;
2402 btrfs_orphan_cleanup(gang[i]); 2475 err = btrfs_orphan_cleanup(gang[i]);
2476 if (err)
2477 return err;
2403 } 2478 }
2404 root_objectid++; 2479 root_objectid++;
2405 } 2480 }
@@ -2421,10 +2496,14 @@ int btrfs_commit_super(struct btrfs_root *root)
2421 up_write(&root->fs_info->cleanup_work_sem); 2496 up_write(&root->fs_info->cleanup_work_sem);
2422 2497
2423 trans = btrfs_join_transaction(root, 1); 2498 trans = btrfs_join_transaction(root, 1);
2499 if (IS_ERR(trans))
2500 return PTR_ERR(trans);
2424 ret = btrfs_commit_transaction(trans, root); 2501 ret = btrfs_commit_transaction(trans, root);
2425 BUG_ON(ret); 2502 BUG_ON(ret);
2426 /* run commit again to drop the original snapshot */ 2503 /* run commit again to drop the original snapshot */
2427 trans = btrfs_join_transaction(root, 1); 2504 trans = btrfs_join_transaction(root, 1);
2505 if (IS_ERR(trans))
2506 return PTR_ERR(trans);
2428 btrfs_commit_transaction(trans, root); 2507 btrfs_commit_transaction(trans, root);
2429 ret = btrfs_write_and_wait_transaction(NULL, root); 2508 ret = btrfs_write_and_wait_transaction(NULL, root);
2430 BUG_ON(ret); 2509 BUG_ON(ret);
@@ -2442,8 +2521,28 @@ int close_ctree(struct btrfs_root *root)
2442 smp_mb(); 2521 smp_mb();
2443 2522
2444 btrfs_put_block_group_cache(fs_info); 2523 btrfs_put_block_group_cache(fs_info);
2524
2525 /*
 2526	 * There are two situations in which btrfs can be flipped readonly here:
 2527	 *
 2528	 * 1. btrfs flipped readonly somewhere else before
 2529	 * btrfs_commit_super; sb->s_flags has the MS_RDONLY flag set,
 2530	 * so btrfs skips writing the sb directly in order to keep the
 2531	 * ERROR state on disk.
 2532	 *
 2533	 * 2. btrfs flips readonly inside btrfs_commit_super itself;
 2534	 * in that case btrfs cannot write the sb via btrfs_commit_super,
 2535	 * and since fs_state has the BTRFS_SUPER_FLAG_ERROR flag set,
 2536	 * btrfs cleans up all FS resources first and writes the sb afterwards.
2537 */
2445 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2538 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2446 ret = btrfs_commit_super(root); 2539 ret = btrfs_commit_super(root);
2540 if (ret)
2541 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2542 }
2543
2544 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
2545 ret = btrfs_error_commit_super(root);
2447 if (ret) 2546 if (ret)
2448 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2547 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2449 } 2548 }
@@ -2502,6 +2601,8 @@ int close_ctree(struct btrfs_root *root)
2502 kfree(fs_info->chunk_root); 2601 kfree(fs_info->chunk_root);
2503 kfree(fs_info->dev_root); 2602 kfree(fs_info->dev_root);
2504 kfree(fs_info->csum_root); 2603 kfree(fs_info->csum_root);
2604 kfree(fs_info);
2605
2505 return 0; 2606 return 0;
2506} 2607}
2507 2608
@@ -2619,6 +2720,355 @@ out:
2619 return 0; 2720 return 0;
2620} 2721}
2621 2722
2723static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2724 int read_only)
2725{
2726 if (read_only)
2727 return;
2728
2729 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
 2730		printk(KERN_WARNING "warning: mounting fs with errors, "
2731 "running btrfsck is recommended\n");
2732}
2733
2734int btrfs_error_commit_super(struct btrfs_root *root)
2735{
2736 int ret;
2737
2738 mutex_lock(&root->fs_info->cleaner_mutex);
2739 btrfs_run_delayed_iputs(root);
2740 mutex_unlock(&root->fs_info->cleaner_mutex);
2741
2742 down_write(&root->fs_info->cleanup_work_sem);
2743 up_write(&root->fs_info->cleanup_work_sem);
2744
2745 /* cleanup FS via transaction */
2746 btrfs_cleanup_transaction(root);
2747
2748 ret = write_ctree_super(NULL, root, 0);
2749
2750 return ret;
2751}
2752
2753static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2754{
2755 struct btrfs_inode *btrfs_inode;
2756 struct list_head splice;
2757
2758 INIT_LIST_HEAD(&splice);
2759
2760 mutex_lock(&root->fs_info->ordered_operations_mutex);
2761 spin_lock(&root->fs_info->ordered_extent_lock);
2762
2763 list_splice_init(&root->fs_info->ordered_operations, &splice);
2764 while (!list_empty(&splice)) {
2765 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2766 ordered_operations);
2767
2768 list_del_init(&btrfs_inode->ordered_operations);
2769
2770 btrfs_invalidate_inodes(btrfs_inode->root);
2771 }
2772
2773 spin_unlock(&root->fs_info->ordered_extent_lock);
2774 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2775
2776 return 0;
2777}
2778
2779static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2780{
2781 struct list_head splice;
2782 struct btrfs_ordered_extent *ordered;
2783 struct inode *inode;
2784
2785 INIT_LIST_HEAD(&splice);
2786
2787 spin_lock(&root->fs_info->ordered_extent_lock);
2788
2789 list_splice_init(&root->fs_info->ordered_extents, &splice);
2790 while (!list_empty(&splice)) {
2791 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2792 root_extent_list);
2793
2794 list_del_init(&ordered->root_extent_list);
2795 atomic_inc(&ordered->refs);
2796
2797 /* the inode may be getting freed (in sys_unlink path). */
2798 inode = igrab(ordered->inode);
2799
2800 spin_unlock(&root->fs_info->ordered_extent_lock);
2801 if (inode)
2802 iput(inode);
2803
2804 atomic_set(&ordered->refs, 1);
2805 btrfs_put_ordered_extent(ordered);
2806
2807 spin_lock(&root->fs_info->ordered_extent_lock);
2808 }
2809
2810 spin_unlock(&root->fs_info->ordered_extent_lock);
2811
2812 return 0;
2813}
2814
2815static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2816 struct btrfs_root *root)
2817{
2818 struct rb_node *node;
2819 struct btrfs_delayed_ref_root *delayed_refs;
2820 struct btrfs_delayed_ref_node *ref;
2821 int ret = 0;
2822
2823 delayed_refs = &trans->delayed_refs;
2824
2825 spin_lock(&delayed_refs->lock);
2826 if (delayed_refs->num_entries == 0) {
 2827		printk(KERN_INFO "delayed_refs has no entries\n");
 		spin_unlock(&delayed_refs->lock);	/* drop the lock taken above */
 2828		return ret;
2829 }
2830
2831 node = rb_first(&delayed_refs->root);
2832 while (node) {
2833 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2834 node = rb_next(node);
2835
2836 ref->in_tree = 0;
2837 rb_erase(&ref->rb_node, &delayed_refs->root);
2838 delayed_refs->num_entries--;
2839
2840 atomic_set(&ref->refs, 1);
2841 if (btrfs_delayed_ref_is_head(ref)) {
2842 struct btrfs_delayed_ref_head *head;
2843
2844 head = btrfs_delayed_node_to_head(ref);
2845 mutex_lock(&head->mutex);
2846 kfree(head->extent_op);
2847 delayed_refs->num_heads--;
2848 if (list_empty(&head->cluster))
2849 delayed_refs->num_heads_ready--;
2850 list_del_init(&head->cluster);
2851 mutex_unlock(&head->mutex);
2852 }
2853
2854 spin_unlock(&delayed_refs->lock);
2855 btrfs_put_delayed_ref(ref);
2856
2857 cond_resched();
2858 spin_lock(&delayed_refs->lock);
2859 }
2860
2861 spin_unlock(&delayed_refs->lock);
2862
2863 return ret;
2864}
2865
2866static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2867{
2868 struct btrfs_pending_snapshot *snapshot;
2869 struct list_head splice;
2870
2871 INIT_LIST_HEAD(&splice);
2872
2873 list_splice_init(&t->pending_snapshots, &splice);
2874
2875 while (!list_empty(&splice)) {
2876 snapshot = list_entry(splice.next,
2877 struct btrfs_pending_snapshot,
2878 list);
2879
2880 list_del_init(&snapshot->list);
2881
2882 kfree(snapshot);
2883 }
2884
2885 return 0;
2886}
2887
2888static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2889{
2890 struct btrfs_inode *btrfs_inode;
2891 struct list_head splice;
2892
2893 INIT_LIST_HEAD(&splice);
2894
2895 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2896
2897 spin_lock(&root->fs_info->delalloc_lock);
2898
2899 while (!list_empty(&splice)) {
2900 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2901 delalloc_inodes);
2902
2903 list_del_init(&btrfs_inode->delalloc_inodes);
2904
2905 btrfs_invalidate_inodes(btrfs_inode->root);
2906 }
2907
2908 spin_unlock(&root->fs_info->delalloc_lock);
2909
2910 return 0;
2911}
2912
2913static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2914 struct extent_io_tree *dirty_pages,
2915 int mark)
2916{
2917 int ret;
2918 struct page *page;
2919 struct inode *btree_inode = root->fs_info->btree_inode;
2920 struct extent_buffer *eb;
2921 u64 start = 0;
2922 u64 end;
2923 u64 offset;
2924 unsigned long index;
2925
2926 while (1) {
2927 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2928 mark);
2929 if (ret)
2930 break;
2931
2932 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2933 while (start <= end) {
2934 index = start >> PAGE_CACHE_SHIFT;
2935 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2936 page = find_get_page(btree_inode->i_mapping, index);
2937 if (!page)
2938 continue;
2939 offset = page_offset(page);
2940
2941 spin_lock(&dirty_pages->buffer_lock);
2942 eb = radix_tree_lookup(
2943 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2944 offset >> PAGE_CACHE_SHIFT);
2945 spin_unlock(&dirty_pages->buffer_lock);
2946 if (eb) {
2947 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2948 &eb->bflags);
2949 atomic_set(&eb->refs, 1);
2950 }
2951 if (PageWriteback(page))
2952 end_page_writeback(page);
2953
2954 lock_page(page);
2955 if (PageDirty(page)) {
2956 clear_page_dirty_for_io(page);
2957 spin_lock_irq(&page->mapping->tree_lock);
2958 radix_tree_tag_clear(&page->mapping->page_tree,
2959 page_index(page),
2960 PAGECACHE_TAG_DIRTY);
2961 spin_unlock_irq(&page->mapping->tree_lock);
2962 }
2963
2964 page->mapping->a_ops->invalidatepage(page, 0);
2965 unlock_page(page);
2966 }
2967 }
2968
2969 return ret;
2970}
2971
2972static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2973 struct extent_io_tree *pinned_extents)
2974{
2975 struct extent_io_tree *unpin;
2976 u64 start;
2977 u64 end;
2978 int ret;
2979
2980 unpin = pinned_extents;
2981 while (1) {
2982 ret = find_first_extent_bit(unpin, 0, &start, &end,
2983 EXTENT_DIRTY);
2984 if (ret)
2985 break;
2986
2987 /* opt_discard */
2988 if (btrfs_test_opt(root, DISCARD))
2989 ret = btrfs_error_discard_extent(root, start,
2990 end + 1 - start,
2991 NULL);
2992
2993 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2994 btrfs_error_unpin_extent_range(root, start, end);
2995 cond_resched();
2996 }
2997
2998 return 0;
2999}
3000
3001static int btrfs_cleanup_transaction(struct btrfs_root *root)
3002{
3003 struct btrfs_transaction *t;
3004 LIST_HEAD(list);
3005
3006 WARN_ON(1);
3007
3008 mutex_lock(&root->fs_info->trans_mutex);
3009 mutex_lock(&root->fs_info->transaction_kthread_mutex);
3010
3011 list_splice_init(&root->fs_info->trans_list, &list);
3012 while (!list_empty(&list)) {
3013 t = list_entry(list.next, struct btrfs_transaction, list);
3014 if (!t)
3015 break;
3016
3017 btrfs_destroy_ordered_operations(root);
3018
3019 btrfs_destroy_ordered_extents(root);
3020
3021 btrfs_destroy_delayed_refs(t, root);
3022
3023 btrfs_block_rsv_release(root,
3024 &root->fs_info->trans_block_rsv,
3025 t->dirty_pages.dirty_bytes);
3026
3027 /* FIXME: cleanup wait for commit */
3028 t->in_commit = 1;
3029 t->blocked = 1;
3030 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3031 wake_up(&root->fs_info->transaction_blocked_wait);
3032
3033 t->blocked = 0;
3034 if (waitqueue_active(&root->fs_info->transaction_wait))
3035 wake_up(&root->fs_info->transaction_wait);
3036 mutex_unlock(&root->fs_info->trans_mutex);
3037
3038 mutex_lock(&root->fs_info->trans_mutex);
3039 t->commit_done = 1;
3040 if (waitqueue_active(&t->commit_wait))
3041 wake_up(&t->commit_wait);
3042 mutex_unlock(&root->fs_info->trans_mutex);
3043
3044 mutex_lock(&root->fs_info->trans_mutex);
3045
3046 btrfs_destroy_pending_snapshots(t);
3047
3048 btrfs_destroy_delalloc_inodes(root);
3049
3050 spin_lock(&root->fs_info->new_trans_lock);
3051 root->fs_info->running_transaction = NULL;
3052 spin_unlock(&root->fs_info->new_trans_lock);
3053
3054 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3055 EXTENT_DIRTY);
3056
3057 btrfs_destroy_pinned_extent(root,
3058 root->fs_info->pinned_extents);
3059
3060 atomic_set(&t->use_count, 0);
3061 list_del_init(&t->list);
3062 memset(t, 0, sizeof(*t));
3063 kmem_cache_free(btrfs_transaction_cachep, t);
3064 }
3065
3066 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3067 mutex_unlock(&root->fs_info->trans_mutex);
3068
3069 return 0;
3070}
3071
2622static struct extent_io_ops btree_extent_io_ops = { 3072static struct extent_io_ops btree_extent_io_ops = {
2623 .write_cache_pages_lock_hook = btree_lock_page_hook, 3073 .write_cache_pages_lock_hook = btree_lock_page_hook,
2624 .readpage_end_io_hook = btree_readpage_end_io_hook, 3074 .readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors); 52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root); 54int btrfs_commit_super(struct btrfs_root *root);
55int btrfs_error_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 56struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize); 57 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, 58struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9786963b07e5..b4ffad859adb 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
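
The encode_fh change follows the exportfs convention for short buffers: instead of just failing, report the required length back through *max_len and return 255 so the caller can retry with a large enough buffer. A runnable sketch of that negotiation (the fid sizes are assumed values in u32 units):

#include <stdio.h>

/* fid sizes in u32 units; the values here are assumptions */
#define TOY_FID_SIZE_NON_CONNECTABLE	5
#define TOY_FID_SIZE_CONNECTABLE	8

static int toy_encode_fh(int connectable, int *max_len)
{
	if (connectable && *max_len < TOY_FID_SIZE_CONNECTABLE) {
		*max_len = TOY_FID_SIZE_CONNECTABLE;	/* tell the caller what we need */
		return 255;
	} else if (*max_len < TOY_FID_SIZE_NON_CONNECTABLE) {
		*max_len = TOY_FID_SIZE_NON_CONNECTABLE;
		return 255;
	}
	/* a real implementation fills the file handle here */
	return 0;
}

int main(void)
{
	int len = 2;
	int ret = toy_encode_fh(1, &len);

	printf("ret=%d needed=%d\n", ret, len);	/* ret=255 needed=8 */
	return 0;
}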
@@ -171,6 +175,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
171 int ret; 175 int ret;
172 176
173 path = btrfs_alloc_path(); 177 path = btrfs_alloc_path();
178 if (!path)
179 return ERR_PTR(-ENOMEM);
174 180
175 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 181 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
176 key.objectid = root->root_key.objectid; 182 key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 227e5815d838..31f33ba56fe8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,11 +33,28 @@
33#include "locking.h" 33#include "locking.h"
34#include "free-space-cache.h" 34#include "free-space-cache.h"
35 35
36/* control flags for do_chunk_alloc's force field
37 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
38 * if we really need one.
39 *
40 * CHUNK_ALLOC_FORCE means it must try to allocate one
41 *
 42 * CHUNK_ALLOC_LIMITED means to only try to allocate one
 43 * if we have very few chunks already allocated. This is
 44 * used as part of the clustering code to help make sure
 45 * we have a good pool of storage to cluster in, without
 46 * filling the FS with empty chunks.
47 *
48 */
49enum {
50 CHUNK_ALLOC_NO_FORCE = 0,
51 CHUNK_ALLOC_FORCE = 1,
52 CHUNK_ALLOC_LIMITED = 2,
53};
54
36static int update_block_group(struct btrfs_trans_handle *trans, 55static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 56 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc); 57 u64 bytenr, u64 num_bytes, int alloc);
39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve, int sinfo);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 58static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 59 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 60 u64 bytenr, u64 num_bytes, u64 parent,
@@ -320,11 +337,6 @@ static int caching_kthread(void *data)
320 if (!path) 337 if (!path)
321 return -ENOMEM; 338 return -ENOMEM;
322 339
323 exclude_super_stripes(extent_root, block_group);
324 spin_lock(&block_group->space_info->lock);
325 block_group->space_info->bytes_readonly += block_group->bytes_super;
326 spin_unlock(&block_group->space_info->lock);
327
328 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 340 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
329 341
330 /* 342 /*
@@ -447,7 +459,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
447 * allocate blocks for the tree root we can't do the fast caching since 459 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks. 460 * we likely hold important locks.
449 */ 461 */
450 if (!trans->transaction->in_commit && 462 if (trans && (!trans->transaction->in_commit) &&
451 (root && root != root->fs_info->tree_root)) { 463 (root && root != root->fs_info->tree_root)) {
452 spin_lock(&cache->lock); 464 spin_lock(&cache->lock);
453 if (cache->cached != BTRFS_CACHE_NO) { 465 if (cache->cached != BTRFS_CACHE_NO) {
@@ -467,14 +479,16 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
467 cache->cached = BTRFS_CACHE_NO; 479 cache->cached = BTRFS_CACHE_NO;
468 } 480 }
469 spin_unlock(&cache->lock); 481 spin_unlock(&cache->lock);
470 if (ret == 1) 482 if (ret == 1) {
483 free_excluded_extents(fs_info->extent_root, cache);
471 return 0; 484 return 0;
485 }
472 } 486 }
473 487
474 if (load_cache_only) 488 if (load_cache_only)
475 return 0; 489 return 0;
476 490
477 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 491 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
478 BUG_ON(!caching_ctl); 492 BUG_ON(!caching_ctl);
479 493
480 INIT_LIST_HEAD(&caching_ctl->list); 494 INIT_LIST_HEAD(&caching_ctl->list);
@@ -1743,39 +1757,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1743 return ret; 1757 return ret;
1744} 1758}
1745 1759
1746static void btrfs_issue_discard(struct block_device *bdev, 1760static int btrfs_issue_discard(struct block_device *bdev,
1747 u64 start, u64 len) 1761 u64 start, u64 len)
1748{ 1762{
1749 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); 1763 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1750} 1764}
1751 1765
1752static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1766static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1753 u64 num_bytes) 1767 u64 num_bytes, u64 *actual_bytes)
1754{ 1768{
1755 int ret; 1769 int ret;
1756 u64 map_length = num_bytes; 1770 u64 discarded_bytes = 0;
1757 struct btrfs_multi_bio *multi = NULL; 1771 struct btrfs_multi_bio *multi = NULL;
1758 1772
1759 if (!btrfs_test_opt(root, DISCARD))
1760 return 0;
1761 1773
1762 /* Tell the block device(s) that the sectors can be discarded */ 1774 /* Tell the block device(s) that the sectors can be discarded */
1763 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1775 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1764 bytenr, &map_length, &multi, 0); 1776 bytenr, &num_bytes, &multi, 0);
1765 if (!ret) { 1777 if (!ret) {
1766 struct btrfs_bio_stripe *stripe = multi->stripes; 1778 struct btrfs_bio_stripe *stripe = multi->stripes;
1767 int i; 1779 int i;
1768 1780
1769 if (map_length > num_bytes)
1770 map_length = num_bytes;
1771 1781
1772 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1782 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1773 btrfs_issue_discard(stripe->dev->bdev, 1783 ret = btrfs_issue_discard(stripe->dev->bdev,
1774 stripe->physical, 1784 stripe->physical,
1775 map_length); 1785 stripe->length);
1786 if (!ret)
1787 discarded_bytes += stripe->length;
1788 else if (ret != -EOPNOTSUPP)
1789 break;
1776 } 1790 }
1777 kfree(multi); 1791 kfree(multi);
1778 } 1792 }
1793 if (discarded_bytes && ret == -EOPNOTSUPP)
1794 ret = 0;
1795
1796 if (actual_bytes)
1797 *actual_bytes = discarded_bytes;
1798
1779 1799
1780 return ret; 1800 return ret;
1781} 1801}
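
btrfs_discard_extent() now sums what each stripe actually discarded and treats -EOPNOTSUPP as fatal only when no stripe discarded anything, which matters on mixed arrays where some devices support TRIM and some do not. A self-contained sketch of that error policy:

#include <errno.h>
#include <stdio.h>

struct toy_stripe {
	unsigned long long length;
	int discard_result;	/* stands in for blkdev_issue_discard() */
};

static int toy_discard(const struct toy_stripe *stripe, int num_stripes,
		       unsigned long long *actual_bytes)
{
	unsigned long long discarded_bytes = 0;
	int ret = 0;
	int i;

	for (i = 0; i < num_stripes; i++) {
		ret = stripe[i].discard_result;
		if (!ret)
			discarded_bytes += stripe[i].length;
		else if (ret != -EOPNOTSUPP)
			break;		/* a real error: stop early */
	}

	/* something was discarded: lack of support elsewhere is not fatal */
	if (discarded_bytes && ret == -EOPNOTSUPP)
		ret = 0;
	if (actual_bytes)
		*actual_bytes = discarded_bytes;
	return ret;
}

int main(void)
{
	const struct toy_stripe stripes[] = {
		{ 1 << 20, 0 },			/* ssd: discard works */
		{ 1 << 20, -EOPNOTSUPP },	/* hdd: discard unsupported */
	};
	unsigned long long done;
	int ret = toy_discard(stripes, 2, &done);

	printf("ret=%d discarded=%llu\n", ret, done);	/* ret=0 discarded=1048576 */
	return 0;
}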
@@ -3018,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3018 found->bytes_readonly = 0; 3038 found->bytes_readonly = 0;
3019 found->bytes_may_use = 0; 3039 found->bytes_may_use = 0;
3020 found->full = 0; 3040 found->full = 0;
3021 found->force_alloc = 0; 3041 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3042 found->chunk_alloc = 0;
3022 *space_info = found; 3043 *space_info = found;
3023 list_add_rcu(&found->list, &info->space_info); 3044 list_add_rcu(&found->list, &info->space_info);
3024 atomic_set(&found->caching_threads, 0); 3045 atomic_set(&found->caching_threads, 0);
@@ -3089,7 +3110,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3089 return btrfs_reduce_alloc_profile(root, flags); 3110 return btrfs_reduce_alloc_profile(root, flags);
3090} 3111}
3091 3112
3092static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3113u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3093{ 3114{
3094 u64 flags; 3115 u64 flags;
3095 3116
@@ -3149,7 +3170,7 @@ again:
3149 if (!data_sinfo->full && alloc_chunk) { 3170 if (!data_sinfo->full && alloc_chunk) {
3150 u64 alloc_target; 3171 u64 alloc_target;
3151 3172
3152 data_sinfo->force_alloc = 1; 3173 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3153 spin_unlock(&data_sinfo->lock); 3174 spin_unlock(&data_sinfo->lock);
3154alloc: 3175alloc:
3155 alloc_target = btrfs_get_alloc_profile(root, 1); 3176 alloc_target = btrfs_get_alloc_profile(root, 1);
@@ -3159,10 +3180,15 @@ alloc:
3159 3180
3160 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3181 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3161 bytes + 2 * 1024 * 1024, 3182 bytes + 2 * 1024 * 1024,
3162 alloc_target, 0); 3183 alloc_target,
3184 CHUNK_ALLOC_NO_FORCE);
3163 btrfs_end_transaction(trans, root); 3185 btrfs_end_transaction(trans, root);
3164 if (ret < 0) 3186 if (ret < 0) {
3165 return ret; 3187 if (ret != -ENOSPC)
3188 return ret;
3189 else
3190 goto commit_trans;
3191 }
3166 3192
3167 if (!data_sinfo) { 3193 if (!data_sinfo) {
3168 btrfs_set_inode_space_info(root, inode); 3194 btrfs_set_inode_space_info(root, inode);
@@ -3173,6 +3199,7 @@ alloc:
3173 spin_unlock(&data_sinfo->lock); 3199 spin_unlock(&data_sinfo->lock);
3174 3200
3175 /* commit the current transaction and try again */ 3201 /* commit the current transaction and try again */
3202commit_trans:
3176 if (!committed && !root->fs_info->open_ioctl_trans) { 3203 if (!committed && !root->fs_info->open_ioctl_trans) {
3177 committed = 1; 3204 committed = 1;
3178 trans = btrfs_join_transaction(root, 1); 3205 trans = btrfs_join_transaction(root, 1);
@@ -3233,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3233 rcu_read_lock(); 3260 rcu_read_lock();
3234 list_for_each_entry_rcu(found, head, list) { 3261 list_for_each_entry_rcu(found, head, list) {
3235 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3262 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3236 found->force_alloc = 1; 3263 found->force_alloc = CHUNK_ALLOC_FORCE;
3237 } 3264 }
3238 rcu_read_unlock(); 3265 rcu_read_unlock();
3239} 3266}
3240 3267
3241static int should_alloc_chunk(struct btrfs_root *root, 3268static int should_alloc_chunk(struct btrfs_root *root,
3242 struct btrfs_space_info *sinfo, u64 alloc_bytes) 3269 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3270 int force)
3243{ 3271{
3244 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3272 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3273 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3245 u64 thresh; 3274 u64 thresh;
3246 3275
3247 if (sinfo->bytes_used + sinfo->bytes_reserved + 3276 if (force == CHUNK_ALLOC_FORCE)
3248 alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3277 return 1;
3278
3279 /*
3280 * in limited mode, we want to have some free space up to
3281 * about 1% of the FS size.
3282 */
3283 if (force == CHUNK_ALLOC_LIMITED) {
3284 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3285 thresh = max_t(u64, 64 * 1024 * 1024,
3286 div_factor_fine(thresh, 1));
3287
3288 if (num_bytes - num_allocated < thresh)
3289 return 1;
3290 }
3291
3292 /*
3293 * we have two similar checks here, one based on percentage
 3294 * and one based on a hard number of 256MB. The idea
 3295 * is that if we have a good amount of free
 3296 * room, don't allocate a chunk. A good amount means the
 3297 * chunks we have allocated are less than 80% utilized,
3298 * or more than 256MB free
3299 */
3300 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3249 return 0; 3301 return 0;
3250 3302
3251 if (sinfo->bytes_used + sinfo->bytes_reserved + 3303 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3252 alloc_bytes < div_factor(num_bytes, 8))
3253 return 0; 3304 return 0;
3254 3305
3255 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3306 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3307
3308 /* 256MB or 5% of the FS */
3256 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3309 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3257 3310
3258 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3311 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3259 return 0; 3312 return 0;
3260
3261 return 1; 3313 return 1;
3262} 3314}
3263 3315
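
Putting the force levels and the thresholds together, should_alloc_chunk() now answers: always for CHUNK_ALLOC_FORCE, "keep about 1% of the FS unallocated" for CHUNK_ALLOC_LIMITED, and otherwise only when the existing chunks are both over 80% utilized and within 256MB of full. A simplified, runnable model (alloc_bytes and the 5%-of-FS clamp are omitted for brevity; the numbers in main() are illustrative):

#include <stdio.h>

enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_FORCE, CHUNK_ALLOC_LIMITED };

static unsigned long long max64(unsigned long long a, unsigned long long b)
{
	return a > b ? a : b;
}

static int toy_should_alloc(unsigned long long num_bytes,	/* space in this profile */
			    unsigned long long num_allocated,	/* used + reserved */
			    unsigned long long fs_total, int force)
{
	if (force == CHUNK_ALLOC_FORCE)
		return 1;

	/* limited mode: keep roughly 1% of the FS (min 64MB) as free room */
	if (force == CHUNK_ALLOC_LIMITED &&
	    num_bytes - num_allocated < max64(64ULL << 20, fs_total / 100))
		return 1;

	/* more than 256MB still free in this profile: don't allocate */
	if (num_allocated + (256ULL << 20) < num_bytes)
		return 0;
	/* less than 80% utilized: don't allocate */
	if (num_allocated < num_bytes / 10 * 8)
		return 0;
	return 1;
}

int main(void)
{
	/* 10GB of chunks, ~9.9GB in use, 100GB filesystem: time to allocate */
	printf("%d\n", toy_should_alloc(10ULL << 30, (99ULL << 30) / 10,
					100ULL << 30, CHUNK_ALLOC_NO_FORCE));
	return 0;
}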
@@ -3267,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3267{ 3319{
3268 struct btrfs_space_info *space_info; 3320 struct btrfs_space_info *space_info;
3269 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3321 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3322 int wait_for_alloc = 0;
3270 int ret = 0; 3323 int ret = 0;
3271 3324
3272 mutex_lock(&fs_info->chunk_mutex);
3273
3274 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3325 flags = btrfs_reduce_alloc_profile(extent_root, flags);
3275 3326
3276 space_info = __find_space_info(extent_root->fs_info, flags); 3327 space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3281,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3281 } 3332 }
3282 BUG_ON(!space_info); 3333 BUG_ON(!space_info);
3283 3334
3335again:
3284 spin_lock(&space_info->lock); 3336 spin_lock(&space_info->lock);
3285 if (space_info->force_alloc) 3337 if (space_info->force_alloc)
3286 force = 1; 3338 force = space_info->force_alloc;
3287 if (space_info->full) { 3339 if (space_info->full) {
3288 spin_unlock(&space_info->lock); 3340 spin_unlock(&space_info->lock);
3289 goto out; 3341 return 0;
3290 } 3342 }
3291 3343
3292 if (!force && !should_alloc_chunk(extent_root, space_info, 3344 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3293 alloc_bytes)) {
3294 spin_unlock(&space_info->lock); 3345 spin_unlock(&space_info->lock);
3295 goto out; 3346 return 0;
3347 } else if (space_info->chunk_alloc) {
3348 wait_for_alloc = 1;
3349 } else {
3350 space_info->chunk_alloc = 1;
3296 } 3351 }
3352
3297 spin_unlock(&space_info->lock); 3353 spin_unlock(&space_info->lock);
3298 3354
3355 mutex_lock(&fs_info->chunk_mutex);
3356
3357 /*
3358 * The chunk_mutex is held throughout the entirety of a chunk
3359 * allocation, so once we've acquired the chunk_mutex we know that the
3360 * other guy is done and we need to recheck and see if we should
3361 * allocate.
3362 */
3363 if (wait_for_alloc) {
3364 mutex_unlock(&fs_info->chunk_mutex);
3365 wait_for_alloc = 0;
3366 goto again;
3367 }
3368
3299 /* 3369 /*
3300 * If we have mixed data/metadata chunks we want to make sure we keep 3370 * If we have mixed data/metadata chunks we want to make sure we keep
3301 * allocating mixed chunks instead of individual chunks. 3371 * allocating mixed chunks instead of individual chunks.
@@ -3321,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3321 space_info->full = 1; 3391 space_info->full = 1;
3322 else 3392 else
3323 ret = 1; 3393 ret = 1;
3324 space_info->force_alloc = 0; 3394
3395 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3396 space_info->chunk_alloc = 0;
3325 spin_unlock(&space_info->lock); 3397 spin_unlock(&space_info->lock);
3326out:
3327 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3398 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3328 return ret; 3399 return ret;
3329} 3400}
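
The do_chunk_alloc() rework above serializes concurrent allocators without holding chunk_mutex across the should-allocate check: a racing caller notes chunk_alloc is set, waits by taking and dropping chunk_mutex, and re-evaluates from the top. A pthreads sketch of that pattern (locks and flags are renamed, and the space_full recheck stands in for should_alloc_chunk()):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t chunk_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t info_lock = PTHREAD_MUTEX_INITIALIZER;
static int chunk_alloc_in_progress;
static int space_full;
static int chunks_allocated;

static void *alloc_chunk(void *arg)
{
	int wait_for_alloc = 0;

	(void)arg;
again:
	pthread_mutex_lock(&info_lock);
	if (space_full) {			/* recheck: the racer did the work */
		pthread_mutex_unlock(&info_lock);
		return NULL;
	}
	if (chunk_alloc_in_progress)
		wait_for_alloc = 1;		/* someone else is allocating */
	else
		chunk_alloc_in_progress = 1;	/* we are the allocator */
	pthread_mutex_unlock(&info_lock);

	pthread_mutex_lock(&chunk_mutex);
	if (wait_for_alloc) {
		/* the allocator holds chunk_mutex for the whole allocation,
		 * so once we get it they are done: drop it and re-evaluate */
		pthread_mutex_unlock(&chunk_mutex);
		wait_for_alloc = 0;
		goto again;
	}

	chunks_allocated++;			/* the actual allocation work */

	pthread_mutex_lock(&info_lock);
	space_full = 1;
	chunk_alloc_in_progress = 0;
	pthread_mutex_unlock(&info_lock);
	pthread_mutex_unlock(&chunk_mutex);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, alloc_chunk, NULL);
	pthread_create(&b, NULL, alloc_chunk, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("chunks_allocated=%d\n", chunks_allocated);	/* always 1 */
	return 0;
}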
@@ -3339,21 +3410,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3339 u64 reserved; 3410 u64 reserved;
3340 u64 max_reclaim; 3411 u64 max_reclaim;
3341 u64 reclaimed = 0; 3412 u64 reclaimed = 0;
3342 int pause = 1; 3413 long time_left;
3343 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3414 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3415 int loops = 0;
3416 unsigned long progress;
3344 3417
3345 block_rsv = &root->fs_info->delalloc_block_rsv; 3418 block_rsv = &root->fs_info->delalloc_block_rsv;
3346 space_info = block_rsv->space_info; 3419 space_info = block_rsv->space_info;
3347 3420
3348 smp_mb(); 3421 smp_mb();
3349 reserved = space_info->bytes_reserved; 3422 reserved = space_info->bytes_reserved;
3423 progress = space_info->reservation_progress;
3350 3424
3351 if (reserved == 0) 3425 if (reserved == 0)
3352 return 0; 3426 return 0;
3353 3427
3354 max_reclaim = min(reserved, to_reclaim); 3428 max_reclaim = min(reserved, to_reclaim);
3355 3429
3356 while (1) { 3430 while (loops < 1024) {
3357 /* have the flusher threads jump in and do some IO */ 3431 /* have the flusher threads jump in and do some IO */
3358 smp_mb(); 3432 smp_mb();
3359 nr_pages = min_t(unsigned long, nr_pages, 3433 nr_pages = min_t(unsigned long, nr_pages,
@@ -3366,17 +3440,31 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3366 reserved = space_info->bytes_reserved; 3440 reserved = space_info->bytes_reserved;
3367 spin_unlock(&space_info->lock); 3441 spin_unlock(&space_info->lock);
3368 3442
3443 loops++;
3444
3369 if (reserved == 0 || reclaimed >= max_reclaim) 3445 if (reserved == 0 || reclaimed >= max_reclaim)
3370 break; 3446 break;
3371 3447
3372 if (trans && trans->transaction->blocked) 3448 if (trans && trans->transaction->blocked)
3373 return -EAGAIN; 3449 return -EAGAIN;
3374 3450
3375 __set_current_state(TASK_INTERRUPTIBLE); 3451 time_left = schedule_timeout_interruptible(1);
3376 schedule_timeout(pause); 3452
3377 pause <<= 1; 3453 /* We were interrupted, exit */
3378 if (pause > HZ / 10) 3454 if (time_left)
3379 pause = HZ / 10; 3455 break;
3456
 3457		/* we've kicked the IO a few times; if anything has been freed,
3458 * exit. There is no sense in looping here for a long time
3459 * when we really need to commit the transaction, or there are
3460 * just too many writers without enough free space
3461 */
3462
3463 if (loops > 3) {
3464 smp_mb();
3465 if (progress != space_info->reservation_progress)
3466 break;
3467 }
3380 3468
3381 } 3469 }
3382 return reclaimed >= to_reclaim; 3470 return reclaimed >= to_reclaim;
@@ -3583,10 +3671,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3583 3671
3584 if (num_bytes > 0) { 3672 if (num_bytes > 0) {
3585 if (dest) { 3673 if (dest) {
3586 block_rsv_add_bytes(dest, num_bytes, 0); 3674 spin_lock(&dest->lock);
3587 } else { 3675 if (!dest->full) {
3676 u64 bytes_to_add;
3677
3678 bytes_to_add = dest->size - dest->reserved;
3679 bytes_to_add = min(num_bytes, bytes_to_add);
3680 dest->reserved += bytes_to_add;
3681 if (dest->reserved >= dest->size)
3682 dest->full = 1;
3683 num_bytes -= bytes_to_add;
3684 }
3685 spin_unlock(&dest->lock);
3686 }
3687 if (num_bytes) {
3588 spin_lock(&space_info->lock); 3688 spin_lock(&space_info->lock);
3589 space_info->bytes_reserved -= num_bytes; 3689 space_info->bytes_reserved -= num_bytes;
3690 space_info->reservation_progress++;
3590 spin_unlock(&space_info->lock); 3691 spin_unlock(&space_info->lock);
3591 } 3692 }
3592 } 3693 }
@@ -3721,11 +3822,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3721 return 0; 3822 return 0;
3722 } 3823 }
3723 3824
3724 WARN_ON(1);
3725 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3726 block_rsv->size, block_rsv->reserved,
3727 block_rsv->freed[0], block_rsv->freed[1]);
3728
3729 return -ENOSPC; 3825 return -ENOSPC;
3730} 3826}
3731 3827
@@ -3824,6 +3920,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3824 if (block_rsv->reserved >= block_rsv->size) { 3920 if (block_rsv->reserved >= block_rsv->size) {
3825 num_bytes = block_rsv->reserved - block_rsv->size; 3921 num_bytes = block_rsv->reserved - block_rsv->size;
3826 sinfo->bytes_reserved -= num_bytes; 3922 sinfo->bytes_reserved -= num_bytes;
3923 sinfo->reservation_progress++;
3827 block_rsv->reserved = block_rsv->size; 3924 block_rsv->reserved = block_rsv->size;
3828 block_rsv->full = 1; 3925 block_rsv->full = 1;
3829 } 3926 }
@@ -3968,6 +4065,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3968 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4065 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3969 u64 to_reserve; 4066 u64 to_reserve;
3970 int nr_extents; 4067 int nr_extents;
4068 int reserved_extents;
3971 int ret; 4069 int ret;
3972 4070
3973 if (btrfs_transaction_in_commit(root->fs_info)) 4071 if (btrfs_transaction_in_commit(root->fs_info))
@@ -3975,26 +4073,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3975 4073
3976 num_bytes = ALIGN(num_bytes, root->sectorsize); 4074 num_bytes = ALIGN(num_bytes, root->sectorsize);
3977 4075
3978 spin_lock(&BTRFS_I(inode)->accounting_lock);
3979 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4076 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3980 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 4077 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3981 nr_extents -= BTRFS_I(inode)->reserved_extents; 4078
4079 if (nr_extents > reserved_extents) {
4080 nr_extents -= reserved_extents;
3982 to_reserve = calc_trans_metadata_size(root, nr_extents); 4081 to_reserve = calc_trans_metadata_size(root, nr_extents);
3983 } else { 4082 } else {
3984 nr_extents = 0; 4083 nr_extents = 0;
3985 to_reserve = 0; 4084 to_reserve = 0;
3986 } 4085 }
3987 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3988 4086
3989 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4087 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3990 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4088 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3991 if (ret) 4089 if (ret)
3992 return ret; 4090 return ret;
3993 4091
3994 spin_lock(&BTRFS_I(inode)->accounting_lock); 4092 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3995 BTRFS_I(inode)->reserved_extents += nr_extents;
3996 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 4093 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3997 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3998 4094
3999 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4095 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4000 4096
@@ -4009,19 +4105,30 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4009 struct btrfs_root *root = BTRFS_I(inode)->root; 4105 struct btrfs_root *root = BTRFS_I(inode)->root;
4010 u64 to_free; 4106 u64 to_free;
4011 int nr_extents; 4107 int nr_extents;
4108 int reserved_extents;
4012 4109
4013 num_bytes = ALIGN(num_bytes, root->sectorsize); 4110 num_bytes = ALIGN(num_bytes, root->sectorsize);
4014 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4111 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4112 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4015 4113
4016 spin_lock(&BTRFS_I(inode)->accounting_lock); 4114 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4017 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4115 do {
4018 if (nr_extents < BTRFS_I(inode)->reserved_extents) { 4116 int old, new;
4019 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; 4117
4020 BTRFS_I(inode)->reserved_extents -= nr_extents; 4118 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4021 } else { 4119 if (nr_extents >= reserved_extents) {
4022 nr_extents = 0; 4120 nr_extents = 0;
4023 } 4121 break;
4024 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4122 }
4123 old = reserved_extents;
4124 nr_extents = reserved_extents - nr_extents;
4125 new = reserved_extents - nr_extents;
4126 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4127 reserved_extents, new);
4128 if (likely(old == reserved_extents))
4129 break;
4130 reserved_extents = old;
4131 } while (1);
4025 4132
4026 to_free = calc_csum_metadata_size(inode, num_bytes); 4133 to_free = calc_csum_metadata_size(inode, num_bytes);
4027 if (nr_extents > 0) 4134 if (nr_extents > 0)
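
The release path above drops the inode's accounting_lock in favor of an atomic_cmpxchg() loop: recompute how many extents can be released from the latest observed reserved_extents and retry until no other task has raced in between. A C11-atomics sketch of the same loop:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int reserved_extents = 10;

static int release_extents(int outstanding)
{
	int old = atomic_load(&reserved_extents);
	int to_free;

	do {
		if (outstanding >= old)
			return 0;	/* nothing extra is reserved */
		to_free = old - outstanding;
		/* on CAS failure, 'old' is refreshed and we recompute */
	} while (!atomic_compare_exchange_weak(&reserved_extents, &old,
					       old - to_free));
	return to_free;
}

int main(void)
{
	int freed = release_extents(4);	/* 4 extents still outstanding */

	printf("freed=%d remaining=%d\n", freed,
	       atomic_load(&reserved_extents));	/* freed=6 remaining=4 */
	return 0;
}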
@@ -4112,6 +4219,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4112 btrfs_set_block_group_used(&cache->item, old_val); 4219 btrfs_set_block_group_used(&cache->item, old_val);
4113 cache->reserved -= num_bytes; 4220 cache->reserved -= num_bytes;
4114 cache->space_info->bytes_reserved -= num_bytes; 4221 cache->space_info->bytes_reserved -= num_bytes;
4222 cache->space_info->reservation_progress++;
4115 cache->space_info->bytes_used += num_bytes; 4223 cache->space_info->bytes_used += num_bytes;
4116 cache->space_info->disk_used += num_bytes * factor; 4224 cache->space_info->disk_used += num_bytes * factor;
4117 spin_unlock(&cache->lock); 4225 spin_unlock(&cache->lock);
@@ -4163,6 +4271,7 @@ static int pin_down_extent(struct btrfs_root *root,
4163 if (reserved) { 4271 if (reserved) {
4164 cache->reserved -= num_bytes; 4272 cache->reserved -= num_bytes;
4165 cache->space_info->bytes_reserved -= num_bytes; 4273 cache->space_info->bytes_reserved -= num_bytes;
4274 cache->space_info->reservation_progress++;
4166 } 4275 }
4167 spin_unlock(&cache->lock); 4276 spin_unlock(&cache->lock);
4168 spin_unlock(&cache->space_info->lock); 4277 spin_unlock(&cache->space_info->lock);
@@ -4193,8 +4302,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
4193 * update size of reserved extents. this function may return -EAGAIN 4302 * update size of reserved extents. this function may return -EAGAIN
4194 * if 'reserve' is true or 'sinfo' is false. 4303 * if 'reserve' is true or 'sinfo' is false.
4195 */ 4304 */
4196static int update_reserved_bytes(struct btrfs_block_group_cache *cache, 4305int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4197 u64 num_bytes, int reserve, int sinfo) 4306 u64 num_bytes, int reserve, int sinfo)
4198{ 4307{
4199 int ret = 0; 4308 int ret = 0;
4200 if (sinfo) { 4309 if (sinfo) {
@@ -4213,6 +4322,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4213 space_info->bytes_readonly += num_bytes; 4322 space_info->bytes_readonly += num_bytes;
4214 cache->reserved -= num_bytes; 4323 cache->reserved -= num_bytes;
4215 space_info->bytes_reserved -= num_bytes; 4324 space_info->bytes_reserved -= num_bytes;
4325 space_info->reservation_progress++;
4216 } 4326 }
4217 spin_unlock(&cache->lock); 4327 spin_unlock(&cache->lock);
4218 spin_unlock(&space_info->lock); 4328 spin_unlock(&space_info->lock);
@@ -4332,7 +4442,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4332 if (ret) 4442 if (ret)
4333 break; 4443 break;
4334 4444
4335 ret = btrfs_discard_extent(root, start, end + 1 - start); 4445 if (btrfs_test_opt(root, DISCARD))
4446 ret = btrfs_discard_extent(root, start,
4447 end + 1 - start, NULL);
4336 4448
4337 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4449 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4338 unpin_extent_range(root, start, end); 4450 unpin_extent_range(root, start, end);
@@ -4673,10 +4785,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4673 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4785 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4674 4786
4675 btrfs_add_free_space(cache, buf->start, buf->len); 4787 btrfs_add_free_space(cache, buf->start, buf->len);
4676 ret = update_reserved_bytes(cache, buf->len, 0, 0); 4788 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
4677 if (ret == -EAGAIN) { 4789 if (ret == -EAGAIN) {
4678 /* block group became read-only */ 4790 /* block group became read-only */
4679 update_reserved_bytes(cache, buf->len, 0, 1); 4791 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4680 goto out; 4792 goto out;
4681 } 4793 }
4682 4794
@@ -4691,6 +4803,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4691 if (ret) { 4803 if (ret) {
4692 spin_lock(&cache->space_info->lock); 4804 spin_lock(&cache->space_info->lock);
4693 cache->space_info->bytes_reserved -= buf->len; 4805 cache->space_info->bytes_reserved -= buf->len;
4806 cache->space_info->reservation_progress++;
4694 spin_unlock(&cache->space_info->lock); 4807 spin_unlock(&cache->space_info->lock);
4695 } 4808 }
4696 goto out; 4809 goto out;
@@ -4712,6 +4825,11 @@ pin:
4712 } 4825 }
4713 } 4826 }
4714out: 4827out:
4828 /*
 4829 * We're deleting the buffer, so clear the corrupt flag since it doesn't
 4830 * matter anymore.
4831 */
4832 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4715 btrfs_put_block_group(cache); 4833 btrfs_put_block_group(cache);
4716} 4834}
4717 4835
@@ -5159,7 +5277,7 @@ checks:
5159 search_start - offset); 5277 search_start - offset);
5160 BUG_ON(offset > search_start); 5278 BUG_ON(offset > search_start);
5161 5279
5162 ret = update_reserved_bytes(block_group, num_bytes, 1, 5280 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
5163 (data & BTRFS_BLOCK_GROUP_DATA)); 5281 (data & BTRFS_BLOCK_GROUP_DATA));
5164 if (ret == -EAGAIN) { 5282 if (ret == -EAGAIN) {
5165 btrfs_add_free_space(block_group, offset, num_bytes); 5283 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -5250,11 +5368,13 @@ loop:
5250 5368
5251 if (allowed_chunk_alloc) { 5369 if (allowed_chunk_alloc) {
5252 ret = do_chunk_alloc(trans, root, num_bytes + 5370 ret = do_chunk_alloc(trans, root, num_bytes +
5253 2 * 1024 * 1024, data, 1); 5371 2 * 1024 * 1024, data,
5372 CHUNK_ALLOC_LIMITED);
5254 allowed_chunk_alloc = 0; 5373 allowed_chunk_alloc = 0;
5255 done_chunk_alloc = 1; 5374 done_chunk_alloc = 1;
5256 } else if (!done_chunk_alloc) { 5375 } else if (!done_chunk_alloc &&
5257 space_info->force_alloc = 1; 5376 space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
5377 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5258 } 5378 }
5259 5379
5260 if (loop < LOOP_NO_EMPTY_SIZE) { 5380 if (loop < LOOP_NO_EMPTY_SIZE) {
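
The bare 0/1 'force' argument to do_chunk_alloc() becomes a three-level policy here: CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED (some caller already saw pressure), and CHUNK_ALLOC_FORCE. A hedged sketch of how such a ladder can gate the allocation decision; the percentage thresholds below are illustrative, not taken from the patch:

enum chunk_alloc_ctl {
	CHUNK_ALLOC_NO_FORCE,	/* allocate only when clearly needed */
	CHUNK_ALLOC_LIMITED,	/* a caller hit pressure; be more willing */
	CHUNK_ALLOC_FORCE,	/* allocate unconditionally */
};

/* illustrative thresholds: decide whether a new chunk is warranted */
static int should_alloc_chunk(unsigned long long used,
			      unsigned long long total,
			      enum chunk_alloc_ctl force)
{
	if (force == CHUNK_ALLOC_FORCE)
		return 1;
	if (force == CHUNK_ALLOC_LIMITED)
		return used * 2 >= total;	/* more than half full */
	return used * 10 >= total * 8;		/* more than 80% full */
}
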
@@ -5340,7 +5460,8 @@ again:
5340 */ 5460 */
5341 if (empty_size || root->ref_cows) 5461 if (empty_size || root->ref_cows)
5342 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5462 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5343 num_bytes + 2 * 1024 * 1024, data, 0); 5463 num_bytes + 2 * 1024 * 1024, data,
5464 CHUNK_ALLOC_NO_FORCE);
5344 5465
5345 WARN_ON(num_bytes < root->sectorsize); 5466 WARN_ON(num_bytes < root->sectorsize);
5346 ret = find_free_extent(trans, root, num_bytes, empty_size, 5467 ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5352,10 +5473,10 @@ again:
5352 num_bytes = num_bytes & ~(root->sectorsize - 1); 5473 num_bytes = num_bytes & ~(root->sectorsize - 1);
5353 num_bytes = max(num_bytes, min_alloc_size); 5474 num_bytes = max(num_bytes, min_alloc_size);
5354 do_chunk_alloc(trans, root->fs_info->extent_root, 5475 do_chunk_alloc(trans, root->fs_info->extent_root,
5355 num_bytes, data, 1); 5476 num_bytes, data, CHUNK_ALLOC_FORCE);
5356 goto again; 5477 goto again;
5357 } 5478 }
5358 if (ret == -ENOSPC) { 5479 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5359 struct btrfs_space_info *sinfo; 5480 struct btrfs_space_info *sinfo;
5360 5481
5361 sinfo = __find_space_info(root->fs_info, data); 5482 sinfo = __find_space_info(root->fs_info, data);
@@ -5365,6 +5486,8 @@ again:
5365 dump_space_info(sinfo, num_bytes, 1); 5486 dump_space_info(sinfo, num_bytes, 1);
5366 } 5487 }
5367 5488
5489 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5490
5368 return ret; 5491 return ret;
5369} 5492}
5370 5493
@@ -5380,12 +5503,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5380 return -ENOSPC; 5503 return -ENOSPC;
5381 } 5504 }
5382 5505
5383 ret = btrfs_discard_extent(root, start, len); 5506 if (btrfs_test_opt(root, DISCARD))
5507 ret = btrfs_discard_extent(root, start, len, NULL);
5384 5508
5385 btrfs_add_free_space(cache, start, len); 5509 btrfs_add_free_space(cache, start, len);
5386 update_reserved_bytes(cache, len, 0, 1); 5510 btrfs_update_reserved_bytes(cache, len, 0, 1);
5387 btrfs_put_block_group(cache); 5511 btrfs_put_block_group(cache);
5388 5512
5513 trace_btrfs_reserved_extent_free(root, start, len);
5514
5389 return ret; 5515 return ret;
5390} 5516}
5391 5517
@@ -5412,7 +5538,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5412 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 5538 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5413 5539
5414 path = btrfs_alloc_path(); 5540 path = btrfs_alloc_path();
5415 BUG_ON(!path); 5541 if (!path)
5542 return -ENOMEM;
5416 5543
5417 path->leave_spinning = 1; 5544 path->leave_spinning = 1;
5418 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5545 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5582,7 +5709,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5582 put_caching_control(caching_ctl); 5709 put_caching_control(caching_ctl);
5583 } 5710 }
5584 5711
5585 ret = update_reserved_bytes(block_group, ins->offset, 1, 1); 5712 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
5586 BUG_ON(ret); 5713 BUG_ON(ret);
5587 btrfs_put_block_group(block_group); 5714 btrfs_put_block_group(block_group);
5588 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5715 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5633,6 +5760,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5633 struct btrfs_root *root, u32 blocksize) 5760 struct btrfs_root *root, u32 blocksize)
5634{ 5761{
5635 struct btrfs_block_rsv *block_rsv; 5762 struct btrfs_block_rsv *block_rsv;
5763 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5636 int ret; 5764 int ret;
5637 5765
5638 block_rsv = get_block_rsv(trans, root); 5766 block_rsv = get_block_rsv(trans, root);
@@ -5640,14 +5768,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5640 if (block_rsv->size == 0) { 5768 if (block_rsv->size == 0) {
5641 ret = reserve_metadata_bytes(trans, root, block_rsv, 5769 ret = reserve_metadata_bytes(trans, root, block_rsv,
5642 blocksize, 0); 5770 blocksize, 0);
5643 if (ret) 5771 /*
 5772 * If we couldn't reserve metadata bytes, try to use some from
5773 * the global reserve.
5774 */
5775 if (ret && block_rsv != global_rsv) {
5776 ret = block_rsv_use_bytes(global_rsv, blocksize);
5777 if (!ret)
5778 return global_rsv;
5779 return ERR_PTR(ret);
5780 } else if (ret) {
5644 return ERR_PTR(ret); 5781 return ERR_PTR(ret);
5782 }
5645 return block_rsv; 5783 return block_rsv;
5646 } 5784 }
5647 5785
5648 ret = block_rsv_use_bytes(block_rsv, blocksize); 5786 ret = block_rsv_use_bytes(block_rsv, blocksize);
5649 if (!ret) 5787 if (!ret)
5650 return block_rsv; 5788 return block_rsv;
5789 if (ret) {
5790 WARN_ON(1);
5791 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5792 0);
5793 if (!ret) {
5794 spin_lock(&block_rsv->lock);
5795 block_rsv->size += blocksize;
5796 spin_unlock(&block_rsv->lock);
5797 return block_rsv;
5798 } else if (ret && block_rsv != global_rsv) {
5799 ret = block_rsv_use_bytes(global_rsv, blocksize);
5800 if (!ret)
5801 return global_rsv;
5802 }
5803 }
5651 5804
5652 return ERR_PTR(-ENOSPC); 5805 return ERR_PTR(-ENOSPC);
5653} 5806}
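
use_block_rsv() now falls back to the global reserve when the transaction's own reserve cannot cover a tree block. Reduced to a standalone sketch, with struct and helper names simplified and no locking shown:

struct rsv { long long reserved; };

/* consume 'bytes' from r when available; 0 on success, -1 otherwise */
static int rsv_use_bytes(struct rsv *r, long long bytes)
{
	if (r->reserved < bytes)
		return -1;
	r->reserved -= bytes;
	return 0;
}

static struct rsv *pick_rsv(struct rsv *mine, struct rsv *global,
			    long long bytes)
{
	if (rsv_use_bytes(mine, bytes) == 0)
		return mine;
	/* primary reserve exhausted: borrow from the global reserve */
	if (mine != global && rsv_use_bytes(global, bytes) == 0)
		return global;
	return NULL;	/* the caller maps this to -ENOSPC */
}
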
@@ -5989,6 +6142,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5989 if (reada && level == 1) 6142 if (reada && level == 1)
5990 reada_walk_down(trans, root, wc, path); 6143 reada_walk_down(trans, root, wc, path);
5991 next = read_tree_block(root, bytenr, blocksize, generation); 6144 next = read_tree_block(root, bytenr, blocksize, generation);
6145 if (!next)
6146 return -EIO;
5992 btrfs_tree_lock(next); 6147 btrfs_tree_lock(next);
5993 btrfs_set_lock_blocking(next); 6148 btrfs_set_lock_blocking(next);
5994 } 6149 }
@@ -6221,6 +6376,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6221 BUG_ON(!wc); 6376 BUG_ON(!wc);
6222 6377
6223 trans = btrfs_start_transaction(tree_root, 0); 6378 trans = btrfs_start_transaction(tree_root, 0);
6379 BUG_ON(IS_ERR(trans));
6380
6224 if (block_rsv) 6381 if (block_rsv)
6225 trans->block_rsv = block_rsv; 6382 trans->block_rsv = block_rsv;
6226 6383
@@ -6318,6 +6475,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6318 6475
6319 btrfs_end_transaction_throttle(trans, tree_root); 6476 btrfs_end_transaction_throttle(trans, tree_root);
6320 trans = btrfs_start_transaction(tree_root, 0); 6477 trans = btrfs_start_transaction(tree_root, 0);
6478 BUG_ON(IS_ERR(trans));
6321 if (block_rsv) 6479 if (block_rsv)
6322 trans->block_rsv = block_rsv; 6480 trans->block_rsv = block_rsv;
6323 } 6481 }
@@ -6377,10 +6535,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6377 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6535 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6378 6536
6379 path = btrfs_alloc_path(); 6537 path = btrfs_alloc_path();
6380 BUG_ON(!path); 6538 if (!path)
6539 return -ENOMEM;
6381 6540
6382 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6541 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6383 BUG_ON(!wc); 6542 if (!wc) {
6543 btrfs_free_path(path);
6544 return -ENOMEM;
6545 }
6384 6546
6385 btrfs_assert_tree_locked(parent); 6547 btrfs_assert_tree_locked(parent);
6386 parent_level = btrfs_header_level(parent); 6548 parent_level = btrfs_header_level(parent);
@@ -6446,6 +6608,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6446 int ret = 0; 6608 int ret = 0;
6447 6609
6448 ra = kzalloc(sizeof(*ra), GFP_NOFS); 6610 ra = kzalloc(sizeof(*ra), GFP_NOFS);
6611 if (!ra)
6612 return -ENOMEM;
6449 6613
6450 mutex_lock(&inode->i_mutex); 6614 mutex_lock(&inode->i_mutex);
6451 first_index = start >> PAGE_CACHE_SHIFT; 6615 first_index = start >> PAGE_CACHE_SHIFT;
@@ -6531,7 +6695,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
6531 u64 end = start + extent_key->offset - 1; 6695 u64 end = start + extent_key->offset - 1;
6532 6696
6533 em = alloc_extent_map(GFP_NOFS); 6697 em = alloc_extent_map(GFP_NOFS);
6534 BUG_ON(!em || IS_ERR(em)); 6698 BUG_ON(!em);
6535 6699
6536 em->start = start; 6700 em->start = start;
6537 em->len = extent_key->offset; 6701 em->len = extent_key->offset;
@@ -6836,7 +7000,11 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6836 } 7000 }
6837 7001
6838 path = btrfs_alloc_path(); 7002 path = btrfs_alloc_path();
6839 BUG_ON(!path); 7003 if (!path) {
7004 if (exts != *extents)
7005 kfree(exts);
7006 return -ENOMEM;
7007 }
6840 7008
6841 cur_pos = extent_key->objectid - offset; 7009 cur_pos = extent_key->objectid - offset;
6842 last_byte = extent_key->objectid + extent_key->offset; 7010 last_byte = extent_key->objectid + extent_key->offset;
@@ -6878,6 +7046,10 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6878 struct disk_extent *old = exts; 7046 struct disk_extent *old = exts;
6879 max *= 2; 7047 max *= 2;
6880 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 7048 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
7049 if (!exts) {
7050 ret = -ENOMEM;
7051 goto out;
7052 }
6881 memcpy(exts, old, sizeof(*exts) * nr); 7053 memcpy(exts, old, sizeof(*exts) * nr);
6882 if (old != *extents) 7054 if (old != *extents)
6883 kfree(old); 7055 kfree(old);
@@ -7360,7 +7532,8 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7360 int ret; 7532 int ret;
7361 7533
7362 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 7534 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7363 BUG_ON(!new_extent); 7535 if (!new_extent)
7536 return -ENOMEM;
7364 7537
7365 ref = btrfs_lookup_leaf_ref(root, leaf->start); 7538 ref = btrfs_lookup_leaf_ref(root, leaf->start);
7366 BUG_ON(!ref); 7539 BUG_ON(!ref);
@@ -7477,7 +7650,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7477 BUG_ON(reloc_root->commit_root != NULL); 7650 BUG_ON(reloc_root->commit_root != NULL);
7478 while (1) { 7651 while (1) {
7479 trans = btrfs_join_transaction(root, 1); 7652 trans = btrfs_join_transaction(root, 1);
7480 BUG_ON(!trans); 7653 BUG_ON(IS_ERR(trans));
7481 7654
7482 mutex_lock(&root->fs_info->drop_mutex); 7655 mutex_lock(&root->fs_info->drop_mutex);
7483 ret = btrfs_drop_snapshot(trans, reloc_root); 7656 ret = btrfs_drop_snapshot(trans, reloc_root);
@@ -7535,7 +7708,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7535 7708
7536 if (found) { 7709 if (found) {
7537 trans = btrfs_start_transaction(root, 1); 7710 trans = btrfs_start_transaction(root, 1);
7538 BUG_ON(!trans); 7711 BUG_ON(IS_ERR(trans));
7539 ret = btrfs_commit_transaction(trans, root); 7712 ret = btrfs_commit_transaction(trans, root);
7540 BUG_ON(ret); 7713 BUG_ON(ret);
7541 } 7714 }
@@ -7546,7 +7719,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7546 7719
7547 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 7720 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7548 BUG_ON(!reloc_root); 7721 BUG_ON(!reloc_root);
7549 btrfs_orphan_cleanup(reloc_root); 7722 ret = btrfs_orphan_cleanup(reloc_root);
7723 BUG_ON(ret);
7550 return 0; 7724 return 0;
7551} 7725}
7552 7726
@@ -7564,7 +7738,8 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7564 return 0; 7738 return 0;
7565 7739
7566 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 7740 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7567 BUG_ON(!root_item); 7741 if (!root_item)
7742 return -ENOMEM;
7568 7743
7569 ret = btrfs_copy_root(trans, root, root->commit_root, 7744 ret = btrfs_copy_root(trans, root, root->commit_root,
7570 &eb, BTRFS_TREE_RELOC_OBJECTID); 7745 &eb, BTRFS_TREE_RELOC_OBJECTID);
@@ -7590,7 +7765,7 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7590 7765
7591 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 7766 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7592 &root_key); 7767 &root_key);
7593 BUG_ON(!reloc_root); 7768 BUG_ON(IS_ERR(reloc_root));
7594 reloc_root->last_trans = trans->transid; 7769 reloc_root->last_trans = trans->transid;
7595 reloc_root->commit_root = NULL; 7770 reloc_root->commit_root = NULL;
7596 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; 7771 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
@@ -7779,7 +7954,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7779 7954
7780 7955
7781 trans = btrfs_start_transaction(extent_root, 1); 7956 trans = btrfs_start_transaction(extent_root, 1);
7782 BUG_ON(!trans); 7957 BUG_ON(IS_ERR(trans));
7783 7958
7784 if (extent_key->objectid == 0) { 7959 if (extent_key->objectid == 0) {
7785 ret = del_extent_zero(trans, extent_root, path, extent_key); 7960 ret = del_extent_zero(trans, extent_root, path, extent_key);
@@ -7843,6 +8018,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7843 8018
7844 eb = read_tree_block(found_root, block_start, 8019 eb = read_tree_block(found_root, block_start,
7845 block_size, 0); 8020 block_size, 0);
8021 if (!eb) {
8022 ret = -EIO;
8023 goto out;
8024 }
7846 btrfs_tree_lock(eb); 8025 btrfs_tree_lock(eb);
7847 BUG_ON(level != btrfs_header_level(eb)); 8026 BUG_ON(level != btrfs_header_level(eb));
7848 8027
@@ -7970,13 +8149,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7970 8149
7971 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 8150 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7972 sinfo->bytes_may_use + sinfo->bytes_readonly + 8151 sinfo->bytes_may_use + sinfo->bytes_readonly +
7973 cache->reserved_pinned + num_bytes < sinfo->total_bytes) { 8152 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7974 sinfo->bytes_readonly += num_bytes; 8153 sinfo->bytes_readonly += num_bytes;
7975 sinfo->bytes_reserved += cache->reserved_pinned; 8154 sinfo->bytes_reserved += cache->reserved_pinned;
7976 cache->reserved_pinned = 0; 8155 cache->reserved_pinned = 0;
7977 cache->ro = 1; 8156 cache->ro = 1;
7978 ret = 0; 8157 ret = 0;
7979 } 8158 }
8159
7980 spin_unlock(&cache->lock); 8160 spin_unlock(&cache->lock);
7981 spin_unlock(&sinfo->lock); 8161 spin_unlock(&sinfo->lock);
7982 return ret; 8162 return ret;
@@ -7997,13 +8177,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7997 8177
7998 alloc_flags = update_block_group_flags(root, cache->flags); 8178 alloc_flags = update_block_group_flags(root, cache->flags);
7999 if (alloc_flags != cache->flags) 8179 if (alloc_flags != cache->flags)
8000 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8180 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8181 CHUNK_ALLOC_FORCE);
8001 8182
8002 ret = set_block_group_ro(cache); 8183 ret = set_block_group_ro(cache);
8003 if (!ret) 8184 if (!ret)
8004 goto out; 8185 goto out;
8005 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8186 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8006 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8187 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8188 CHUNK_ALLOC_FORCE);
8007 if (ret < 0) 8189 if (ret < 0)
8008 goto out; 8190 goto out;
8009 ret = set_block_group_ro(cache); 8191 ret = set_block_group_ro(cache);
@@ -8012,6 +8194,70 @@ out:
8012 return ret; 8194 return ret;
8013} 8195}
8014 8196
8197int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8198 struct btrfs_root *root, u64 type)
8199{
8200 u64 alloc_flags = get_alloc_profile(root, type);
8201 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8202 CHUNK_ALLOC_FORCE);
8203}
8204
8205/*
 8206 * helper to account the unused space of all the readonly block groups in the
8207 * list. takes mirrors into account.
8208 */
8209static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8210{
8211 struct btrfs_block_group_cache *block_group;
8212 u64 free_bytes = 0;
8213 int factor;
8214
8215 list_for_each_entry(block_group, groups_list, list) {
8216 spin_lock(&block_group->lock);
8217
8218 if (!block_group->ro) {
8219 spin_unlock(&block_group->lock);
8220 continue;
8221 }
8222
8223 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8224 BTRFS_BLOCK_GROUP_RAID10 |
8225 BTRFS_BLOCK_GROUP_DUP))
8226 factor = 2;
8227 else
8228 factor = 1;
8229
8230 free_bytes += (block_group->key.offset -
8231 btrfs_block_group_used(&block_group->item)) *
8232 factor;
8233
8234 spin_unlock(&block_group->lock);
8235 }
8236
8237 return free_bytes;
8238}
8239
8240/*
 8241 * helper to account the unused space of all the readonly block groups in the
8242 * space_info. takes mirrors into account.
8243 */
8244u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8245{
8246 int i;
8247 u64 free_bytes = 0;
8248
8249 spin_lock(&sinfo->lock);
8250
 8251 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8252 if (!list_empty(&sinfo->block_groups[i]))
8253 free_bytes += __btrfs_get_ro_block_group_free_space(
8254 &sinfo->block_groups[i]);
8255
8256 spin_unlock(&sinfo->lock);
8257
8258 return free_bytes;
8259}
8260
8015int btrfs_set_block_group_rw(struct btrfs_root *root, 8261int btrfs_set_block_group_rw(struct btrfs_root *root,
8016 struct btrfs_block_group_cache *cache) 8262 struct btrfs_block_group_cache *cache)
8017{ 8263{
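
Both new helpers report free space inside read-only block groups and double the contribution of profiles that keep two copies of every byte (RAID1, RAID10, DUP). The arithmetic, as a standalone sketch with abbreviated field names and a stand-in flag mask:

struct bg {
	unsigned long long size;	/* key.offset in the patch */
	unsigned long long used;	/* btrfs_block_group_used() */
	unsigned long flags;
	int ro;
};

#define MIRRORED_MASK 0x30UL	/* stand-in for RAID1 | RAID10 | DUP */

static unsigned long long ro_free_bytes(const struct bg *g, int n)
{
	unsigned long long free_bytes = 0;
	int i;

	for (i = 0; i < n; i++) {
		int factor = (g[i].flags & MIRRORED_MASK) ? 2 : 1;

		if (!g[i].ro)
			continue;
		free_bytes += (g[i].size - g[i].used) * factor;
	}
	return free_bytes;
}
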
@@ -8092,7 +8338,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8092 mutex_lock(&root->fs_info->chunk_mutex); 8338 mutex_lock(&root->fs_info->chunk_mutex);
8093 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8339 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8094 u64 min_free = btrfs_block_group_used(&block_group->item); 8340 u64 min_free = btrfs_block_group_used(&block_group->item);
8095 u64 dev_offset, max_avail; 8341 u64 dev_offset;
8096 8342
8097 /* 8343 /*
8098 * check to make sure we can actually find a chunk with enough 8344 * check to make sure we can actually find a chunk with enough
@@ -8100,7 +8346,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8100 */ 8346 */
8101 if (device->total_bytes > device->bytes_used + min_free) { 8347 if (device->total_bytes > device->bytes_used + min_free) {
8102 ret = find_free_dev_extent(NULL, device, min_free, 8348 ret = find_free_dev_extent(NULL, device, min_free,
8103 &dev_offset, &max_avail); 8349 &dev_offset, NULL);
8104 if (!ret) 8350 if (!ret)
8105 break; 8351 break;
8106 ret = -1; 8352 ret = -1;
@@ -8213,6 +8459,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8213 if (block_group->cached == BTRFS_CACHE_STARTED) 8459 if (block_group->cached == BTRFS_CACHE_STARTED)
8214 wait_block_group_cache_done(block_group); 8460 wait_block_group_cache_done(block_group);
8215 8461
8462 /*
8463 * We haven't cached this block group, which means we could
8464 * possibly have excluded extents on this block group.
8465 */
8466 if (block_group->cached == BTRFS_CACHE_NO)
8467 free_excluded_extents(info->extent_root, block_group);
8468
8216 btrfs_remove_free_space_cache(block_group); 8469 btrfs_remove_free_space_cache(block_group);
8217 btrfs_put_block_group(block_group); 8470 btrfs_put_block_group(block_group);
8218 8471
@@ -8328,6 +8581,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8328 cache->sectorsize = root->sectorsize; 8581 cache->sectorsize = root->sectorsize;
8329 8582
8330 /* 8583 /*
8584 * We need to exclude the super stripes now so that the space
8585 * info has super bytes accounted for, otherwise we'll think
8586 * we have more space than we actually do.
8587 */
8588 exclude_super_stripes(root, cache);
8589
8590 /*
8331 * check for two cases, either we are full, and therefore 8591 * check for two cases, either we are full, and therefore
8332 * don't need to bother with the caching work since we won't 8592 * don't need to bother with the caching work since we won't
8333 * find any space, or we are empty, and we can just add all 8593 * find any space, or we are empty, and we can just add all
@@ -8335,12 +8595,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8335 * time, particularly in the full case. 8595 * time, particularly in the full case.
8336 */ 8596 */
8337 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8597 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8338 exclude_super_stripes(root, cache);
8339 cache->last_byte_to_unpin = (u64)-1; 8598 cache->last_byte_to_unpin = (u64)-1;
8340 cache->cached = BTRFS_CACHE_FINISHED; 8599 cache->cached = BTRFS_CACHE_FINISHED;
8341 free_excluded_extents(root, cache); 8600 free_excluded_extents(root, cache);
8342 } else if (btrfs_block_group_used(&cache->item) == 0) { 8601 } else if (btrfs_block_group_used(&cache->item) == 0) {
8343 exclude_super_stripes(root, cache);
8344 cache->last_byte_to_unpin = (u64)-1; 8602 cache->last_byte_to_unpin = (u64)-1;
8345 cache->cached = BTRFS_CACHE_FINISHED; 8603 cache->cached = BTRFS_CACHE_FINISHED;
8346 add_new_free_space(cache, root->fs_info, 8604 add_new_free_space(cache, root->fs_info,
@@ -8482,6 +8740,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8482 BUG_ON(!block_group); 8740 BUG_ON(!block_group);
8483 BUG_ON(!block_group->ro); 8741 BUG_ON(!block_group->ro);
8484 8742
8743 /*
8744 * Free the reserved super bytes from this block group before
8745 * remove it.
8746 */
8747 free_excluded_extents(root, block_group);
8748
8485 memcpy(&key, &block_group->key, sizeof(key)); 8749 memcpy(&key, &block_group->key, sizeof(key));
8486 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 8750 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8487 BTRFS_BLOCK_GROUP_RAID1 | 8751 BTRFS_BLOCK_GROUP_RAID1 |
@@ -8584,3 +8848,85 @@ out:
8584 btrfs_free_path(path); 8848 btrfs_free_path(path);
8585 return ret; 8849 return ret;
8586} 8850}
8851
8852int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8853{
8854 struct btrfs_space_info *space_info;
8855 int ret;
8856
8857 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0,
8858 &space_info);
8859 if (ret)
8860 return ret;
8861
8862 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0,
8863 &space_info);
8864 if (ret)
8865 return ret;
8866
8867 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0,
8868 &space_info);
8869 if (ret)
8870 return ret;
8871
8872 return ret;
8873}
8874
8875int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8876{
8877 return unpin_extent_range(root, start, end);
8878}
8879
8880int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8881 u64 num_bytes, u64 *actual_bytes)
8882{
8883 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8884}
8885
8886int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8887{
8888 struct btrfs_fs_info *fs_info = root->fs_info;
8889 struct btrfs_block_group_cache *cache = NULL;
8890 u64 group_trimmed;
8891 u64 start;
8892 u64 end;
8893 u64 trimmed = 0;
8894 int ret = 0;
8895
8896 cache = btrfs_lookup_block_group(fs_info, range->start);
8897
8898 while (cache) {
8899 if (cache->key.objectid >= (range->start + range->len)) {
8900 btrfs_put_block_group(cache);
8901 break;
8902 }
8903
8904 start = max(range->start, cache->key.objectid);
8905 end = min(range->start + range->len,
8906 cache->key.objectid + cache->key.offset);
8907
8908 if (end - start >= range->minlen) {
8909 if (!block_group_cache_done(cache)) {
8910 ret = cache_block_group(cache, NULL, root, 0);
8911 if (!ret)
8912 wait_block_group_cache_done(cache);
8913 }
8914 ret = btrfs_trim_block_group(cache,
8915 &group_trimmed,
8916 start,
8917 end,
8918 range->minlen);
8919
8920 trimmed += group_trimmed;
8921 if (ret) {
8922 btrfs_put_block_group(cache);
8923 break;
8924 }
8925 }
8926
8927 cache = next_block_group(fs_info->tree_root, cache);
8928 }
8929
8930 range->len = trimmed;
8931 return ret;
8932}
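
btrfs_trim_fs() walks every block group that intersects the caller's fstrim_range, clamps the range to the group, and skips pieces shorter than minlen. The clamp itself is plain interval intersection; a minimal sketch with illustrative names:

struct range { unsigned long long start, len; };

/*
 * Intersect the user range [r->start, r->start + r->len) with a block
 * group [gstart, gend); returns 0 and fills [*s, *e) on overlap,
 * -1 when the two are disjoint.
 */
static int clamp_to_group(const struct range *r,
			  unsigned long long gstart, unsigned long long gend,
			  unsigned long long *s, unsigned long long *e)
{
	unsigned long long rend = r->start + r->len;

	*s = r->start > gstart ? r->start : gstart;
	*e = rend < gend ? rend : gend;
	return *s < *e ? 0 : -1;
}
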
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e86b9f36507..315138605088 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state,
690 } 690 }
691} 691}
692 692
693static void uncache_state(struct extent_state **cached_ptr)
694{
695 if (cached_ptr && (*cached_ptr)) {
696 struct extent_state *state = *cached_ptr;
697 *cached_ptr = NULL;
698 free_extent_state(state);
699 }
700}
701
693/* 702/*
694 * set some bits on a range in the tree. This may require allocations or 703 * set some bits on a range in the tree. This may require allocations or
695 * sleeping, so the gfp mask is used to indicate what is allowed. 704 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
940} 949}
941 950
942int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 951int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
943 gfp_t mask) 952 struct extent_state **cached_state, gfp_t mask)
944{ 953{
945 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 954 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
946 NULL, mask); 955 NULL, cached_state, mask);
947} 956}
948 957
949static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 958static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1012 mask); 1021 mask);
1013} 1022}
1014 1023
1015int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1024int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1016 gfp_t mask)
1017{ 1025{
1018 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1026 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1019 mask); 1027 mask);
@@ -1433,12 +1441,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1433 */ 1441 */
1434u64 count_range_bits(struct extent_io_tree *tree, 1442u64 count_range_bits(struct extent_io_tree *tree,
1435 u64 *start, u64 search_end, u64 max_bytes, 1443 u64 *start, u64 search_end, u64 max_bytes,
1436 unsigned long bits) 1444 unsigned long bits, int contig)
1437{ 1445{
1438 struct rb_node *node; 1446 struct rb_node *node;
1439 struct extent_state *state; 1447 struct extent_state *state;
1440 u64 cur_start = *start; 1448 u64 cur_start = *start;
1441 u64 total_bytes = 0; 1449 u64 total_bytes = 0;
1450 u64 last = 0;
1442 int found = 0; 1451 int found = 0;
1443 1452
1444 if (search_end <= cur_start) { 1453 if (search_end <= cur_start) {
@@ -1463,7 +1472,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1463 state = rb_entry(node, struct extent_state, rb_node); 1472 state = rb_entry(node, struct extent_state, rb_node);
1464 if (state->start > search_end) 1473 if (state->start > search_end)
1465 break; 1474 break;
1466 if (state->end >= cur_start && (state->state & bits)) { 1475 if (contig && found && state->start > last + 1)
1476 break;
1477 if (state->end >= cur_start && (state->state & bits) == bits) {
1467 total_bytes += min(search_end, state->end) + 1 - 1478 total_bytes += min(search_end, state->end) + 1 -
1468 max(cur_start, state->start); 1479 max(cur_start, state->start);
1469 if (total_bytes >= max_bytes) 1480 if (total_bytes >= max_bytes)
@@ -1472,6 +1483,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1472 *start = state->start; 1483 *start = state->start;
1473 found = 1; 1484 found = 1;
1474 } 1485 }
1486 last = state->end;
1487 } else if (contig && found) {
1488 break;
1475 } 1489 }
1476 node = rb_next(node); 1490 node = rb_next(node);
1477 if (!node) 1491 if (!node)
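
With the new 'contig' flag, count_range_bits() stops at the first gap between matching states instead of summing every match in the search window, which lets callers measure a single contiguous region. A flat-array sketch of the two behaviours; the array stands in for the rb-tree of extent_state records:

struct state { unsigned long long start, end; unsigned bits; };

static unsigned long long count_bits(const struct state *s, int n,
				     unsigned bits, int contig)
{
	unsigned long long total = 0, last = 0;
	int i, found = 0;

	for (i = 0; i < n; i++) {
		if (contig && found && s[i].start > last + 1)
			break;			/* gap ends the contiguous run */
		if ((s[i].bits & bits) == bits) {
			total += s[i].end + 1 - s[i].start;
			found = 1;
			last = s[i].end;
		} else if (contig && found) {
			break;			/* non-matching state ends it too */
		}
	}
	return total;
}
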
@@ -1729,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1729 1743
1730 do { 1744 do {
1731 struct page *page = bvec->bv_page; 1745 struct page *page = bvec->bv_page;
1746 struct extent_state *cached = NULL;
1747 struct extent_state *state;
1748
1732 tree = &BTRFS_I(page->mapping->host)->io_tree; 1749 tree = &BTRFS_I(page->mapping->host)->io_tree;
1733 1750
1734 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1751 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1743,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1743 if (++bvec <= bvec_end) 1760 if (++bvec <= bvec_end)
1744 prefetchw(&bvec->bv_page->flags); 1761 prefetchw(&bvec->bv_page->flags);
1745 1762
1763 spin_lock(&tree->lock);
1764 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1765 if (state && state->start == start) {
1766 /*
1767 * take a reference on the state, unlock will drop
1768 * the ref
1769 */
1770 cache_state(state, &cached);
1771 }
1772 spin_unlock(&tree->lock);
1773
1746 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1774 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1747 ret = tree->ops->readpage_end_io_hook(page, start, end, 1775 ret = tree->ops->readpage_end_io_hook(page, start, end,
1748 NULL); 1776 state);
1749 if (ret) 1777 if (ret)
1750 uptodate = 0; 1778 uptodate = 0;
1751 } 1779 }
@@ -1758,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1758 test_bit(BIO_UPTODATE, &bio->bi_flags); 1786 test_bit(BIO_UPTODATE, &bio->bi_flags);
1759 if (err) 1787 if (err)
1760 uptodate = 0; 1788 uptodate = 0;
1789 uncache_state(&cached);
1761 continue; 1790 continue;
1762 } 1791 }
1763 } 1792 }
1764 1793
1765 if (uptodate) { 1794 if (uptodate) {
1766 set_extent_uptodate(tree, start, end, 1795 set_extent_uptodate(tree, start, end, &cached,
1767 GFP_ATOMIC); 1796 GFP_ATOMIC);
1768 } 1797 }
1769 unlock_extent(tree, start, end, GFP_ATOMIC); 1798 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1770 1799
1771 if (whole_page) { 1800 if (whole_page) {
1772 if (uptodate) { 1801 if (uptodate) {
@@ -1805,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1805 1834
1806 do { 1835 do {
1807 struct page *page = bvec->bv_page; 1836 struct page *page = bvec->bv_page;
1837 struct extent_state *cached = NULL;
1808 tree = &BTRFS_I(page->mapping->host)->io_tree; 1838 tree = &BTRFS_I(page->mapping->host)->io_tree;
1809 1839
1810 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1840 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1815,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1815 prefetchw(&bvec->bv_page->flags); 1845 prefetchw(&bvec->bv_page->flags);
1816 1846
1817 if (uptodate) { 1847 if (uptodate) {
1818 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1848 set_extent_uptodate(tree, start, end, &cached,
1849 GFP_ATOMIC);
1819 } else { 1850 } else {
1820 ClearPageUptodate(page); 1851 ClearPageUptodate(page);
1821 SetPageError(page); 1852 SetPageError(page);
1822 } 1853 }
1823 1854
1824 unlock_extent(tree, start, end, GFP_ATOMIC); 1855 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1825 1856
1826 } while (bvec >= bio->bi_io_vec); 1857 } while (bvec >= bio->bi_io_vec);
1827 1858
@@ -1865,7 +1896,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1865 bio_get(bio); 1896 bio_get(bio);
1866 1897
1867 if (tree->ops && tree->ops->submit_bio_hook) 1898 if (tree->ops && tree->ops->submit_bio_hook)
1868 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1899 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1869 mirror_num, bio_flags, start); 1900 mirror_num, bio_flags, start);
1870 else 1901 else
1871 submit_bio(rw, bio); 1902 submit_bio(rw, bio);
@@ -1920,6 +1951,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1920 nr = bio_get_nr_vecs(bdev); 1951 nr = bio_get_nr_vecs(bdev);
1921 1952
1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1953 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1954 if (!bio)
1955 return -ENOMEM;
1923 1956
1924 bio_add_page(bio, page, page_size, offset); 1957 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1958 bio->bi_end_io = end_io_func;
@@ -1944,6 +1977,7 @@ void set_page_extent_mapped(struct page *page)
1944 1977
1945static void set_page_extent_head(struct page *page, unsigned long len) 1978static void set_page_extent_head(struct page *page, unsigned long len)
1946{ 1979{
1980 WARN_ON(!PagePrivate(page));
1947 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1981 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1948} 1982}
1949 1983
@@ -2007,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2007 while (cur <= end) { 2041 while (cur <= end) {
2008 if (cur >= last_byte) { 2042 if (cur >= last_byte) {
2009 char *userpage; 2043 char *userpage;
2044 struct extent_state *cached = NULL;
2045
2010 iosize = PAGE_CACHE_SIZE - page_offset; 2046 iosize = PAGE_CACHE_SIZE - page_offset;
2011 userpage = kmap_atomic(page, KM_USER0); 2047 userpage = kmap_atomic(page, KM_USER0);
2012 memset(userpage + page_offset, 0, iosize); 2048 memset(userpage + page_offset, 0, iosize);
2013 flush_dcache_page(page); 2049 flush_dcache_page(page);
2014 kunmap_atomic(userpage, KM_USER0); 2050 kunmap_atomic(userpage, KM_USER0);
2015 set_extent_uptodate(tree, cur, cur + iosize - 1, 2051 set_extent_uptodate(tree, cur, cur + iosize - 1,
2016 GFP_NOFS); 2052 &cached, GFP_NOFS);
2017 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2053 unlock_extent_cached(tree, cur, cur + iosize - 1,
2054 &cached, GFP_NOFS);
2018 break; 2055 break;
2019 } 2056 }
2020 em = get_extent(inode, page, page_offset, cur, 2057 em = get_extent(inode, page, page_offset, cur,
@@ -2028,8 +2065,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2028 BUG_ON(extent_map_end(em) <= cur); 2065 BUG_ON(extent_map_end(em) <= cur);
2029 BUG_ON(end < cur); 2066 BUG_ON(end < cur);
2030 2067
2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2068 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032 this_bio_flag = EXTENT_BIO_COMPRESSED; 2069 this_bio_flag = EXTENT_BIO_COMPRESSED;
2070 extent_set_compress_type(&this_bio_flag,
2071 em->compress_type);
2072 }
2033 2073
2034 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2074 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2035 cur_end = min(extent_map_end(em) - 1, end); 2075 cur_end = min(extent_map_end(em) - 1, end);
@@ -2051,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2051 /* we've found a hole, just zero and go on */ 2091 /* we've found a hole, just zero and go on */
2052 if (block_start == EXTENT_MAP_HOLE) { 2092 if (block_start == EXTENT_MAP_HOLE) {
2053 char *userpage; 2093 char *userpage;
2094 struct extent_state *cached = NULL;
2095
2054 userpage = kmap_atomic(page, KM_USER0); 2096 userpage = kmap_atomic(page, KM_USER0);
2055 memset(userpage + page_offset, 0, iosize); 2097 memset(userpage + page_offset, 0, iosize);
2056 flush_dcache_page(page); 2098 flush_dcache_page(page);
2057 kunmap_atomic(userpage, KM_USER0); 2099 kunmap_atomic(userpage, KM_USER0);
2058 2100
2059 set_extent_uptodate(tree, cur, cur + iosize - 1, 2101 set_extent_uptodate(tree, cur, cur + iosize - 1,
2060 GFP_NOFS); 2102 &cached, GFP_NOFS);
2061 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2103 unlock_extent_cached(tree, cur, cur + iosize - 1,
2104 &cached, GFP_NOFS);
2062 cur = cur + iosize; 2105 cur = cur + iosize;
2063 page_offset += iosize; 2106 page_offset += iosize;
2064 continue; 2107 continue;
@@ -2123,7 +2166,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2123 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2166 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2124 &bio_flags); 2167 &bio_flags);
2125 if (bio) 2168 if (bio)
2126 submit_one_bio(READ, bio, 0, bio_flags); 2169 ret = submit_one_bio(READ, bio, 0, bio_flags);
2127 return ret; 2170 return ret;
2128} 2171}
2129 2172
@@ -2176,10 +2219,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2176 unsigned long nr_written = 0; 2219 unsigned long nr_written = 0;
2177 2220
2178 if (wbc->sync_mode == WB_SYNC_ALL) 2221 if (wbc->sync_mode == WB_SYNC_ALL)
2179 write_flags = WRITE_SYNC_PLUG; 2222 write_flags = WRITE_SYNC;
2180 else 2223 else
2181 write_flags = WRITE; 2224 write_flags = WRITE;
2182 2225
2226 trace___extent_writepage(page, inode, wbc);
2227
2183 WARN_ON(!PageLocked(page)); 2228 WARN_ON(!PageLocked(page));
2184 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2229 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2185 if (page->index > end_index || 2230 if (page->index > end_index ||
@@ -2775,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
2775 iocount++; 2820 iocount++;
2776 block_start = block_start + iosize; 2821 block_start = block_start + iosize;
2777 } else { 2822 } else {
2778 set_extent_uptodate(tree, block_start, cur_end, 2823 struct extent_state *cached = NULL;
2824
2825 set_extent_uptodate(tree, block_start, cur_end, &cached,
2779 GFP_NOFS); 2826 GFP_NOFS);
2780 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2827 unlock_extent_cached(tree, block_start, cur_end,
2828 &cached, GFP_NOFS);
2781 block_start = cur_end + 1; 2829 block_start = cur_end + 1;
2782 } 2830 }
2783 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2831 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
@@ -2816,9 +2864,17 @@ int try_release_extent_state(struct extent_map_tree *map,
2816 * at this point we can safely clear everything except the 2864 * at this point we can safely clear everything except the
2817 * locked bit and the nodatasum bit 2865 * locked bit and the nodatasum bit
2818 */ 2866 */
2819 clear_extent_bit(tree, start, end, 2867 ret = clear_extent_bit(tree, start, end,
2820 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2868 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2821 0, 0, NULL, mask); 2869 0, 0, NULL, mask);
2870
2871 /* if clear_extent_bit failed for enomem reasons,
2872 * we can't allow the release to continue.
2873 */
2874 if (ret < 0)
2875 ret = 0;
2876 else
2877 ret = 1;
2822 } 2878 }
2823 return ret; 2879 return ret;
2824} 2880}
@@ -2898,6 +2954,46 @@ out:
2898 return sector; 2954 return sector;
2899} 2955}
2900 2956
2957/*
2958 * helper function for fiemap, which doesn't want to see any holes.
2959 * This maps until we find something past 'last'
2960 */
2961static struct extent_map *get_extent_skip_holes(struct inode *inode,
2962 u64 offset,
2963 u64 last,
2964 get_extent_t *get_extent)
2965{
2966 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2967 struct extent_map *em;
2968 u64 len;
2969
2970 if (offset >= last)
2971 return NULL;
2972
2973 while(1) {
2974 len = last - offset;
2975 if (len == 0)
2976 break;
2977 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2978 em = get_extent(inode, NULL, 0, offset, len, 0);
2979 if (!em || IS_ERR(em))
2980 return em;
2981
2982 /* if this isn't a hole return it */
2983 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2984 em->block_start != EXTENT_MAP_HOLE) {
2985 return em;
2986 }
2987
2988 /* this is a hole, advance to the next extent */
2989 offset = extent_map_end(em);
2990 free_extent_map(em);
2991 if (offset >= last)
2992 break;
2993 }
2994 return NULL;
2995}
2996
2901int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2997int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2902 __u64 start, __u64 len, get_extent_t *get_extent) 2998 __u64 start, __u64 len, get_extent_t *get_extent)
2903{ 2999{
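
get_extent_skip_holes() rounds the lookup length up to a sector multiple with the classic (len + sectorsize - 1) & ~(sectorsize - 1) trick, which is only valid for power-of-two sizes. Spelled out as a helper:

/* round n up to a multiple of align; align must be a power of two */
static unsigned long long round_up_pow2(unsigned long long n,
					unsigned long long align)
{
	return (n + align - 1) & ~(align - 1);
}

/* round_up_pow2(5000, 4096) == 8192, round_up_pow2(4096, 4096) == 4096 */
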
@@ -2907,16 +3003,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2907 u32 flags = 0; 3003 u32 flags = 0;
2908 u32 found_type; 3004 u32 found_type;
2909 u64 last; 3005 u64 last;
3006 u64 last_for_get_extent = 0;
2910 u64 disko = 0; 3007 u64 disko = 0;
3008 u64 isize = i_size_read(inode);
2911 struct btrfs_key found_key; 3009 struct btrfs_key found_key;
2912 struct extent_map *em = NULL; 3010 struct extent_map *em = NULL;
2913 struct extent_state *cached_state = NULL; 3011 struct extent_state *cached_state = NULL;
2914 struct btrfs_path *path; 3012 struct btrfs_path *path;
2915 struct btrfs_file_extent_item *item; 3013 struct btrfs_file_extent_item *item;
2916 int end = 0; 3014 int end = 0;
2917 u64 em_start = 0, em_len = 0; 3015 u64 em_start = 0;
3016 u64 em_len = 0;
3017 u64 em_end = 0;
2918 unsigned long emflags; 3018 unsigned long emflags;
2919 int hole = 0;
2920 3019
2921 if (len == 0) 3020 if (len == 0)
2922 return -EINVAL; 3021 return -EINVAL;
@@ -2926,6 +3025,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2926 return -ENOMEM; 3025 return -ENOMEM;
2927 path->leave_spinning = 1; 3026 path->leave_spinning = 1;
2928 3027
3028 /*
3029 * lookup the last file extent. We're not using i_size here
3030 * because there might be preallocation past i_size
3031 */
2929 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3032 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2930 path, inode->i_ino, -1, 0); 3033 path, inode->i_ino, -1, 0);
2931 if (ret < 0) { 3034 if (ret < 0) {
@@ -2939,18 +3042,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2939 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3042 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2940 found_type = btrfs_key_type(&found_key); 3043 found_type = btrfs_key_type(&found_key);
2941 3044
2942 /* No extents, just return */ 3045 /* No extents, but there might be delalloc bits */
2943 if (found_key.objectid != inode->i_ino || 3046 if (found_key.objectid != inode->i_ino ||
2944 found_type != BTRFS_EXTENT_DATA_KEY) { 3047 found_type != BTRFS_EXTENT_DATA_KEY) {
2945 btrfs_free_path(path); 3048 /* have to trust i_size as the end */
2946 return 0; 3049 last = (u64)-1;
3050 last_for_get_extent = isize;
3051 } else {
3052 /*
3053 * remember the start of the last extent. There are a
3054 * bunch of different factors that go into the length of the
3055 * extent, so its much less complex to remember where it started
3056 */
3057 last = found_key.offset;
3058 last_for_get_extent = last + 1;
2947 } 3059 }
2948 last = found_key.offset;
2949 btrfs_free_path(path); 3060 btrfs_free_path(path);
2950 3061
3062 /*
3063 * we might have some extents allocated but more delalloc past those
3064 * extents. so, we trust isize unless the start of the last extent is
3065 * beyond isize
3066 */
3067 if (last < isize) {
3068 last = (u64)-1;
3069 last_for_get_extent = isize;
3070 }
3071
2951 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3072 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2952 &cached_state, GFP_NOFS); 3073 &cached_state, GFP_NOFS);
2953 em = get_extent(inode, NULL, 0, off, max - off, 0); 3074
3075 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3076 get_extent);
2954 if (!em) 3077 if (!em)
2955 goto out; 3078 goto out;
2956 if (IS_ERR(em)) { 3079 if (IS_ERR(em)) {
@@ -2959,22 +3082,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2959 } 3082 }
2960 3083
2961 while (!end) { 3084 while (!end) {
2962 hole = 0; 3085 u64 offset_in_extent;
2963 off = em->start + em->len;
2964 if (off >= max)
2965 end = 1;
2966 3086
2967 if (em->block_start == EXTENT_MAP_HOLE) { 3087 /* break if the extent we found is outside the range */
2968 hole = 1; 3088 if (em->start >= max || extent_map_end(em) < off)
2969 goto next; 3089 break;
2970 }
2971 3090
2972 em_start = em->start; 3091 /*
2973 em_len = em->len; 3092 * get_extent may return an extent that starts before our
3093 * requested range. We have to make sure the ranges
3094 * we return to fiemap always move forward and don't
3095 * overlap, so adjust the offsets here
3096 */
3097 em_start = max(em->start, off);
2974 3098
3099 /*
3100 * record the offset from the start of the extent
3101 * for adjusting the disk offset below
3102 */
3103 offset_in_extent = em_start - em->start;
3104 em_end = extent_map_end(em);
3105 em_len = em_end - em_start;
3106 emflags = em->flags;
2975 disko = 0; 3107 disko = 0;
2976 flags = 0; 3108 flags = 0;
2977 3109
3110 /*
3111 * bump off for our next call to get_extent
3112 */
3113 off = extent_map_end(em);
3114 if (off >= max)
3115 end = 1;
3116
2978 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3117 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2979 end = 1; 3118 end = 1;
2980 flags |= FIEMAP_EXTENT_LAST; 3119 flags |= FIEMAP_EXTENT_LAST;
@@ -2985,42 +3124,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2985 flags |= (FIEMAP_EXTENT_DELALLOC | 3124 flags |= (FIEMAP_EXTENT_DELALLOC |
2986 FIEMAP_EXTENT_UNKNOWN); 3125 FIEMAP_EXTENT_UNKNOWN);
2987 } else { 3126 } else {
2988 disko = em->block_start; 3127 disko = em->block_start + offset_in_extent;
2989 } 3128 }
2990 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3129 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2991 flags |= FIEMAP_EXTENT_ENCODED; 3130 flags |= FIEMAP_EXTENT_ENCODED;
2992 3131
2993next:
2994 emflags = em->flags;
2995 free_extent_map(em); 3132 free_extent_map(em);
2996 em = NULL; 3133 em = NULL;
2997 if (!end) { 3134 if ((em_start >= last) || em_len == (u64)-1 ||
2998 em = get_extent(inode, NULL, 0, off, max - off, 0); 3135 (last == (u64)-1 && isize <= em_end)) {
2999 if (!em)
3000 goto out;
3001 if (IS_ERR(em)) {
3002 ret = PTR_ERR(em);
3003 goto out;
3004 }
3005 emflags = em->flags;
3006 }
3007
3008 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3009 flags |= FIEMAP_EXTENT_LAST; 3136 flags |= FIEMAP_EXTENT_LAST;
3010 end = 1; 3137 end = 1;
3011 } 3138 }
3012 3139
3013 if (em_start == last) { 3140 /* now scan forward to see if this is really the last extent. */
3141 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3142 get_extent);
3143 if (IS_ERR(em)) {
3144 ret = PTR_ERR(em);
3145 goto out;
3146 }
3147 if (!em) {
3014 flags |= FIEMAP_EXTENT_LAST; 3148 flags |= FIEMAP_EXTENT_LAST;
3015 end = 1; 3149 end = 1;
3016 } 3150 }
3017 3151 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3018 if (!hole) { 3152 em_len, flags);
3019 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3153 if (ret)
3020 em_len, flags); 3154 goto out_free;
3021 if (ret)
3022 goto out_free;
3023 }
3024 } 3155 }
3025out_free: 3156out_free:
3026 free_extent_map(em); 3157 free_extent_map(em);
@@ -3072,6 +3203,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3072#endif 3203#endif
3073 3204
3074 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3205 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3206 if (eb == NULL)
3207 return NULL;
3075 eb->start = start; 3208 eb->start = start;
3076 eb->len = len; 3209 eb->len = len;
3077 spin_lock_init(&eb->lock); 3210 spin_lock_init(&eb->lock);
@@ -3187,7 +3320,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3187 } 3320 }
3188 if (!PageUptodate(p)) 3321 if (!PageUptodate(p))
3189 uptodate = 0; 3322 uptodate = 0;
3190 unlock_page(p); 3323
3324 /*
3325 * see below about how we avoid a nasty race with release page
3326 * and why we unlock later
3327 */
3328 if (i != 0)
3329 unlock_page(p);
3191 } 3330 }
3192 if (uptodate) 3331 if (uptodate)
3193 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3332 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3211,9 +3350,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3211 atomic_inc(&eb->refs); 3350 atomic_inc(&eb->refs);
3212 spin_unlock(&tree->buffer_lock); 3351 spin_unlock(&tree->buffer_lock);
3213 radix_tree_preload_end(); 3352 radix_tree_preload_end();
3353
3354 /*
3355 * there is a race where release page may have
3356 * tried to find this extent buffer in the radix
3357 * but failed. It will tell the VM it is safe to
3358 * reclaim the, and it will clear the page private bit.
3359 * We must make sure to set the page private bit properly
3360 * after the extent buffer is in the radix tree so
3361 * it doesn't get lost
3362 */
3363 set_page_extent_mapped(eb->first_page);
3364 set_page_extent_head(eb->first_page, eb->len);
3365 if (!page0)
3366 unlock_page(eb->first_page);
3214 return eb; 3367 return eb;
3215 3368
3216free_eb: 3369free_eb:
3370 if (eb->first_page && !page0)
3371 unlock_page(eb->first_page);
3372
3217 if (!atomic_dec_and_test(&eb->refs)) 3373 if (!atomic_dec_and_test(&eb->refs))
3218 return exists; 3374 return exists;
3219 btrfs_release_extent_buffer(eb); 3375 btrfs_release_extent_buffer(eb);
@@ -3264,10 +3420,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3264 continue; 3420 continue;
3265 3421
3266 lock_page(page); 3422 lock_page(page);
3423 WARN_ON(!PagePrivate(page));
3424
3425 set_page_extent_mapped(page);
3267 if (i == 0) 3426 if (i == 0)
3268 set_page_extent_head(page, eb->len); 3427 set_page_extent_head(page, eb->len);
3269 else
3270 set_page_private(page, EXTENT_PAGE_PRIVATE);
3271 3428
3272 clear_page_dirty_for_io(page); 3429 clear_page_dirty_for_io(page);
3273 spin_lock_irq(&page->mapping->tree_lock); 3430 spin_lock_irq(&page->mapping->tree_lock);
@@ -3334,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3334 num_pages = num_extent_pages(eb->start, eb->len); 3491 num_pages = num_extent_pages(eb->start, eb->len);
3335 3492
3336 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3493 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3337 GFP_NOFS); 3494 NULL, GFP_NOFS);
3338 for (i = 0; i < num_pages; i++) { 3495 for (i = 0; i < num_pages; i++) {
3339 page = extent_buffer_page(eb, i); 3496 page = extent_buffer_page(eb, i);
3340 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3497 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3457,6 +3614,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3457 3614
3458 for (i = start_i; i < num_pages; i++) { 3615 for (i = start_i; i < num_pages; i++) {
3459 page = extent_buffer_page(eb, i); 3616 page = extent_buffer_page(eb, i);
3617
3618 WARN_ON(!PagePrivate(page));
3619
3620 set_page_extent_mapped(page);
3621 if (i == 0)
3622 set_page_extent_head(page, eb->len);
3623
3460 if (inc_all_pages) 3624 if (inc_all_pages)
3461 page_cache_get(page); 3625 page_cache_get(page);
3462 if (!PageUptodate(page)) { 3626 if (!PageUptodate(page)) {
@@ -3562,6 +3726,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3562 "wanted %lu %lu\n", (unsigned long long)eb->start, 3726 "wanted %lu %lu\n", (unsigned long long)eb->start,
3563 eb->len, start, min_len); 3727 eb->len, start, min_len);
3564 WARN_ON(1); 3728 WARN_ON(1);
3729 return -EINVAL;
3565 } 3730 }
3566 3731
3567 p = extent_buffer_page(eb, i); 3732 p = extent_buffer_page(eb, i);
@@ -3754,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3754 kunmap_atomic(dst_kaddr, KM_USER0); 3919 kunmap_atomic(dst_kaddr, KM_USER0);
3755} 3920}
3756 3921
3922static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3923{
3924 unsigned long distance = (src > dst) ? src - dst : dst - src;
3925 return distance < len;
3926}
3927
3757static void copy_pages(struct page *dst_page, struct page *src_page, 3928static void copy_pages(struct page *dst_page, struct page *src_page,
3758 unsigned long dst_off, unsigned long src_off, 3929 unsigned long dst_off, unsigned long src_off,
3759 unsigned long len) 3930 unsigned long len)
@@ -3761,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3761 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3932 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3762 char *src_kaddr; 3933 char *src_kaddr;
3763 3934
3764 if (dst_page != src_page) 3935 if (dst_page != src_page) {
3765 src_kaddr = kmap_atomic(src_page, KM_USER1); 3936 src_kaddr = kmap_atomic(src_page, KM_USER1);
3766 else 3937 } else {
3767 src_kaddr = dst_kaddr; 3938 src_kaddr = dst_kaddr;
3939 BUG_ON(areas_overlap(src_off, dst_off, len));
3940 }
3768 3941
3769 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3942 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3770 kunmap_atomic(dst_kaddr, KM_USER0); 3943 kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3839,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3839 "len %lu len %lu\n", dst_offset, len, dst->len); 4012 "len %lu len %lu\n", dst_offset, len, dst->len);
3840 BUG_ON(1); 4013 BUG_ON(1);
3841 } 4014 }
3842 if (dst_offset < src_offset) { 4015 if (!areas_overlap(src_offset, dst_offset, len)) {
3843 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4016 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3844 return; 4017 return;
3845 } 4018 }
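
For reference, the overlap test introduced above, as a minimal standalone C sketch (not the kernel code itself): memmove_extent_buffer previously picked the copy direction from dst_offset < src_offset alone, forcing the slow backwards move even for disjoint ranges; areas_overlap() lets disjoint ranges take the plain memcpy path.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* same idea as the kernel's areas_overlap(): two ranges of length len
 * starting at src and dst overlap iff their distance is < len */
static bool areas_overlap(unsigned long src, unsigned long dst,
			  unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;
	return distance < len;
}

int main(void)
{
	char buf[32] = "abcdefghij";

	if (!areas_overlap(0, 16, 10))	/* disjoint: plain memcpy is safe */
		memcpy(buf + 16, buf, 10);
	printf("%.10s\n", buf + 16);	/* prints abcdefghij */
	return 0;
}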
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4183c8178f01..af2d7179c372 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,13 +20,18 @@
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 22
23/* flags for bio submission */ 23/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
24#define EXTENT_BIO_COMPRESSED 1 27#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
25 29
26/* these are bit numbers for test/set bit */ 30/* these are bit numbers for test/set bit */
27#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
28#define EXTENT_BUFFER_BLOCKING 1 32#define EXTENT_BUFFER_BLOCKING 1
29#define EXTENT_BUFFER_DIRTY 2 33#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3
30 35
31/* these are flags for extent_clear_unlock_delalloc */ 36/* these are flags for extent_clear_unlock_delalloc */
32#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -135,6 +140,17 @@ struct extent_buffer {
135 wait_queue_head_t lock_wq; 140 wait_queue_head_t lock_wq;
136}; 141};
137 142
143static inline void extent_set_compress_type(unsigned long *bio_flags,
144 int compress_type)
145{
146 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
147}
148
149static inline int extent_compress_type(unsigned long bio_flags)
150{
151 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
152}
153
138struct extent_map_tree; 154struct extent_map_tree;
139 155
140static inline struct extent_state *extent_state_next(struct extent_state *state) 156static inline struct extent_state *extent_state_next(struct extent_state *state)
@@ -176,7 +192,7 @@ void extent_io_exit(void);
176 192
177u64 count_range_bits(struct extent_io_tree *tree, 193u64 count_range_bits(struct extent_io_tree *tree,
178 u64 *start, u64 search_end, 194 u64 *start, u64 search_end,
179 u64 max_bytes, unsigned long bits); 195 u64 max_bytes, unsigned long bits, int contig);
180 196
181void free_extent_state(struct extent_state *state); 197void free_extent_state(struct extent_state *state);
182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 198int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start, 208 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask); 209 struct extent_state **cached_state, gfp_t mask);
194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 210int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
195 gfp_t mask); 211 struct extent_state **cached_state, gfp_t mask);
196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 212int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
197 gfp_t mask); 213 gfp_t mask);
198int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 214int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
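
The new inline helpers pack the compression type into the bits of bio_flags above EXTENT_BIO_FLAG_SHIFT, leaving the low bits for the existing flags. A standalone sketch of the same packing scheme; the BTRFS_COMPRESS_* values are shown for illustration and are assumptions, the authoritative definitions live in ctree.h:

#include <stdio.h>

#define EXTENT_BIO_COMPRESSED	1
#define EXTENT_BIO_FLAG_SHIFT	16

/* illustrative values; see ctree.h in this series for the real ones */
#define BTRFS_COMPRESS_NONE	0
#define BTRFS_COMPRESS_ZLIB	1
#define BTRFS_COMPRESS_LZO	2

static void extent_set_compress_type(unsigned long *bio_flags,
				     int compress_type)
{
	*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
}

static int extent_compress_type(unsigned long bio_flags)
{
	return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
}

int main(void)
{
	unsigned long bio_flags = EXTENT_BIO_COMPRESSED;

	extent_set_compress_type(&bio_flags, BTRFS_COMPRESS_LZO);
	/* low bits keep the old flags, high bits carry the type */
	printf("compressed=%lu type=%d\n",
	       bio_flags & ((1UL << EXTENT_BIO_FLAG_SHIFT) - 1),
	       extent_compress_type(bio_flags));
	return 0;
}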
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..a24a3f2fa13e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/hardirq.h> 5#include <linux/hardirq.h>
6#include "ctree.h"
6#include "extent_map.h" 7#include "extent_map.h"
7 8
8 9
@@ -50,10 +51,11 @@ struct extent_map *alloc_extent_map(gfp_t mask)
50{ 51{
51 struct extent_map *em; 52 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, mask); 53 em = kmem_cache_alloc(extent_map_cache, mask);
53 if (!em || IS_ERR(em)) 54 if (!em)
54 return em; 55 return NULL;
55 em->in_tree = 0; 56 em->in_tree = 0;
56 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
58 return em; 60 return em;
59} 61}
@@ -241,7 +243,7 @@ out:
241 * Insert @em into @tree or perform a simple forward/backward merge with 243 * Insert @em into @tree or perform a simple forward/backward merge with
242 * existing mappings. The extent_map struct passed in will be inserted 244 * existing mappings. The extent_map struct passed in will be inserted
243 * into the tree directly, with an additional reference taken, or a 245 * into the tree directly, with an additional reference taken, or a
244 * reference dropped if the merge attempt was successfull. 246 * reference dropped if the merge attempt was successful.
245 */ 247 */
246int add_extent_mapping(struct extent_map_tree *tree, 248int add_extent_mapping(struct extent_map_tree *tree,
247 struct extent_map *em) 249 struct extent_map *em)
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 unsigned long flags; 26 unsigned long flags;
27 struct block_device *bdev; 27 struct block_device *bdev;
28 atomic_t refs; 28 atomic_t refs;
29 int in_tree; 29 unsigned int in_tree:1;
30 unsigned int compress_type:4;
30}; 31};
31 32
32struct extent_map_tree { 33struct extent_map_tree {
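
Shrinking in_tree to a one-bit field makes room for the new 4-bit compress_type without growing struct extent_map, since both bitfields share one unsigned int of storage. A quick userspace check of the idea (field layout only, not the real struct):

#include <stdio.h>

struct em_old { int in_tree; };
struct em_new { unsigned int in_tree:1; unsigned int compress_type:4; };

int main(void)
{
	/* both print the same size on common ABIs: no growth */
	printf("%zu %zu\n", sizeof(struct em_old), sizeof(struct em_new));
	return 0;
}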
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..a6a9d4e8b491 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
48 struct extent_buffer *leaf; 48 struct extent_buffer *leaf;
49 49
50 path = btrfs_alloc_path(); 50 path = btrfs_alloc_path();
51 BUG_ON(!path); 51 if (!path)
52 return -ENOMEM;
52 file_key.objectid = objectid; 53 file_key.objectid = objectid;
53 file_key.offset = pos; 54 file_key.offset = pos;
54 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 55 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
169 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
170 171
171 path = btrfs_alloc_path(); 172 path = btrfs_alloc_path();
173 if (!path)
174 return -ENOMEM;
172 if (bio->bi_size > PAGE_CACHE_SIZE * 8) 175 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
173 path->reada = 2; 176 path->reada = 2;
174 177
@@ -536,6 +539,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
536 root = root->fs_info->csum_root; 539 root = root->fs_info->csum_root;
537 540
538 path = btrfs_alloc_path(); 541 path = btrfs_alloc_path();
542 if (!path)
543 return -ENOMEM;
539 544
540 while (1) { 545 while (1) {
541 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 546 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -548,7 +553,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
548 if (path->slots[0] == 0) 553 if (path->slots[0] == 0)
549 goto out; 554 goto out;
550 path->slots[0]--; 555 path->slots[0]--;
556 } else if (ret < 0) {
557 goto out;
551 } 558 }
559
552 leaf = path->nodes[0]; 560 leaf = path->nodes[0];
553 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 561 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
554 562
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..75899a01dded 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -44,14 +45,14 @@
44 * and be replaced with calls into generic code. 45 * and be replaced with calls into generic code.
45 */ 46 */
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 48 size_t write_bytes,
48 struct page **prepared_pages, 49 struct page **prepared_pages,
49 struct iov_iter *i) 50 struct iov_iter *i)
50{ 51{
51 size_t copied = 0; 52 size_t copied = 0;
53 size_t total_copied = 0;
52 int pg = 0; 54 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 55 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 int total_copied = 0;
55 56
56 while (write_bytes > 0) { 57 while (write_bytes > 0) {
57 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
@@ -69,14 +70,26 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
69 70
70 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
71 flush_dcache_page(page); 72 flush_dcache_page(page);
73
74 /*
75 * if we get a partial write, we can end up with
76 * partially up to date pages. These add
77 * a lot of complexity, so make sure they don't
78 * happen by forcing this copy to be retried.
79 *
80 * The rest of the btrfs_file_write code will fall
81 * back to page at a time copies after we return 0.
82 */
83 if (!PageUptodate(page) && copied < count)
84 copied = 0;
85
72 iov_iter_advance(i, copied); 86 iov_iter_advance(i, copied);
73 write_bytes -= copied; 87 write_bytes -= copied;
74 total_copied += copied; 88 total_copied += copied;
75 89
76 /* Return to btrfs_file_aio_write to fault page */ 90 /* Return to btrfs_file_aio_write to fault page */
77 if (unlikely(copied == 0)) { 91 if (unlikely(copied == 0))
78 break; 92 break;
79 }
80 93
81 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 94 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
82 offset += copied; 95 offset += copied;
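
The rule added above: if the destination page was not up to date and the copy came up short, report zero copied so the caller retries (and, per the later hunk, falls back to one page at a time). A toy model of just that decision, outside the kernel:

#include <stdbool.h>
#include <stdio.h>

/* toy model: decide how much of a user copy to accept */
static size_t accept_copied(bool page_uptodate, size_t copied, size_t count)
{
	/*
	 * A partial copy into a page that is not up to date would leave
	 * a partially up-to-date page; force a full retry instead.
	 */
	if (!page_uptodate && copied < count)
		return 0;
	return copied;
}

int main(void)
{
	printf("%zu\n", accept_copied(false, 100, 4096)); /* 0: retry */
	printf("%zu\n", accept_copied(true, 100, 4096));  /* 100: keep */
	return 0;
}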
@@ -91,12 +104,10 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
91/* 104/*
92 * unlocks pages after btrfs_file_write is done with them 105 * unlocks pages after btrfs_file_write is done with them
93 */ 106 */
94static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) 107void btrfs_drop_pages(struct page **pages, size_t num_pages)
95{ 108{
96 size_t i; 109 size_t i;
97 for (i = 0; i < num_pages; i++) { 110 for (i = 0; i < num_pages; i++) {
98 if (!pages[i])
99 break;
100 /* page checked is some magic around finding pages that 111 /* page checked is some magic around finding pages that
101 * have been modified without going through btrfs_set_page_dirty 112 * have been modified without going through btrfs_set_page_dirty
102 * clear it here 113 * clear it here
@@ -116,17 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
116 * this also makes the decision about creating an inline extent vs 127 * this also makes the decision about creating an inline extent vs
117 * doing real data extents, marking pages dirty and delalloc as required. 128 * doing real data extents, marking pages dirty and delalloc as required.
118 */ 129 */
119static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, 130int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
120 struct btrfs_root *root, 131 struct page **pages, size_t num_pages,
121 struct file *file, 132 loff_t pos, size_t write_bytes,
122 struct page **pages, 133 struct extent_state **cached)
123 size_t num_pages,
124 loff_t pos,
125 size_t write_bytes)
126{ 134{
127 int err = 0; 135 int err = 0;
128 int i; 136 int i;
129 struct inode *inode = fdentry(file)->d_inode;
130 u64 num_bytes; 137 u64 num_bytes;
131 u64 start_pos; 138 u64 start_pos;
132 u64 end_of_last_block; 139 u64 end_of_last_block;
@@ -139,8 +146,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
139 146
140 end_of_last_block = start_pos + num_bytes - 1; 147 end_of_last_block = start_pos + num_bytes - 1;
141 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 148 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
142 NULL); 149 cached);
143 BUG_ON(err); 150 if (err)
151 return err;
144 152
145 for (i = 0; i < num_pages; i++) { 153 for (i = 0; i < num_pages; i++) {
146 struct page *p = pages[i]; 154 struct page *p = pages[i];
@@ -148,13 +156,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
148 ClearPageChecked(p); 156 ClearPageChecked(p);
149 set_page_dirty(p); 157 set_page_dirty(p);
150 } 158 }
151 if (end_pos > isize) { 159
160 /*
161 * we've only changed i_size in ram, and we haven't updated
162 * the disk i_size. There is no need to log the inode
163 * at this time.
164 */
165 if (end_pos > isize)
152 i_size_write(inode, end_pos); 166 i_size_write(inode, end_pos);
153 /* we've only changed i_size in ram, and we haven't updated
154 * the disk i_size. There is no need to log the inode
155 * at this time.
156 */
157 }
158 return 0; 167 return 0;
159} 168}
160 169
@@ -185,6 +194,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
185 split = alloc_extent_map(GFP_NOFS); 194 split = alloc_extent_map(GFP_NOFS);
186 if (!split2) 195 if (!split2)
187 split2 = alloc_extent_map(GFP_NOFS); 196 split2 = alloc_extent_map(GFP_NOFS);
197 BUG_ON(!split || !split2);
188 198
189 write_lock(&em_tree->lock); 199 write_lock(&em_tree->lock);
190 em = lookup_extent_mapping(em_tree, start, len); 200 em = lookup_extent_mapping(em_tree, start, len);
@@ -224,6 +234,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
224 234
225 split->bdev = em->bdev; 235 split->bdev = em->bdev;
226 split->flags = flags; 236 split->flags = flags;
237 split->compress_type = em->compress_type;
227 ret = add_extent_mapping(em_tree, split); 238 ret = add_extent_mapping(em_tree, split);
228 BUG_ON(ret); 239 BUG_ON(ret);
229 free_extent_map(split); 240 free_extent_map(split);
@@ -238,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
238 split->len = em->start + em->len - (start + len); 249 split->len = em->start + em->len - (start + len);
239 split->bdev = em->bdev; 250 split->bdev = em->bdev;
240 split->flags = flags; 251 split->flags = flags;
252 split->compress_type = em->compress_type;
241 253
242 if (compressed) { 254 if (compressed) {
243 split->block_len = em->block_len; 255 split->block_len = em->block_len;
@@ -593,6 +605,8 @@ again:
593 key.offset = split; 605 key.offset = split;
594 606
595 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 607 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
608 if (ret < 0)
609 goto out;
596 if (ret > 0 && path->slots[0] > 0) 610 if (ret > 0 && path->slots[0] > 0)
597 path->slots[0]--; 611 path->slots[0]--;
598 612
@@ -759,6 +773,27 @@ out:
759} 773}
760 774
761/* 775/*
776 * on error we return an unlocked page and the error value
777 * on success we return a locked page and 0
778 */
779static int prepare_uptodate_page(struct page *page, u64 pos)
780{
781 int ret = 0;
782
783 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
784 ret = btrfs_readpage(NULL, page);
785 if (ret)
786 return ret;
787 lock_page(page);
788 if (!PageUptodate(page)) {
789 unlock_page(page);
790 return -EIO;
791 }
792 }
793 return 0;
794}
795
796/*
762 * this gets pages into the page cache and locks them down, it also properly 797 * this gets pages into the page cache and locks them down, it also properly
763 * waits for data=ordered extents to finish before allowing the pages to be 798 * waits for data=ordered extents to finish before allowing the pages to be
764 * modified. 799 * modified.
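
prepare_uptodate_page() only reads a page in when the write starts or ends partway through it; a page that will be fully overwritten skips the read. The trigger condition, sketched standalone with an assumed 4KiB page:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* does a write boundary at pos need the old page contents first? */
static bool needs_readpage(unsigned long long pos, bool uptodate)
{
	return (pos & (PAGE_SIZE - 1)) && !uptodate;
}

int main(void)
{
	printf("%d\n", needs_readpage(8192, false)); /* aligned: 0 */
	printf("%d\n", needs_readpage(8200, false)); /* partial: 1 */
	return 0;
}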
@@ -773,6 +808,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
773 unsigned long index = pos >> PAGE_CACHE_SHIFT; 808 unsigned long index = pos >> PAGE_CACHE_SHIFT;
774 struct inode *inode = fdentry(file)->d_inode; 809 struct inode *inode = fdentry(file)->d_inode;
775 int err = 0; 810 int err = 0;
811 int faili = 0;
776 u64 start_pos; 812 u64 start_pos;
777 u64 last_pos; 813 u64 last_pos;
778 814
@@ -780,21 +816,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
780 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 816 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
781 817
782 if (start_pos > inode->i_size) { 818 if (start_pos > inode->i_size) {
783 err = btrfs_cont_expand(inode, start_pos); 819 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
784 if (err) 820 if (err)
785 return err; 821 return err;
786 } 822 }
787 823
788 memset(pages, 0, num_pages * sizeof(struct page *));
789again: 824again:
790 for (i = 0; i < num_pages; i++) { 825 for (i = 0; i < num_pages; i++) {
791 pages[i] = grab_cache_page(inode->i_mapping, index + i); 826 pages[i] = grab_cache_page(inode->i_mapping, index + i);
792 if (!pages[i]) { 827 if (!pages[i]) {
828 faili = i - 1;
793 err = -ENOMEM; 829 err = -ENOMEM;
794 BUG_ON(1); 830 goto fail;
831 }
832
833 if (i == 0)
834 err = prepare_uptodate_page(pages[i], pos);
835 if (i == num_pages - 1)
836 err = prepare_uptodate_page(pages[i],
837 pos + write_bytes);
838 if (err) {
839 page_cache_release(pages[i]);
840 faili = i - 1;
841 goto fail;
795 } 842 }
796 wait_on_page_writeback(pages[i]); 843 wait_on_page_writeback(pages[i]);
797 } 844 }
845 err = 0;
798 if (start_pos < inode->i_size) { 846 if (start_pos < inode->i_size) {
799 struct btrfs_ordered_extent *ordered; 847 struct btrfs_ordered_extent *ordered;
800 lock_extent_bits(&BTRFS_I(inode)->io_tree, 848 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -834,176 +882,103 @@ again:
834 WARN_ON(!PageLocked(pages[i])); 882 WARN_ON(!PageLocked(pages[i]));
835 } 883 }
836 return 0; 884 return 0;
885fail:
886 while (faili >= 0) {
887 unlock_page(pages[faili]);
888 page_cache_release(pages[faili]);
889 faili--;
890 }
891 return err;
892
837} 893}
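
The new fail: path unwinds exactly the pages that were grabbed, walking faili back to zero, replacing the old BUG_ON(1) on allocation failure. The same reverse-unwind pattern in plain C (the resources here are malloc stubs, purely illustrative):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *res[8];
	int i, faili = -1;

	for (i = 0; i < 8; i++) {
		res[i] = (i == 5) ? NULL : malloc(16); /* simulate failure */
		if (!res[i]) {
			faili = i - 1;	/* last successfully grabbed slot */
			goto fail;
		}
	}
	faili = 7;	/* success: everything gets released below too */
fail:
	while (faili >= 0)	/* release in reverse, nothing leaks */
		free(res[faili--]);
	return 0;
}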
838 894
839static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 895static noinline ssize_t __btrfs_buffered_write(struct file *file,
840 const struct iovec *iov, 896 struct iov_iter *i,
841 unsigned long nr_segs, loff_t pos) 897 loff_t pos)
842{ 898{
843 struct file *file = iocb->ki_filp;
844 struct inode *inode = fdentry(file)->d_inode; 899 struct inode *inode = fdentry(file)->d_inode;
845 struct btrfs_root *root = BTRFS_I(inode)->root; 900 struct btrfs_root *root = BTRFS_I(inode)->root;
846 struct page *pinned[2];
847 struct page **pages = NULL; 901 struct page **pages = NULL;
848 struct iov_iter i;
849 loff_t *ppos = &iocb->ki_pos;
850 loff_t start_pos;
851 ssize_t num_written = 0;
852 ssize_t err = 0;
853 size_t count;
854 size_t ocount;
855 int ret = 0;
856 int nrptrs;
857 unsigned long first_index; 902 unsigned long first_index;
858 unsigned long last_index; 903 unsigned long last_index;
859 int will_write; 904 size_t num_written = 0;
860 int buffered = 0; 905 int nrptrs;
861 int copied = 0; 906 int ret = 0;
862 int dirty_pages = 0;
863
864 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
865 (file->f_flags & O_DIRECT));
866
867 pinned[0] = NULL;
868 pinned[1] = NULL;
869
870 start_pos = pos;
871
872 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
873
874 mutex_lock(&inode->i_mutex);
875
876 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
877 if (err)
878 goto out;
879 count = ocount;
880
881 current->backing_dev_info = inode->i_mapping->backing_dev_info;
882 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
883 if (err)
884 goto out;
885
886 if (count == 0)
887 goto out;
888
889 err = file_remove_suid(file);
890 if (err)
891 goto out;
892
893 file_update_time(file);
894 BTRFS_I(inode)->sequence++;
895
896 if (unlikely(file->f_flags & O_DIRECT)) {
897 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
898 pos, ppos, count,
899 ocount);
900 /*
901 * the generic O_DIRECT will update in-memory i_size after the
902 * DIOs are done. But our endio handlers that update the on
903 * disk i_size never update past the in memory i_size. So we
904 * need one more update here to catch any additions to the
905 * file
906 */
907 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
908 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
909 mark_inode_dirty(inode);
910 }
911
912 if (num_written < 0) {
913 ret = num_written;
914 num_written = 0;
915 goto out;
916 } else if (num_written == count) {
917 /* pick up pos changes done by the generic code */
918 pos = *ppos;
919 goto out;
920 }
921 /*
922 * We are going to do buffered for the rest of the range, so we
923 * need to make sure to invalidate the buffered pages when we're
924 * done.
925 */
926 buffered = 1;
927 pos += num_written;
928 }
929 907
930 iov_iter_init(&i, iov, nr_segs, count, num_written); 908 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
931 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
932 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 909 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
933 (sizeof(struct page *))); 910 (sizeof(struct page *)));
934 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 911 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
935 912 if (!pages)
936 /* generic_write_checks can change our pos */ 913 return -ENOMEM;
937 start_pos = pos;
938 914
939 first_index = pos >> PAGE_CACHE_SHIFT; 915 first_index = pos >> PAGE_CACHE_SHIFT;
940 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 916 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
941
942 /*
943 * there are lots of better ways to do this, but this code
944 * makes sure the first and last page in the file range are
945 * up to date and ready for cow
946 */
947 if ((pos & (PAGE_CACHE_SIZE - 1))) {
948 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
949 if (!PageUptodate(pinned[0])) {
950 ret = btrfs_readpage(NULL, pinned[0]);
951 BUG_ON(ret);
952 wait_on_page_locked(pinned[0]);
953 } else {
954 unlock_page(pinned[0]);
955 }
956 }
957 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
958 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
959 if (!PageUptodate(pinned[1])) {
960 ret = btrfs_readpage(NULL, pinned[1]);
961 BUG_ON(ret);
962 wait_on_page_locked(pinned[1]);
963 } else {
964 unlock_page(pinned[1]);
965 }
966 }
967 917
968 while (iov_iter_count(&i) > 0) { 918 while (iov_iter_count(i) > 0) {
969 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 919 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
970 size_t write_bytes = min(iov_iter_count(&i), 920 size_t write_bytes = min(iov_iter_count(i),
971 nrptrs * (size_t)PAGE_CACHE_SIZE - 921 nrptrs * (size_t)PAGE_CACHE_SIZE -
972 offset); 922 offset);
973 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 923 size_t num_pages = (write_bytes + offset +
974 PAGE_CACHE_SHIFT; 924 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
925 size_t dirty_pages;
926 size_t copied;
975 927
976 WARN_ON(num_pages > nrptrs); 928 WARN_ON(num_pages > nrptrs);
977 memset(pages, 0, sizeof(struct page *) * nrptrs);
978 929
979 /* 930 /*
980 * Fault pages before locking them in prepare_pages 931 * Fault pages before locking them in prepare_pages
981 * to avoid recursive lock 932 * to avoid recursive lock
982 */ 933 */
983 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { 934 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
984 ret = -EFAULT; 935 ret = -EFAULT;
985 goto out; 936 break;
986 } 937 }
987 938
988 ret = btrfs_delalloc_reserve_space(inode, 939 ret = btrfs_delalloc_reserve_space(inode,
989 num_pages << PAGE_CACHE_SHIFT); 940 num_pages << PAGE_CACHE_SHIFT);
990 if (ret) 941 if (ret)
991 goto out; 942 break;
992 943
944 /*
945 * This is going to setup the pages array with the number of
946 * pages we want, so we don't really need to worry about the
947 * contents of pages from loop to loop
948 */
993 ret = prepare_pages(root, file, pages, num_pages, 949 ret = prepare_pages(root, file, pages, num_pages,
994 pos, first_index, last_index, 950 pos, first_index, last_index,
995 write_bytes); 951 write_bytes);
996 if (ret) { 952 if (ret) {
997 btrfs_delalloc_release_space(inode, 953 btrfs_delalloc_release_space(inode,
998 num_pages << PAGE_CACHE_SHIFT); 954 num_pages << PAGE_CACHE_SHIFT);
999 goto out; 955 break;
1000 } 956 }
1001 957
1002 copied = btrfs_copy_from_user(pos, num_pages, 958 copied = btrfs_copy_from_user(pos, num_pages,
1003 write_bytes, pages, &i); 959 write_bytes, pages, i);
1004 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
1005 PAGE_CACHE_SHIFT;
1006 960
961 /*
962 * if we have trouble faulting in the pages, fall
963 * back to one page at a time
964 */
965 if (copied < write_bytes)
966 nrptrs = 1;
967
968 if (copied == 0)
969 dirty_pages = 0;
970 else
971 dirty_pages = (copied + offset +
972 PAGE_CACHE_SIZE - 1) >>
973 PAGE_CACHE_SHIFT;
974
975 /*
 976 * If we had a short copy we need to release the excess delalloc
977 * bytes we reserved. We need to increment outstanding_extents
978 * because btrfs_delalloc_release_space will decrement it, but
979 * we still have an outstanding extent for the chunk we actually
980 * managed to copy.
981 */
1007 if (num_pages > dirty_pages) { 982 if (num_pages > dirty_pages) {
1008 if (copied > 0) 983 if (copied > 0)
1009 atomic_inc( 984 atomic_inc(
@@ -1014,43 +989,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1014 } 989 }
1015 990
1016 if (copied > 0) { 991 if (copied > 0) {
1017 dirty_and_release_pages(NULL, root, file, pages, 992 ret = btrfs_dirty_pages(root, inode, pages,
1018 dirty_pages, pos, copied); 993 dirty_pages, pos, copied,
994 NULL);
995 if (ret) {
996 btrfs_delalloc_release_space(inode,
997 dirty_pages << PAGE_CACHE_SHIFT);
998 btrfs_drop_pages(pages, num_pages);
999 break;
1000 }
1019 } 1001 }
1020 1002
1021 btrfs_drop_pages(pages, num_pages); 1003 btrfs_drop_pages(pages, num_pages);
1022 1004
1023 if (copied > 0) { 1005 cond_resched();
1024 if (will_write) { 1006
1025 filemap_fdatawrite_range(inode->i_mapping, pos, 1007 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1026 pos + copied - 1); 1008 dirty_pages);
1027 } else { 1009 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1028 balance_dirty_pages_ratelimited_nr( 1010 btrfs_btree_balance_dirty(root, 1);
1029 inode->i_mapping, 1011 btrfs_throttle(root);
1030 dirty_pages);
1031 if (dirty_pages <
1032 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1033 btrfs_btree_balance_dirty(root, 1);
1034 btrfs_throttle(root);
1035 }
1036 }
1037 1012
1038 pos += copied; 1013 pos += copied;
1039 num_written += copied; 1014 num_written += copied;
1015 }
1040 1016
1041 cond_resched(); 1017 kfree(pages);
1018
1019 return num_written ? num_written : ret;
1020}
1021
1022static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1023 const struct iovec *iov,
1024 unsigned long nr_segs, loff_t pos,
1025 loff_t *ppos, size_t count, size_t ocount)
1026{
1027 struct file *file = iocb->ki_filp;
1028 struct inode *inode = fdentry(file)->d_inode;
1029 struct iov_iter i;
1030 ssize_t written;
1031 ssize_t written_buffered;
1032 loff_t endbyte;
1033 int err;
1034
1035 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1036 count, ocount);
1037
1038 /*
1039 * the generic O_DIRECT will update in-memory i_size after the
1040 * DIOs are done. But our endio handlers that update the on
1041 * disk i_size never update past the in memory i_size. So we
1042 * need one more update here to catch any additions to the
1043 * file
1044 */
1045 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
1046 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
1047 mark_inode_dirty(inode);
1048 }
1049
1050 if (written < 0 || written == count)
1051 return written;
1052
1053 pos += written;
1054 count -= written;
1055 iov_iter_init(&i, iov, nr_segs, count, written);
1056 written_buffered = __btrfs_buffered_write(file, &i, pos);
1057 if (written_buffered < 0) {
1058 err = written_buffered;
1059 goto out;
1042 } 1060 }
1061 endbyte = pos + written_buffered - 1;
1062 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1063 if (err)
1064 goto out;
1065 written += written_buffered;
1066 *ppos = pos + written_buffered;
1067 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1068 endbyte >> PAGE_CACHE_SHIFT);
1043out: 1069out:
1044 mutex_unlock(&inode->i_mutex); 1070 return written ? written : err;
1045 if (ret) 1071}
1046 err = ret;
1047 1072
1048 kfree(pages); 1073static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1049 if (pinned[0]) 1074 const struct iovec *iov,
1050 page_cache_release(pinned[0]); 1075 unsigned long nr_segs, loff_t pos)
1051 if (pinned[1]) 1076{
1052 page_cache_release(pinned[1]); 1077 struct file *file = iocb->ki_filp;
1053 *ppos = pos; 1078 struct inode *inode = fdentry(file)->d_inode;
1079 struct btrfs_root *root = BTRFS_I(inode)->root;
1080 loff_t *ppos = &iocb->ki_pos;
1081 ssize_t num_written = 0;
1082 ssize_t err = 0;
1083 size_t count, ocount;
1084
1085 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1086
1087 mutex_lock(&inode->i_mutex);
1088
1089 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1090 if (err) {
1091 mutex_unlock(&inode->i_mutex);
1092 goto out;
1093 }
1094 count = ocount;
1095
1096 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1098 if (err) {
1099 mutex_unlock(&inode->i_mutex);
1100 goto out;
1101 }
1102
1103 if (count == 0) {
1104 mutex_unlock(&inode->i_mutex);
1105 goto out;
1106 }
1107
1108 err = file_remove_suid(file);
1109 if (err) {
1110 mutex_unlock(&inode->i_mutex);
1111 goto out;
1112 }
1113
1114 /*
1115 * If BTRFS flips readonly due to some impossible error
1116 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1117 * although we have opened a file as writable, we have
1118 * to stop this write operation to ensure FS consistency.
1119 */
1120 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1121 mutex_unlock(&inode->i_mutex);
1122 err = -EROFS;
1123 goto out;
1124 }
1125
1126 file_update_time(file);
1127 BTRFS_I(inode)->sequence++;
1128
1129 if (unlikely(file->f_flags & O_DIRECT)) {
1130 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1131 pos, ppos, count, ocount);
1132 } else {
1133 struct iov_iter i;
1134
1135 iov_iter_init(&i, iov, nr_segs, count, num_written);
1136
1137 num_written = __btrfs_buffered_write(file, &i, pos);
1138 if (num_written > 0)
1139 *ppos = pos + num_written;
1140 }
1141
1142 mutex_unlock(&inode->i_mutex);
1054 1143
1055 /* 1144 /*
1056 * we want to make sure fsync finds this change 1145 * we want to make sure fsync finds this change
@@ -1065,43 +1154,12 @@ out:
1065 * one running right now. 1154 * one running right now.
1066 */ 1155 */
1067 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1156 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1068 1157 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1069 if (num_written > 0 && will_write) { 1158 err = generic_write_sync(file, pos, num_written);
1070 struct btrfs_trans_handle *trans; 1159 if (err < 0 && num_written > 0)
1071
1072 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1073 if (err)
1074 num_written = err; 1160 num_written = err;
1075
1076 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1077 trans = btrfs_start_transaction(root, 0);
1078 if (IS_ERR(trans)) {
1079 num_written = PTR_ERR(trans);
1080 goto done;
1081 }
1082 mutex_lock(&inode->i_mutex);
1083 ret = btrfs_log_dentry_safe(trans, root,
1084 file->f_dentry);
1085 mutex_unlock(&inode->i_mutex);
1086 if (ret == 0) {
1087 ret = btrfs_sync_log(trans, root);
1088 if (ret == 0)
1089 btrfs_end_transaction(trans, root);
1090 else
1091 btrfs_commit_transaction(trans, root);
1092 } else if (ret != BTRFS_NO_LOG_SYNC) {
1093 btrfs_commit_transaction(trans, root);
1094 } else {
1095 btrfs_end_transaction(trans, root);
1096 }
1097 }
1098 if (file->f_flags & O_DIRECT && buffered) {
1099 invalidate_mapping_pages(inode->i_mapping,
1100 start_pos >> PAGE_CACHE_SHIFT,
1101 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1102 }
1103 } 1161 }
1104done: 1162out:
1105 current->backing_dev_info = NULL; 1163 current->backing_dev_info = NULL;
1106 return num_written ? num_written : err; 1164 return num_written ? num_written : err;
1107} 1165}
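
__btrfs_direct_write() above follows the generic pattern: when O_DIRECT stops short, it finishes the tail through the buffered path, then writes that range back and invalidates the now-stale page cache over exactly [pos, endbyte]. The range bookkeeping, sketched with assumed example numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long pos = 0, count = 1 << 20;	/* 1MiB write */
	long long written = 640 * 1024;			/* DIO stopped short */
	long long written_buffered;

	pos += written;
	count -= written;
	written_buffered = count;	/* assume the buffered tail succeeds */

	/* flush + invalidate exactly this range, as the kernel code does */
	printf("buffered range: %llu..%llu\n",
	       pos, pos + written_buffered - 1);
	printf("total written: %lld\n", written + written_buffered);
	return 0;
}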
@@ -1144,6 +1202,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1144 int ret = 0; 1202 int ret = 0;
1145 struct btrfs_trans_handle *trans; 1203 struct btrfs_trans_handle *trans;
1146 1204
1205 trace_btrfs_sync_file(file, datasync);
1147 1206
1148 /* we wait first, since the writeback may change the inode */ 1207 /* we wait first, since the writeback may change the inode */
1149 root->log_batch++; 1208 root->log_batch++;
@@ -1237,6 +1296,118 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1237 return 0; 1296 return 0;
1238} 1297}
1239 1298
1299static long btrfs_fallocate(struct file *file, int mode,
1300 loff_t offset, loff_t len)
1301{
1302 struct inode *inode = file->f_path.dentry->d_inode;
1303 struct extent_state *cached_state = NULL;
1304 u64 cur_offset;
1305 u64 last_byte;
1306 u64 alloc_start;
1307 u64 alloc_end;
1308 u64 alloc_hint = 0;
1309 u64 locked_end;
1310 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1311 struct extent_map *em;
1312 int ret;
1313
1314 alloc_start = offset & ~mask;
1315 alloc_end = (offset + len + mask) & ~mask;
1316
1317 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1318 if (mode & ~FALLOC_FL_KEEP_SIZE)
1319 return -EOPNOTSUPP;
1320
1321 /*
1322 * wait for ordered IO before we have any locks. We'll loop again
1323 * below with the locks held.
1324 */
1325 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1326
1327 mutex_lock(&inode->i_mutex);
1328 ret = inode_newsize_ok(inode, alloc_end);
1329 if (ret)
1330 goto out;
1331
1332 if (alloc_start > inode->i_size) {
1333 ret = btrfs_cont_expand(inode, i_size_read(inode),
1334 alloc_start);
1335 if (ret)
1336 goto out;
1337 }
1338
1339 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1340 if (ret)
1341 goto out;
1342
1343 locked_end = alloc_end - 1;
1344 while (1) {
1345 struct btrfs_ordered_extent *ordered;
1346
1347 /* the extent lock is ordered inside the running
1348 * transaction
1349 */
1350 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1351 locked_end, 0, &cached_state, GFP_NOFS);
1352 ordered = btrfs_lookup_first_ordered_extent(inode,
1353 alloc_end - 1);
1354 if (ordered &&
1355 ordered->file_offset + ordered->len > alloc_start &&
1356 ordered->file_offset < alloc_end) {
1357 btrfs_put_ordered_extent(ordered);
1358 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1359 alloc_start, locked_end,
1360 &cached_state, GFP_NOFS);
1361 /*
1362 * we can't wait on the range with the transaction
1363 * running or with the extent lock held
1364 */
1365 btrfs_wait_ordered_range(inode, alloc_start,
1366 alloc_end - alloc_start);
1367 } else {
1368 if (ordered)
1369 btrfs_put_ordered_extent(ordered);
1370 break;
1371 }
1372 }
1373
1374 cur_offset = alloc_start;
1375 while (1) {
1376 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1377 alloc_end - cur_offset, 0);
1378 BUG_ON(IS_ERR(em) || !em);
1379 last_byte = min(extent_map_end(em), alloc_end);
1380 last_byte = (last_byte + mask) & ~mask;
1381 if (em->block_start == EXTENT_MAP_HOLE ||
1382 (cur_offset >= inode->i_size &&
1383 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1384 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1385 last_byte - cur_offset,
1386 1 << inode->i_blkbits,
1387 offset + len,
1388 &alloc_hint);
1389 if (ret < 0) {
1390 free_extent_map(em);
1391 break;
1392 }
1393 }
1394 free_extent_map(em);
1395
1396 cur_offset = last_byte;
1397 if (cur_offset >= alloc_end) {
1398 ret = 0;
1399 break;
1400 }
1401 }
1402 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1403 &cached_state, GFP_NOFS);
1404
1405 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1406out:
1407 mutex_unlock(&inode->i_mutex);
1408 return ret;
1409}
1410
1240const struct file_operations btrfs_file_operations = { 1411const struct file_operations btrfs_file_operations = {
1241 .llseek = generic_file_llseek, 1412 .llseek = generic_file_llseek,
1242 .read = do_sync_read, 1413 .read = do_sync_read,
@@ -1248,6 +1419,7 @@ const struct file_operations btrfs_file_operations = {
1248 .open = generic_file_open, 1419 .open = generic_file_open,
1249 .release = btrfs_release_file, 1420 .release = btrfs_release_file,
1250 .fsync = btrfs_sync_file, 1421 .fsync = btrfs_sync_file,
1422 .fallocate = btrfs_fallocate,
1251 .unlocked_ioctl = btrfs_ioctl, 1423 .unlocked_ioctl = btrfs_ioctl,
1252#ifdef CONFIG_COMPAT 1424#ifdef CONFIG_COMPAT
1253 .compat_ioctl = btrfs_ioctl, 1425 .compat_ioctl = btrfs_ioctl,
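
btrfs_fallocate() rounds the requested range out to sector boundaries with a mask derived from sectorsize before preallocating. The usual round-down/round-up idiom, standalone:

#include <stdio.h>

int main(void)
{
	unsigned long long sectorsize = 4096, mask = sectorsize - 1;
	unsigned long long offset = 5000, len = 3000;

	unsigned long long alloc_start = offset & ~mask;	      /* 4096 */
	unsigned long long alloc_end = (offset + len + mask) & ~mask; /* 8192 */

	printf("%llu %llu\n", alloc_start, alloc_end);
	return 0;
}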
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d684266959..11d2e9cea09e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -24,6 +24,7 @@
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h" 26#include "disk-io.h"
27#include "extent_io.h"
27 28
28#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 29#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
29#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 30#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
@@ -81,6 +82,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
81 return ERR_PTR(-ENOENT); 82 return ERR_PTR(-ENOENT);
82 } 83 }
83 84
85 inode->i_mapping->flags &= ~__GFP_FS;
86
84 spin_lock(&block_group->lock); 87 spin_lock(&block_group->lock);
85 if (!root->fs_info->closing) { 88 if (!root->fs_info->closing) {
86 block_group->inode = igrab(inode); 89 block_group->inode = igrab(inode);
@@ -222,6 +225,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
222 u64 num_entries; 225 u64 num_entries;
223 u64 num_bitmaps; 226 u64 num_bitmaps;
224 u64 generation; 227 u64 generation;
228 u64 used = btrfs_block_group_used(&block_group->item);
225 u32 cur_crc = ~(u32)0; 229 u32 cur_crc = ~(u32)0;
226 pgoff_t index = 0; 230 pgoff_t index = 0;
227 unsigned long first_page_offset; 231 unsigned long first_page_offset;
@@ -393,7 +397,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
393 break; 397 break;
394 398
395 need_loop = 1; 399 need_loop = 1;
396 e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 400 e = kmem_cache_zalloc(btrfs_free_space_cachep,
401 GFP_NOFS);
397 if (!e) { 402 if (!e) {
398 kunmap(page); 403 kunmap(page);
399 unlock_page(page); 404 unlock_page(page);
@@ -405,7 +410,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
405 e->bytes = le64_to_cpu(entry->bytes); 410 e->bytes = le64_to_cpu(entry->bytes);
406 if (!e->bytes) { 411 if (!e->bytes) {
407 kunmap(page); 412 kunmap(page);
408 kfree(e); 413 kmem_cache_free(btrfs_free_space_cachep, e);
409 unlock_page(page); 414 unlock_page(page);
410 page_cache_release(page); 415 page_cache_release(page);
411 goto free_cache; 416 goto free_cache;
@@ -420,7 +425,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
420 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 425 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
421 if (!e->bitmap) { 426 if (!e->bitmap) {
422 kunmap(page); 427 kunmap(page);
423 kfree(e); 428 kmem_cache_free(
429 btrfs_free_space_cachep, e);
424 unlock_page(page); 430 unlock_page(page);
425 page_cache_release(page); 431 page_cache_release(page);
426 goto free_cache; 432 goto free_cache;
@@ -465,6 +471,17 @@ next:
465 index++; 471 index++;
466 } 472 }
467 473
474 spin_lock(&block_group->tree_lock);
475 if (block_group->free_space != (block_group->key.offset - used -
476 block_group->bytes_super)) {
477 spin_unlock(&block_group->tree_lock);
 478 printk(KERN_ERR "block group %llu has a wrong amount of free "
479 "space\n", block_group->key.objectid);
480 ret = 0;
481 goto free_cache;
482 }
483 spin_unlock(&block_group->tree_lock);
484
468 ret = 1; 485 ret = 1;
469out: 486out:
470 kfree(checksums); 487 kfree(checksums);
@@ -491,18 +508,23 @@ int btrfs_write_out_cache(struct btrfs_root *root,
491 struct inode *inode; 508 struct inode *inode;
492 struct rb_node *node; 509 struct rb_node *node;
493 struct list_head *pos, *n; 510 struct list_head *pos, *n;
511 struct page **pages;
494 struct page *page; 512 struct page *page;
495 struct extent_state *cached_state = NULL; 513 struct extent_state *cached_state = NULL;
514 struct btrfs_free_cluster *cluster = NULL;
515 struct extent_io_tree *unpin = NULL;
496 struct list_head bitmap_list; 516 struct list_head bitmap_list;
497 struct btrfs_key key; 517 struct btrfs_key key;
518 u64 start, end, len;
498 u64 bytes = 0; 519 u64 bytes = 0;
499 u32 *crc, *checksums; 520 u32 *crc, *checksums;
500 pgoff_t index = 0, last_index = 0;
501 unsigned long first_page_offset; 521 unsigned long first_page_offset;
502 int num_checksums; 522 int index = 0, num_pages = 0;
503 int entries = 0; 523 int entries = 0;
504 int bitmaps = 0; 524 int bitmaps = 0;
505 int ret = 0; 525 int ret = 0;
526 bool next_page = false;
527 bool out_of_space = false;
506 528
507 root = root->fs_info->tree_root; 529 root = root->fs_info->tree_root;
508 530
@@ -530,24 +552,43 @@ int btrfs_write_out_cache(struct btrfs_root *root,
530 return 0; 552 return 0;
531 } 553 }
532 554
533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 555 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
556 PAGE_CACHE_SHIFT;
534 filemap_write_and_wait(inode->i_mapping); 557 filemap_write_and_wait(inode->i_mapping);
535 btrfs_wait_ordered_range(inode, inode->i_size & 558 btrfs_wait_ordered_range(inode, inode->i_size &
536 ~(root->sectorsize - 1), (u64)-1); 559 ~(root->sectorsize - 1), (u64)-1);
537 560
538 /* We need a checksum per page. */ 561 /* We need a checksum per page. */
539 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 562 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
540 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
541 if (!crc) { 563 if (!crc) {
542 iput(inode); 564 iput(inode);
543 return 0; 565 return 0;
544 } 566 }
545 567
568 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
569 if (!pages) {
570 kfree(crc);
571 iput(inode);
572 return 0;
573 }
574
546 /* Since the first page has all of our checksums and our generation we 575 /* Since the first page has all of our checksums and our generation we
547 * need to calculate the offset into the page that we can start writing 576 * need to calculate the offset into the page that we can start writing
548 * our entries. 577 * our entries.
549 */ 578 */
550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 579 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
580
581 /* Get the cluster for this block_group if it exists */
582 if (!list_empty(&block_group->cluster_list))
583 cluster = list_entry(block_group->cluster_list.next,
584 struct btrfs_free_cluster,
585 block_group_list);
586
587 /*
588 * We shouldn't have switched the pinned extents yet so this is the
589 * right one
590 */
591 unpin = root->fs_info->pinned_extents;
551 592
552 /* 593 /*
553 * Lock all pages first so we can lock the extent safely. 594 * Lock all pages first so we can lock the extent safely.
@@ -557,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
557 * after find_get_page at this point. Just putting this here so people 598 * after find_get_page at this point. Just putting this here so people
558 * know and don't freak out. 599 * know and don't freak out.
559 */ 600 */
560 while (index <= last_index) { 601 while (index < num_pages) {
561 page = grab_cache_page(inode->i_mapping, index); 602 page = grab_cache_page(inode->i_mapping, index);
562 if (!page) { 603 if (!page) {
563 pgoff_t i = 0; 604 int i;
564 605
565 while (i < index) { 606 for (i = 0; i < num_pages; i++) {
566 page = find_get_page(inode->i_mapping, i); 607 unlock_page(pages[i]);
567 unlock_page(page); 608 page_cache_release(pages[i]);
568 page_cache_release(page);
569 page_cache_release(page);
570 i++;
571 } 609 }
572 goto out_free; 610 goto out_free;
573 } 611 }
612 pages[index] = page;
574 index++; 613 index++;
575 } 614 }
576 615
@@ -578,6 +617,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
578 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 617 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
579 0, &cached_state, GFP_NOFS); 618 0, &cached_state, GFP_NOFS);
580 619
620 /*
621 * When searching for pinned extents, we need to start at our start
622 * offset.
623 */
624 start = block_group->key.objectid;
625
581 /* Write out the extent entries */ 626 /* Write out the extent entries */
582 do { 627 do {
583 struct btrfs_free_space_entry *entry; 628 struct btrfs_free_space_entry *entry;
@@ -585,18 +630,25 @@ int btrfs_write_out_cache(struct btrfs_root *root,
585 unsigned long offset = 0; 630 unsigned long offset = 0;
586 unsigned long start_offset = 0; 631 unsigned long start_offset = 0;
587 632
633 next_page = false;
634
588 if (index == 0) { 635 if (index == 0) {
589 start_offset = first_page_offset; 636 start_offset = first_page_offset;
590 offset = start_offset; 637 offset = start_offset;
591 } 638 }
592 639
593 page = find_get_page(inode->i_mapping, index); 640 if (index >= num_pages) {
641 out_of_space = true;
642 break;
643 }
644
645 page = pages[index];
594 646
595 addr = kmap(page); 647 addr = kmap(page);
596 entry = addr + start_offset; 648 entry = addr + start_offset;
597 649
598 memset(addr, 0, PAGE_CACHE_SIZE); 650 memset(addr, 0, PAGE_CACHE_SIZE);
599 while (1) { 651 while (node && !next_page) {
600 struct btrfs_free_space *e; 652 struct btrfs_free_space *e;
601 653
602 e = rb_entry(node, struct btrfs_free_space, offset_index); 654 e = rb_entry(node, struct btrfs_free_space, offset_index);
@@ -612,12 +664,49 @@ int btrfs_write_out_cache(struct btrfs_root *root,
612 entry->type = BTRFS_FREE_SPACE_EXTENT; 664 entry->type = BTRFS_FREE_SPACE_EXTENT;
613 } 665 }
614 node = rb_next(node); 666 node = rb_next(node);
615 if (!node) 667 if (!node && cluster) {
616 break; 668 node = rb_first(&cluster->root);
669 cluster = NULL;
670 }
617 offset += sizeof(struct btrfs_free_space_entry); 671 offset += sizeof(struct btrfs_free_space_entry);
618 if (offset + sizeof(struct btrfs_free_space_entry) >= 672 if (offset + sizeof(struct btrfs_free_space_entry) >=
619 PAGE_CACHE_SIZE) 673 PAGE_CACHE_SIZE)
674 next_page = true;
675 entry++;
676 }
677
678 /*
679 * We want to add any pinned extents to our free space cache
680 * so we don't leak the space
681 */
682 while (!next_page && (start < block_group->key.objectid +
683 block_group->key.offset)) {
684 ret = find_first_extent_bit(unpin, start, &start, &end,
685 EXTENT_DIRTY);
686 if (ret) {
687 ret = 0;
620 break; 688 break;
689 }
690
691 /* This pinned extent is out of our range */
692 if (start >= block_group->key.objectid +
693 block_group->key.offset)
694 break;
695
696 len = block_group->key.objectid +
697 block_group->key.offset - start;
698 len = min(len, end + 1 - start);
699
700 entries++;
701 entry->offset = cpu_to_le64(start);
702 entry->bytes = cpu_to_le64(len);
703 entry->type = BTRFS_FREE_SPACE_EXTENT;
704
705 start = end + 1;
706 offset += sizeof(struct btrfs_free_space_entry);
707 if (offset + sizeof(struct btrfs_free_space_entry) >=
708 PAGE_CACHE_SIZE)
709 next_page = true;
621 entry++; 710 entry++;
622 } 711 }
623 *crc = ~(u32)0; 712 *crc = ~(u32)0;
@@ -630,25 +719,8 @@ int btrfs_write_out_cache(struct btrfs_root *root,
630 719
631 bytes += PAGE_CACHE_SIZE; 720 bytes += PAGE_CACHE_SIZE;
632 721
633 ClearPageChecked(page);
634 set_page_extent_mapped(page);
635 SetPageUptodate(page);
636 set_page_dirty(page);
637
638 /*
639 * We need to release our reference we got for grab_cache_page,
640 * except for the first page which will hold our checksums, we
641 * do that below.
642 */
643 if (index != 0) {
644 unlock_page(page);
645 page_cache_release(page);
646 }
647
648 page_cache_release(page);
649
650 index++; 722 index++;
651 } while (node); 723 } while (node || next_page);
652 724
653 /* Write out the bitmaps */ 725 /* Write out the bitmaps */
654 list_for_each_safe(pos, n, &bitmap_list) { 726 list_for_each_safe(pos, n, &bitmap_list) {
@@ -656,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root,
656 struct btrfs_free_space *entry = 728 struct btrfs_free_space *entry =
657 list_entry(pos, struct btrfs_free_space, list); 729 list_entry(pos, struct btrfs_free_space, list);
658 730
659 page = find_get_page(inode->i_mapping, index); 731 if (index >= num_pages) {
732 out_of_space = true;
733 break;
734 }
735 page = pages[index];
660 736
661 addr = kmap(page); 737 addr = kmap(page);
662 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 738 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
@@ -667,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root,
667 crc++; 743 crc++;
668 bytes += PAGE_CACHE_SIZE; 744 bytes += PAGE_CACHE_SIZE;
669 745
670 ClearPageChecked(page);
671 set_page_extent_mapped(page);
672 SetPageUptodate(page);
673 set_page_dirty(page);
674 unlock_page(page);
675 page_cache_release(page);
676 page_cache_release(page);
677 list_del_init(&entry->list); 746 list_del_init(&entry->list);
678 index++; 747 index++;
679 } 748 }
680 749
750 if (out_of_space) {
751 btrfs_drop_pages(pages, num_pages);
752 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
753 i_size_read(inode) - 1, &cached_state,
754 GFP_NOFS);
755 ret = 0;
756 goto out_free;
757 }
758
681 /* Zero out the rest of the pages just to make sure */ 759 /* Zero out the rest of the pages just to make sure */
682 while (index <= last_index) { 760 while (index < num_pages) {
683 void *addr; 761 void *addr;
684 762
685 page = find_get_page(inode->i_mapping, index); 763 page = pages[index];
686
687 addr = kmap(page); 764 addr = kmap(page);
688 memset(addr, 0, PAGE_CACHE_SIZE); 765 memset(addr, 0, PAGE_CACHE_SIZE);
689 kunmap(page); 766 kunmap(page);
690 ClearPageChecked(page);
691 set_page_extent_mapped(page);
692 SetPageUptodate(page);
693 set_page_dirty(page);
694 unlock_page(page);
695 page_cache_release(page);
696 page_cache_release(page);
697 bytes += PAGE_CACHE_SIZE; 767 bytes += PAGE_CACHE_SIZE;
698 index++; 768 index++;
699 } 769 }
700 770
701 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
702
703 /* Write the checksums and trans id to the first page */ 771 /* Write the checksums and trans id to the first page */
704 { 772 {
705 void *addr; 773 void *addr;
706 u64 *gen; 774 u64 *gen;
707 775
708 page = find_get_page(inode->i_mapping, 0); 776 page = pages[0];
709 777
710 addr = kmap(page); 778 addr = kmap(page);
711 memcpy(addr, checksums, sizeof(u32) * num_checksums); 779 memcpy(addr, checksums, sizeof(u32) * num_pages);
712 gen = addr + (sizeof(u32) * num_checksums); 780 gen = addr + (sizeof(u32) * num_pages);
713 *gen = trans->transid; 781 *gen = trans->transid;
714 kunmap(page); 782 kunmap(page);
715 ClearPageChecked(page);
716 set_page_extent_mapped(page);
717 SetPageUptodate(page);
718 set_page_dirty(page);
719 unlock_page(page);
720 page_cache_release(page);
721 page_cache_release(page);
722 } 783 }
723 BTRFS_I(inode)->generation = trans->transid;
724 784
785 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
786 bytes, &cached_state);
787 btrfs_drop_pages(pages, num_pages);
725 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 788 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
726 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 789 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
727 790
791 if (ret) {
792 ret = 0;
793 goto out_free;
794 }
795
796 BTRFS_I(inode)->generation = trans->transid;
797
728 filemap_write_and_wait(inode->i_mapping); 798 filemap_write_and_wait(inode->i_mapping);
729 799
730 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 800 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -775,6 +845,7 @@ out_free:
775 BTRFS_I(inode)->generation = 0; 845 BTRFS_I(inode)->generation = 0;
776 } 846 }
777 kfree(checksums); 847 kfree(checksums);
848 kfree(pages);
778 btrfs_update_inode(trans, root, inode); 849 btrfs_update_inode(trans, root, inode);
779 iput(inode); 850 iput(inode);
780 return ret; 851 return ret;
@@ -987,11 +1058,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
987 return entry; 1058 return entry;
988} 1059}
989 1060
990static void unlink_free_space(struct btrfs_block_group_cache *block_group, 1061static inline void
991 struct btrfs_free_space *info) 1062__unlink_free_space(struct btrfs_block_group_cache *block_group,
1063 struct btrfs_free_space *info)
992{ 1064{
993 rb_erase(&info->offset_index, &block_group->free_space_offset); 1065 rb_erase(&info->offset_index, &block_group->free_space_offset);
994 block_group->free_extents--; 1066 block_group->free_extents--;
1067}
1068
1069static void unlink_free_space(struct btrfs_block_group_cache *block_group,
1070 struct btrfs_free_space *info)
1071{
1072 __unlink_free_space(block_group, info);
995 block_group->free_space -= info->bytes; 1073 block_group->free_space -= info->bytes;
996} 1074}
997 1075
@@ -1016,14 +1094,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
1016 u64 max_bytes; 1094 u64 max_bytes;
1017 u64 bitmap_bytes; 1095 u64 bitmap_bytes;
1018 u64 extent_bytes; 1096 u64 extent_bytes;
1097 u64 size = block_group->key.offset;
1019 1098
1020 /* 1099 /*
1021 * The goal is to keep the total amount of memory used per 1gb of space 1100 * The goal is to keep the total amount of memory used per 1gb of space
1022 * at or below 32k, so we need to adjust how much memory we allow to be 1101 * at or below 32k, so we need to adjust how much memory we allow to be
1023 * used by extent based free space tracking 1102 * used by extent based free space tracking
1024 */ 1103 */
1025 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1104 if (size < 1024 * 1024 * 1024)
1026 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1105 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1106 else
1107 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1108 div64_u64(size, 1024 * 1024 * 1024);
1027 1109
1028 /* 1110 /*
1029 * we want to account for 1 more bitmap than what we have so we can make 1111 * we want to account for 1 more bitmap than what we have so we can make
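
The threshold fix above guards against div64_u64() returning 0 for block groups smaller than 1GiB, which previously zeroed max_bytes and broke the bitmap accounting. The clamped computation as a standalone function:

#include <stdio.h>
#include <stdint.h>

#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
#define GIG			(1024ULL * 1024 * 1024)

static uint64_t cache_max_bytes(uint64_t size)
{
	/* sub-1GiB groups get the full per-GiB budget instead of 0 */
	if (size < GIG)
		return MAX_CACHE_BYTES_PER_GIG;
	return MAX_CACHE_BYTES_PER_GIG * (size / GIG);
}

int main(void)
{
	printf("%llu\n", (unsigned long long)cache_max_bytes(256ULL << 20));
	printf("%llu\n", (unsigned long long)cache_max_bytes(4 * GIG));
	return 0;
}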
@@ -1171,6 +1253,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
1171 recalculate_thresholds(block_group); 1253 recalculate_thresholds(block_group);
1172} 1254}
1173 1255
1256static void free_bitmap(struct btrfs_block_group_cache *block_group,
1257 struct btrfs_free_space *bitmap_info)
1258{
1259 unlink_free_space(block_group, bitmap_info);
1260 kfree(bitmap_info->bitmap);
1261 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1262 block_group->total_bitmaps--;
1263 recalculate_thresholds(block_group);
1264}
1265
1174static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1266static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
1175 struct btrfs_free_space *bitmap_info, 1267 struct btrfs_free_space *bitmap_info,
1176 u64 *offset, u64 *bytes) 1268 u64 *offset, u64 *bytes)
@@ -1195,6 +1287,7 @@ again:
1195 */ 1287 */
1196 search_start = *offset; 1288 search_start = *offset;
1197 search_bytes = *bytes; 1289 search_bytes = *bytes;
1290 search_bytes = min(search_bytes, end - search_start + 1);
1198 ret = search_bitmap(block_group, bitmap_info, &search_start, 1291 ret = search_bitmap(block_group, bitmap_info, &search_start,
1199 &search_bytes); 1292 &search_bytes);
1200 BUG_ON(ret < 0 || search_start != *offset); 1293 BUG_ON(ret < 0 || search_start != *offset);
@@ -1211,13 +1304,8 @@ again:
1211 1304
1212 if (*bytes) { 1305 if (*bytes) {
1213 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1306 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1214 if (!bitmap_info->bytes) { 1307 if (!bitmap_info->bytes)
1215 unlink_free_space(block_group, bitmap_info); 1308 free_bitmap(block_group, bitmap_info);
1216 kfree(bitmap_info->bitmap);
1217 kfree(bitmap_info);
1218 block_group->total_bitmaps--;
1219 recalculate_thresholds(block_group);
1220 }
1221 1309
1222 /* 1310 /*
1223 * no entry after this bitmap, but we still have bytes to 1311 * no entry after this bitmap, but we still have bytes to
@@ -1250,13 +1338,8 @@ again:
1250 return -EAGAIN; 1338 return -EAGAIN;
1251 1339
1252 goto again; 1340 goto again;
1253 } else if (!bitmap_info->bytes) { 1341 } else if (!bitmap_info->bytes)
1254 unlink_free_space(block_group, bitmap_info); 1342 free_bitmap(block_group, bitmap_info);
1255 kfree(bitmap_info->bitmap);
1256 kfree(bitmap_info);
1257 block_group->total_bitmaps--;
1258 recalculate_thresholds(block_group);
1259 }
1260 1343
1261 return 0; 1344 return 0;
1262} 1345}
@@ -1273,9 +1356,22 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
1273 * If we are below the extents threshold then we can add this as an 1356 * If we are below the extents threshold then we can add this as an
1274 * extent, and don't have to deal with the bitmap 1357 * extent, and don't have to deal with the bitmap
1275 */ 1358 */
1276 if (block_group->free_extents < block_group->extents_thresh && 1359 if (block_group->free_extents < block_group->extents_thresh) {
1277 info->bytes > block_group->sectorsize * 4) 1360 /*
1278 return 0; 1361 * If this block group has some small extents we don't want to
1362 * use up all of our free slots in the cache with them, we want
1363 * to reserve them to larger extents, however if we have plent
1364 * of cache left then go ahead an dadd them, no sense in adding
1365 * the overhead of a bitmap if we don't have to.
1366 */
1367 if (info->bytes <= block_group->sectorsize * 4) {
1368 if (block_group->free_extents * 2 <=
1369 block_group->extents_thresh)
1370 return 0;
1371 } else {
1372 return 0;
1373 }
1374 }
1279 1375
1280 /* 1376 /*
1281 * some block groups are so tiny they can't be enveloped by a bitmap, so 1377 * some block groups are so tiny they can't be enveloped by a bitmap, so
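Flattened out, the new gate keeps a range as a plain extent entry only while slots remain, and small ranges (at most four sectors, so 16KiB at the common 4KiB sectorsize) additionally require the cache to be under half full. A userspace reduction of the logic:

    #include <stdbool.h>

    /*
     * Reduction of the gate at the top of insert_into_bitmap() after this
     * patch: true means "keep as an extent entry", false means "fold it
     * into a bitmap".
     */
    static bool keep_as_extent(unsigned long free_extents,
                               unsigned long extents_thresh,
                               unsigned long long bytes,
                               unsigned long sectorsize)
    {
            if (free_extents >= extents_thresh)
                    return false;   /* extent slots exhausted */
            if (bytes > (unsigned long long)sectorsize * 4)
                    return true;    /* large extents always get a slot */
            /* small extents get a slot only while under half the threshold */
            return free_extents * 2 <= extents_thresh;
    }

With sectorsize 4096 and extents_thresh 64, for instance, a 12KiB range takes a slot only while at most 32 extent entries exist.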
@@ -1330,8 +1426,8 @@ new_bitmap:
1330 1426
1331 /* no pre-allocated info, allocate a new one */ 1427 /* no pre-allocated info, allocate a new one */
1332 if (!info) { 1428 if (!info) {
1333 info = kzalloc(sizeof(struct btrfs_free_space), 1429 info = kmem_cache_zalloc(btrfs_free_space_cachep,
1334 GFP_NOFS); 1430 GFP_NOFS);
1335 if (!info) { 1431 if (!info) {
1336 spin_lock(&block_group->tree_lock); 1432 spin_lock(&block_group->tree_lock);
1337 ret = -ENOMEM; 1433 ret = -ENOMEM;
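From here on, struct btrfs_free_space comes from a dedicated slab cache, btrfs_free_space_cachep (declared in inode.c further down), rather than generic kzalloc()/kfree(). A per-type cache packs these small fixed-size objects densely and makes their population visible in /proc/slabinfo. The pattern in isolation, a sketch with hypothetical names:

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct example_entry {
            u64 offset;
            u64 bytes;
    };

    static struct kmem_cache *example_cachep;

    static int example_cache_init(void)
    {
            /* flags assumed to mirror the other btrfs caches of this era */
            example_cachep = kmem_cache_create("example_entry",
                            sizeof(struct example_entry), 0,
                            SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
            return example_cachep ? 0 : -ENOMEM;
    }

    static void example_cache_use(void)
    {
            struct example_entry *e;

            e = kmem_cache_zalloc(example_cachep, GFP_NOFS); /* zeroed */
            if (!e)
                    return;
            /* ... link e somewhere, use it ... */
            kmem_cache_free(example_cachep, e);
    }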
@@ -1353,28 +1449,20 @@ out:
1353 if (info) { 1449 if (info) {
1354 if (info->bitmap) 1450 if (info->bitmap)
1355 kfree(info->bitmap); 1451 kfree(info->bitmap);
1356 kfree(info); 1452 kmem_cache_free(btrfs_free_space_cachep, info);
1357 } 1453 }
1358 1454
1359 return ret; 1455 return ret;
1360} 1456}
1361 1457
1362int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1458bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1363 u64 offset, u64 bytes) 1459 struct btrfs_free_space *info, bool update_stat)
1364{ 1460{
1365 struct btrfs_free_space *right_info = NULL; 1461 struct btrfs_free_space *left_info;
1366 struct btrfs_free_space *left_info = NULL; 1462 struct btrfs_free_space *right_info;
1367 struct btrfs_free_space *info = NULL; 1463 bool merged = false;
1368 int ret = 0; 1464 u64 offset = info->offset;
1369 1465 u64 bytes = info->bytes;
1370 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1371 if (!info)
1372 return -ENOMEM;
1373
1374 info->offset = offset;
1375 info->bytes = bytes;
1376
1377 spin_lock(&block_group->tree_lock);
1378 1466
1379 /* 1467 /*
1380 * first we want to see if there is free space adjacent to the range we 1468 * first we want to see if there is free space adjacent to the range we
@@ -1388,40 +1476,65 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1388 else 1476 else
1389 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1477 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
1390 1478
1391 /*
1392 * If there was no extent directly to the left or right of this new
1393 * extent then we know we're going to have to allocate a new extent, so
1394 * before we do that see if we need to drop this into a bitmap
1395 */
1396 if ((!left_info || left_info->bitmap) &&
1397 (!right_info || right_info->bitmap)) {
1398 ret = insert_into_bitmap(block_group, info);
1399
1400 if (ret < 0) {
1401 goto out;
1402 } else if (ret) {
1403 ret = 0;
1404 goto out;
1405 }
1406 }
1407
1408 if (right_info && !right_info->bitmap) { 1479 if (right_info && !right_info->bitmap) {
1409 unlink_free_space(block_group, right_info); 1480 if (update_stat)
1481 unlink_free_space(block_group, right_info);
1482 else
1483 __unlink_free_space(block_group, right_info);
1410 info->bytes += right_info->bytes; 1484 info->bytes += right_info->bytes;
1411 kfree(right_info); 1485 kmem_cache_free(btrfs_free_space_cachep, right_info);
1486 merged = true;
1412 } 1487 }
1413 1488
1414 if (left_info && !left_info->bitmap && 1489 if (left_info && !left_info->bitmap &&
1415 left_info->offset + left_info->bytes == offset) { 1490 left_info->offset + left_info->bytes == offset) {
1416 unlink_free_space(block_group, left_info); 1491 if (update_stat)
1492 unlink_free_space(block_group, left_info);
1493 else
1494 __unlink_free_space(block_group, left_info);
1417 info->offset = left_info->offset; 1495 info->offset = left_info->offset;
1418 info->bytes += left_info->bytes; 1496 info->bytes += left_info->bytes;
1419 kfree(left_info); 1497 kmem_cache_free(btrfs_free_space_cachep, left_info);
1498 merged = true;
1420 } 1499 }
1421 1500
1501 return merged;
1502}
1503
1504int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1505 u64 offset, u64 bytes)
1506{
1507 struct btrfs_free_space *info;
1508 int ret = 0;
1509
1510 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
1511 if (!info)
1512 return -ENOMEM;
1513
1514 info->offset = offset;
1515 info->bytes = bytes;
1516
1517 spin_lock(&block_group->tree_lock);
1518
1519 if (try_merge_free_space(block_group, info, true))
1520 goto link;
1521
1522 /*
 1523 * If there was no extent directly to the left or right of this new
1524 * extent then we know we're going to have to allocate a new extent, so
1525 * before we do that see if we need to drop this into a bitmap
1526 */
1527 ret = insert_into_bitmap(block_group, info);
1528 if (ret < 0) {
1529 goto out;
1530 } else if (ret) {
1531 ret = 0;
1532 goto out;
1533 }
1534link:
1422 ret = link_free_space(block_group, info); 1535 ret = link_free_space(block_group, info);
1423 if (ret) 1536 if (ret)
1424 kfree(info); 1537 kmem_cache_free(btrfs_free_space_cachep, info);
1425out: 1538out:
1426 spin_unlock(&block_group->tree_lock); 1539 spin_unlock(&block_group->tree_lock);
1427 1540
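The coalescing that used to sit inline in btrfs_add_free_space() is now try_merge_free_space(), so the cluster-return path below can reuse it. The update_stat flag picks unlink_free_space() versus __unlink_free_space(): entries coming back from a cluster were never subtracted from block_group->free_space, so merging them must not touch the totals a second time. The interval logic by itself, as a userspace sketch:

    #include <stdbool.h>
    #include <stdint.h>

    struct span { uint64_t offset, bytes; };

    /*
     * Reduction of try_merge_free_space(): absorb neighbors that touch the
     * new span exactly; the caller unlinks and frees whichever merged.
     */
    static bool try_merge(struct span *info, struct span *left,
                          struct span *right)
    {
            bool merged = false;

            if (right && right->offset == info->offset + info->bytes) {
                    info->bytes += right->bytes;
                    merged = true;
            }
            if (left && left->offset + left->bytes == info->offset) {
                    info->offset = left->offset;
                    info->bytes += left->bytes;
                    merged = true;
            }
            return merged;
    }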
@@ -1491,7 +1604,7 @@ again:
1491 kfree(info->bitmap); 1604 kfree(info->bitmap);
1492 block_group->total_bitmaps--; 1605 block_group->total_bitmaps--;
1493 } 1606 }
1494 kfree(info); 1607 kmem_cache_free(btrfs_free_space_cachep, info);
1495 goto out_lock; 1608 goto out_lock;
1496 } 1609 }
1497 1610
@@ -1527,7 +1640,7 @@ again:
1527 /* the hole we're creating ends at the end 1640 /* the hole we're creating ends at the end
1528 * of the info struct, just free the info 1641 * of the info struct, just free the info
1529 */ 1642 */
1530 kfree(info); 1643 kmem_cache_free(btrfs_free_space_cachep, info);
1531 } 1644 }
1532 spin_unlock(&block_group->tree_lock); 1645 spin_unlock(&block_group->tree_lock);
1533 1646
@@ -1600,29 +1713,28 @@ __btrfs_return_cluster_to_free_space(
1600{ 1713{
1601 struct btrfs_free_space *entry; 1714 struct btrfs_free_space *entry;
1602 struct rb_node *node; 1715 struct rb_node *node;
1603 bool bitmap;
1604 1716
1605 spin_lock(&cluster->lock); 1717 spin_lock(&cluster->lock);
1606 if (cluster->block_group != block_group) 1718 if (cluster->block_group != block_group)
1607 goto out; 1719 goto out;
1608 1720
1609 bitmap = cluster->points_to_bitmap;
1610 cluster->block_group = NULL; 1721 cluster->block_group = NULL;
1611 cluster->window_start = 0; 1722 cluster->window_start = 0;
1612 list_del_init(&cluster->block_group_list); 1723 list_del_init(&cluster->block_group_list);
1613 cluster->points_to_bitmap = false;
1614
1615 if (bitmap)
1616 goto out;
1617 1724
1618 node = rb_first(&cluster->root); 1725 node = rb_first(&cluster->root);
1619 while (node) { 1726 while (node) {
1727 bool bitmap;
1728
1620 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1729 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1621 node = rb_next(&entry->offset_index); 1730 node = rb_next(&entry->offset_index);
1622 rb_erase(&entry->offset_index, &cluster->root); 1731 rb_erase(&entry->offset_index, &cluster->root);
1623 BUG_ON(entry->bitmap); 1732
1733 bitmap = (entry->bitmap != NULL);
1734 if (!bitmap)
1735 try_merge_free_space(block_group, entry, false);
1624 tree_insert_offset(&block_group->free_space_offset, 1736 tree_insert_offset(&block_group->free_space_offset,
1625 entry->offset, &entry->offset_index, 0); 1737 entry->offset, &entry->offset_index, bitmap);
1626 } 1738 }
1627 cluster->root = RB_ROOT; 1739 cluster->root = RB_ROOT;
1628 1740
@@ -1659,7 +1771,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
1659 unlink_free_space(block_group, info); 1771 unlink_free_space(block_group, info);
1660 if (info->bitmap) 1772 if (info->bitmap)
1661 kfree(info->bitmap); 1773 kfree(info->bitmap);
1662 kfree(info); 1774 kmem_cache_free(btrfs_free_space_cachep, info);
1663 if (need_resched()) { 1775 if (need_resched()) {
1664 spin_unlock(&block_group->tree_lock); 1776 spin_unlock(&block_group->tree_lock);
1665 cond_resched(); 1777 cond_resched();
@@ -1685,19 +1797,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1685 ret = offset; 1797 ret = offset;
1686 if (entry->bitmap) { 1798 if (entry->bitmap) {
1687 bitmap_clear_bits(block_group, entry, offset, bytes); 1799 bitmap_clear_bits(block_group, entry, offset, bytes);
1688 if (!entry->bytes) { 1800 if (!entry->bytes)
1689 unlink_free_space(block_group, entry); 1801 free_bitmap(block_group, entry);
1690 kfree(entry->bitmap);
1691 kfree(entry);
1692 block_group->total_bitmaps--;
1693 recalculate_thresholds(block_group);
1694 }
1695 } else { 1802 } else {
1696 unlink_free_space(block_group, entry); 1803 unlink_free_space(block_group, entry);
1697 entry->offset += bytes; 1804 entry->offset += bytes;
1698 entry->bytes -= bytes; 1805 entry->bytes -= bytes;
1699 if (!entry->bytes) 1806 if (!entry->bytes)
1700 kfree(entry); 1807 kmem_cache_free(btrfs_free_space_cachep, entry);
1701 else 1808 else
1702 link_free_space(block_group, entry); 1809 link_free_space(block_group, entry);
1703 } 1810 }
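Two things happen in btrfs_find_space_for_alloc() above: an emptied bitmap now goes through the free_bitmap() helper rather than the open-coded five-line teardown, and plain extents keep being carved from the front of the entry. The latter in isolation:

    #include <stdint.h>

    struct span { uint64_t offset, bytes; };

    /*
     * The extent branch above: carve an allocation off the front of a free
     * span; the caller frees the span once bytes reaches zero, otherwise
     * relinks it at its new offset.
     */
    static uint64_t carve_front(struct span *s, uint64_t bytes)
    {
            uint64_t start = s->offset;

            s->offset += bytes;
            s->bytes  -= bytes;
            return start;
    }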
@@ -1750,48 +1857,24 @@ int btrfs_return_cluster_to_free_space(
1750 1857
1751static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 1858static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1752 struct btrfs_free_cluster *cluster, 1859 struct btrfs_free_cluster *cluster,
1860 struct btrfs_free_space *entry,
1753 u64 bytes, u64 min_start) 1861 u64 bytes, u64 min_start)
1754{ 1862{
1755 struct btrfs_free_space *entry;
1756 int err; 1863 int err;
1757 u64 search_start = cluster->window_start; 1864 u64 search_start = cluster->window_start;
1758 u64 search_bytes = bytes; 1865 u64 search_bytes = bytes;
1759 u64 ret = 0; 1866 u64 ret = 0;
1760 1867
1761 spin_lock(&block_group->tree_lock);
1762 spin_lock(&cluster->lock);
1763
1764 if (!cluster->points_to_bitmap)
1765 goto out;
1766
1767 if (cluster->block_group != block_group)
1768 goto out;
1769
1770 /*
1771 * search_start is the beginning of the bitmap, but at some point it may
1772 * be a good idea to point to the actual start of the free area in the
1773 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1774 * to 1 to make sure we get the bitmap entry
1775 */
1776 entry = tree_search_offset(block_group,
1777 offset_to_bitmap(block_group, search_start),
1778 1, 0);
1779 if (!entry || !entry->bitmap)
1780 goto out;
1781
1782 search_start = min_start; 1868 search_start = min_start;
1783 search_bytes = bytes; 1869 search_bytes = bytes;
1784 1870
1785 err = search_bitmap(block_group, entry, &search_start, 1871 err = search_bitmap(block_group, entry, &search_start,
1786 &search_bytes); 1872 &search_bytes);
1787 if (err) 1873 if (err)
1788 goto out; 1874 return 0;
1789 1875
1790 ret = search_start; 1876 ret = search_start;
1791 bitmap_clear_bits(block_group, entry, ret, bytes); 1877 bitmap_clear_bits(block_group, entry, ret, bytes);
1792out:
1793 spin_unlock(&cluster->lock);
1794 spin_unlock(&block_group->tree_lock);
1795 1878
1796 return ret; 1879 return ret;
1797} 1880}
@@ -1809,10 +1892,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1809 struct rb_node *node; 1892 struct rb_node *node;
1810 u64 ret = 0; 1893 u64 ret = 0;
1811 1894
1812 if (cluster->points_to_bitmap)
1813 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1814 min_start);
1815
1816 spin_lock(&cluster->lock); 1895 spin_lock(&cluster->lock);
1817 if (bytes > cluster->max_size) 1896 if (bytes > cluster->max_size)
1818 goto out; 1897 goto out;
@@ -1825,9 +1904,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1825 goto out; 1904 goto out;
1826 1905
1827 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1906 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1828
1829 while(1) { 1907 while(1) {
1830 if (entry->bytes < bytes || entry->offset < min_start) { 1908 if (entry->bytes < bytes ||
1909 (!entry->bitmap && entry->offset < min_start)) {
1831 struct rb_node *node; 1910 struct rb_node *node;
1832 1911
1833 node = rb_next(&entry->offset_index); 1912 node = rb_next(&entry->offset_index);
@@ -1837,20 +1916,53 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1837 offset_index); 1916 offset_index);
1838 continue; 1917 continue;
1839 } 1918 }
1840 ret = entry->offset;
1841 1919
1842 entry->offset += bytes; 1920 if (entry->bitmap) {
1843 entry->bytes -= bytes; 1921 ret = btrfs_alloc_from_bitmap(block_group,
1922 cluster, entry, bytes,
1923 min_start);
1924 if (ret == 0) {
1925 struct rb_node *node;
1926 node = rb_next(&entry->offset_index);
1927 if (!node)
1928 break;
1929 entry = rb_entry(node, struct btrfs_free_space,
1930 offset_index);
1931 continue;
1932 }
1933 } else {
1844 1934
1845 if (entry->bytes == 0) { 1935 ret = entry->offset;
1846 rb_erase(&entry->offset_index, &cluster->root); 1936
1847 kfree(entry); 1937 entry->offset += bytes;
1938 entry->bytes -= bytes;
1848 } 1939 }
1940
1941 if (entry->bytes == 0)
1942 rb_erase(&entry->offset_index, &cluster->root);
1849 break; 1943 break;
1850 } 1944 }
1851out: 1945out:
1852 spin_unlock(&cluster->lock); 1946 spin_unlock(&cluster->lock);
1853 1947
1948 if (!ret)
1949 return 0;
1950
1951 spin_lock(&block_group->tree_lock);
1952
1953 block_group->free_space -= bytes;
1954 if (entry->bytes == 0) {
1955 block_group->free_extents--;
1956 if (entry->bitmap) {
1957 kfree(entry->bitmap);
1958 block_group->total_bitmaps--;
1959 recalculate_thresholds(block_group);
1960 }
1961 kmem_cache_free(btrfs_free_space_cachep, entry);
1962 }
1963
1964 spin_unlock(&block_group->tree_lock);
1965
1854 return ret; 1966 return ret;
1855} 1967}
1856 1968
@@ -1866,12 +1978,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1866 unsigned long found_bits; 1978 unsigned long found_bits;
1867 unsigned long start = 0; 1979 unsigned long start = 0;
1868 unsigned long total_found = 0; 1980 unsigned long total_found = 0;
1981 int ret;
1869 bool found = false; 1982 bool found = false;
1870 1983
1871 i = offset_to_bit(entry->offset, block_group->sectorsize, 1984 i = offset_to_bit(entry->offset, block_group->sectorsize,
1872 max_t(u64, offset, entry->offset)); 1985 max_t(u64, offset, entry->offset));
1873 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 1986 search_bits = bytes_to_bits(bytes, block_group->sectorsize);
1874 total_bits = bytes_to_bits(bytes, block_group->sectorsize); 1987 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1875 1988
1876again: 1989again:
1877 found_bits = 0; 1990 found_bits = 0;
@@ -1888,7 +2001,7 @@ again:
1888 } 2001 }
1889 2002
1890 if (!found_bits) 2003 if (!found_bits)
1891 return -1; 2004 return -ENOSPC;
1892 2005
1893 if (!found) { 2006 if (!found) {
1894 start = i; 2007 start = i;
@@ -1912,189 +2025,208 @@ again:
1912 2025
1913 cluster->window_start = start * block_group->sectorsize + 2026 cluster->window_start = start * block_group->sectorsize +
1914 entry->offset; 2027 entry->offset;
1915 cluster->points_to_bitmap = true; 2028 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2029 ret = tree_insert_offset(&cluster->root, entry->offset,
2030 &entry->offset_index, 1);
2031 BUG_ON(ret);
1916 2032
1917 return 0; 2033 return 0;
1918} 2034}
1919 2035
1920/* 2036/*
1921 * here we try to find a cluster of blocks in a block group. The goal 2037 * This searches the block group for just extents to fill the cluster with.
1922 * is to find at least bytes free and up to empty_size + bytes free.
1923 * We might not find them all in one contiguous area.
1924 *
1925 * returns zero and sets up cluster if things worked out, otherwise
1926 * it returns -enospc
1927 */ 2038 */
1928int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 2039static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
1929 struct btrfs_root *root, 2040 struct btrfs_free_cluster *cluster,
1930 struct btrfs_block_group_cache *block_group, 2041 u64 offset, u64 bytes, u64 min_bytes)
1931 struct btrfs_free_cluster *cluster,
1932 u64 offset, u64 bytes, u64 empty_size)
1933{ 2042{
2043 struct btrfs_free_space *first = NULL;
1934 struct btrfs_free_space *entry = NULL; 2044 struct btrfs_free_space *entry = NULL;
2045 struct btrfs_free_space *prev = NULL;
2046 struct btrfs_free_space *last;
1935 struct rb_node *node; 2047 struct rb_node *node;
1936 struct btrfs_free_space *next;
1937 struct btrfs_free_space *last = NULL;
1938 u64 min_bytes;
1939 u64 window_start; 2048 u64 window_start;
1940 u64 window_free; 2049 u64 window_free;
1941 u64 max_extent = 0; 2050 u64 max_extent;
1942 bool found_bitmap = false; 2051 u64 max_gap = 128 * 1024;
1943 int ret;
1944 2052
1945 /* for metadata, allow allocates with more holes */ 2053 entry = tree_search_offset(block_group, offset, 0, 1);
1946 if (btrfs_test_opt(root, SSD_SPREAD)) { 2054 if (!entry)
1947 min_bytes = bytes + empty_size; 2055 return -ENOSPC;
1948 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1949 /*
1950 * we want to do larger allocations when we are
1951 * flushing out the delayed refs, it helps prevent
1952 * making more work as we go along.
1953 */
1954 if (trans->transaction->delayed_refs.flushing)
1955 min_bytes = max(bytes, (bytes + empty_size) >> 1);
1956 else
1957 min_bytes = max(bytes, (bytes + empty_size) >> 4);
1958 } else
1959 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1960
1961 spin_lock(&block_group->tree_lock);
1962 spin_lock(&cluster->lock);
1963
1964 /* someone already found a cluster, hooray */
1965 if (cluster->block_group) {
1966 ret = 0;
1967 goto out;
1968 }
1969again:
1970 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1971 if (!entry) {
1972 ret = -ENOSPC;
1973 goto out;
1974 }
1975 2056
1976 /* 2057 /*
1977 * If found_bitmap is true, we exhausted our search for extent entries, 2058 * We don't want bitmaps, so just move along until we find a normal
1978 * and we just want to search all of the bitmaps that we can find, and 2059 * extent entry.
1979 * ignore any extent entries we find.
1980 */ 2060 */
1981 while (entry->bitmap || found_bitmap || 2061 while (entry->bitmap) {
1982 (!entry->bitmap && entry->bytes < min_bytes)) { 2062 node = rb_next(&entry->offset_index);
1983 struct rb_node *node = rb_next(&entry->offset_index); 2063 if (!node)
1984 2064 return -ENOSPC;
1985 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1986 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1987 offset, bytes + empty_size,
1988 min_bytes);
1989 if (!ret)
1990 goto got_it;
1991 }
1992
1993 if (!node) {
1994 ret = -ENOSPC;
1995 goto out;
1996 }
1997 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2065 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1998 } 2066 }
1999 2067
2000 /*
2001 * We already searched all the extent entries from the passed in offset
2002 * to the end and didn't find enough space for the cluster, and we also
2003 * didn't find any bitmaps that met our criteria, just go ahead and exit
2004 */
2005 if (found_bitmap) {
2006 ret = -ENOSPC;
2007 goto out;
2008 }
2009
2010 cluster->points_to_bitmap = false;
2011 window_start = entry->offset; 2068 window_start = entry->offset;
2012 window_free = entry->bytes; 2069 window_free = entry->bytes;
2013 last = entry;
2014 max_extent = entry->bytes; 2070 max_extent = entry->bytes;
2071 first = entry;
2072 last = entry;
2073 prev = entry;
2015 2074
2016 while (1) { 2075 while (window_free <= min_bytes) {
2017 /* out window is just right, lets fill it */ 2076 node = rb_next(&entry->offset_index);
2018 if (window_free >= bytes + empty_size) 2077 if (!node)
2019 break; 2078 return -ENOSPC;
2020 2079 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2021 node = rb_next(&last->offset_index);
2022 if (!node) {
2023 if (found_bitmap)
2024 goto again;
2025 ret = -ENOSPC;
2026 goto out;
2027 }
2028 next = rb_entry(node, struct btrfs_free_space, offset_index);
2029 2080
2030 /* 2081 if (entry->bitmap)
2031 * we found a bitmap, so if this search doesn't result in a
2032 * cluster, we know to go and search again for the bitmaps and
2033 * start looking for space there
2034 */
2035 if (next->bitmap) {
2036 if (!found_bitmap)
2037 offset = next->offset;
2038 found_bitmap = true;
2039 last = next;
2040 continue; 2082 continue;
2041 }
2042
2043 /* 2083 /*
2044 * we haven't filled the empty size and the window is 2084 * we haven't filled the empty size and the window is
2045 * very large. reset and try again 2085 * very large. reset and try again
2046 */ 2086 */
2047 if (next->offset - (last->offset + last->bytes) > 128 * 1024 || 2087 if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
2048 next->offset - window_start > (bytes + empty_size) * 2) { 2088 entry->offset - window_start > (min_bytes * 2)) {
2049 entry = next; 2089 first = entry;
2050 window_start = entry->offset; 2090 window_start = entry->offset;
2051 window_free = entry->bytes; 2091 window_free = entry->bytes;
2052 last = entry; 2092 last = entry;
2053 max_extent = entry->bytes; 2093 max_extent = entry->bytes;
2054 } else { 2094 } else {
2055 last = next; 2095 last = entry;
2056 window_free += next->bytes; 2096 window_free += entry->bytes;
2057 if (entry->bytes > max_extent) 2097 if (entry->bytes > max_extent)
2058 max_extent = entry->bytes; 2098 max_extent = entry->bytes;
2059 } 2099 }
2100 prev = entry;
2060 } 2101 }
2061 2102
2062 cluster->window_start = entry->offset; 2103 cluster->window_start = first->offset;
2104
2105 node = &first->offset_index;
2063 2106
2064 /* 2107 /*
2065 * now we've found our entries, pull them out of the free space 2108 * now we've found our entries, pull them out of the free space
2066 * cache and put them into the cluster rbtree 2109 * cache and put them into the cluster rbtree
2067 *
2068 * The cluster includes an rbtree, but only uses the offset index
2069 * of each free space cache entry.
2070 */ 2110 */
2071 while (1) { 2111 do {
2112 int ret;
2113
2114 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2072 node = rb_next(&entry->offset_index); 2115 node = rb_next(&entry->offset_index);
2073 if (entry->bitmap && node) { 2116 if (entry->bitmap)
2074 entry = rb_entry(node, struct btrfs_free_space,
2075 offset_index);
2076 continue; 2117 continue;
2077 } else if (entry->bitmap && !node) {
2078 break;
2079 }
2080 2118
2081 rb_erase(&entry->offset_index, &block_group->free_space_offset); 2119 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2082 ret = tree_insert_offset(&cluster->root, entry->offset, 2120 ret = tree_insert_offset(&cluster->root, entry->offset,
2083 &entry->offset_index, 0); 2121 &entry->offset_index, 0);
2084 BUG_ON(ret); 2122 BUG_ON(ret);
2123 } while (node && entry != last);
2085 2124
2086 if (!node || entry == last) 2125 cluster->max_size = max_extent;
2087 break; 2126
2127 return 0;
2128}
2129
2130/*
2131 * This specifically looks for bitmaps that may work in the cluster, we assume
2132 * that we have already failed to find extents that will work.
2133 */
2134static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2135 struct btrfs_free_cluster *cluster,
2136 u64 offset, u64 bytes, u64 min_bytes)
2137{
2138 struct btrfs_free_space *entry;
2139 struct rb_node *node;
2140 int ret = -ENOSPC;
2141
2142 if (block_group->total_bitmaps == 0)
2143 return -ENOSPC;
2144
2145 entry = tree_search_offset(block_group,
2146 offset_to_bitmap(block_group, offset),
2147 0, 1);
2148 if (!entry)
2149 return -ENOSPC;
2088 2150
2151 node = &entry->offset_index;
2152 do {
2089 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2153 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2154 node = rb_next(&entry->offset_index);
2155 if (!entry->bitmap)
2156 continue;
2157 if (entry->bytes < min_bytes)
2158 continue;
2159 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2160 bytes, min_bytes);
2161 } while (ret && node);
2162
2163 return ret;
2164}
2165
2166/*
2167 * here we try to find a cluster of blocks in a block group. The goal
2168 * is to find at least bytes free and up to empty_size + bytes free.
2169 * We might not find them all in one contiguous area.
2170 *
2171 * returns zero and sets up cluster if things worked out, otherwise
 2172 * it returns -ENOSPC
2173 */
2174int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2175 struct btrfs_root *root,
2176 struct btrfs_block_group_cache *block_group,
2177 struct btrfs_free_cluster *cluster,
2178 u64 offset, u64 bytes, u64 empty_size)
2179{
2180 u64 min_bytes;
2181 int ret;
2182
2183 /* for metadata, allow allocates with more holes */
2184 if (btrfs_test_opt(root, SSD_SPREAD)) {
2185 min_bytes = bytes + empty_size;
2186 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2187 /*
2188 * we want to do larger allocations when we are
2189 * flushing out the delayed refs, it helps prevent
2190 * making more work as we go along.
2191 */
2192 if (trans->transaction->delayed_refs.flushing)
2193 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2194 else
2195 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2196 } else
2197 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2198
2199 spin_lock(&block_group->tree_lock);
2200
2201 /*
2202 * If we know we don't have enough space to make a cluster don't even
2203 * bother doing all the work to try and find one.
2204 */
2205 if (block_group->free_space < min_bytes) {
2206 spin_unlock(&block_group->tree_lock);
2207 return -ENOSPC;
2090 } 2208 }
2091 2209
2092 cluster->max_size = max_extent; 2210 spin_lock(&cluster->lock);
2093got_it: 2211
2094 ret = 0; 2212 /* someone already found a cluster, hooray */
2095 atomic_inc(&block_group->count); 2213 if (cluster->block_group) {
2096 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 2214 ret = 0;
2097 cluster->block_group = block_group; 2215 goto out;
2216 }
2217
2218 ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes,
2219 min_bytes);
2220 if (ret)
2221 ret = setup_cluster_bitmap(block_group, cluster, offset,
2222 bytes, min_bytes);
2223
2224 if (!ret) {
2225 atomic_inc(&block_group->count);
2226 list_add_tail(&cluster->block_group_list,
2227 &block_group->cluster_list);
2228 cluster->block_group = block_group;
2229 }
2098out: 2230out:
2099 spin_unlock(&cluster->lock); 2231 spin_unlock(&cluster->lock);
2100 spin_unlock(&block_group->tree_lock); 2232 spin_unlock(&block_group->tree_lock);
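The rewrite splits cluster construction into two passes under tree_lock: setup_cluster_no_bitmap() scans plain extent entries for a dense window, and only if that fails does setup_cluster_bitmap() try the bitmaps; the early free_space < min_bytes test skips both. The window scan at the heart of the extent pass, reduced to a sorted array (bitmap skipping and max_extent tracking omitted):

    #include <stddef.h>
    #include <stdint.h>

    struct ext { uint64_t offset, bytes; };

    /*
     * Reduction of setup_cluster_no_bitmap()'s scan over sorted free
     * extents: grow a window until it holds more than min_bytes, resetting
     * whenever the gap from the previous extent exceeds max_gap (128K in
     * the patch) or the window spans more than twice min_bytes. Returns
     * the index of the window's first extent, or -1 if nothing fits.
     */
    static int find_window(const struct ext *e, size_t n,
                           uint64_t min_bytes, uint64_t max_gap)
    {
            size_t first = 0, i;
            uint64_t window_free;

            if (!n)
                    return -1;
            window_free = e[0].bytes;

            for (i = 1; window_free <= min_bytes; i++) {
                    if (i == n)
                            return -1;
                    if (e[i].offset - (e[i - 1].offset + e[i - 1].bytes) > max_gap ||
                        e[i].offset - e[first].offset > min_bytes * 2) {
                            first = i;              /* reset the window */
                            window_free = e[i].bytes;
                    } else {
                            window_free += e[i].bytes;
                    }
            }
            return (int)first;
    }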
@@ -2111,8 +2243,99 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2111 spin_lock_init(&cluster->refill_lock); 2243 spin_lock_init(&cluster->refill_lock);
2112 cluster->root = RB_ROOT; 2244 cluster->root = RB_ROOT;
2113 cluster->max_size = 0; 2245 cluster->max_size = 0;
2114 cluster->points_to_bitmap = false;
2115 INIT_LIST_HEAD(&cluster->block_group_list); 2246 INIT_LIST_HEAD(&cluster->block_group_list);
2116 cluster->block_group = NULL; 2247 cluster->block_group = NULL;
2117} 2248}
2118 2249
2250int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2251 u64 *trimmed, u64 start, u64 end, u64 minlen)
2252{
2253 struct btrfs_free_space *entry = NULL;
2254 struct btrfs_fs_info *fs_info = block_group->fs_info;
2255 u64 bytes = 0;
2256 u64 actually_trimmed;
2257 int ret = 0;
2258
2259 *trimmed = 0;
2260
2261 while (start < end) {
2262 spin_lock(&block_group->tree_lock);
2263
2264 if (block_group->free_space < minlen) {
2265 spin_unlock(&block_group->tree_lock);
2266 break;
2267 }
2268
2269 entry = tree_search_offset(block_group, start, 0, 1);
2270 if (!entry)
2271 entry = tree_search_offset(block_group,
2272 offset_to_bitmap(block_group,
2273 start),
2274 1, 1);
2275
2276 if (!entry || entry->offset >= end) {
2277 spin_unlock(&block_group->tree_lock);
2278 break;
2279 }
2280
2281 if (entry->bitmap) {
2282 ret = search_bitmap(block_group, entry, &start, &bytes);
2283 if (!ret) {
2284 if (start >= end) {
2285 spin_unlock(&block_group->tree_lock);
2286 break;
2287 }
2288 bytes = min(bytes, end - start);
2289 bitmap_clear_bits(block_group, entry,
2290 start, bytes);
2291 if (entry->bytes == 0)
2292 free_bitmap(block_group, entry);
2293 } else {
2294 start = entry->offset + BITS_PER_BITMAP *
2295 block_group->sectorsize;
2296 spin_unlock(&block_group->tree_lock);
2297 ret = 0;
2298 continue;
2299 }
2300 } else {
2301 start = entry->offset;
2302 bytes = min(entry->bytes, end - start);
2303 unlink_free_space(block_group, entry);
 2304 kmem_cache_free(btrfs_free_space_cachep, entry);
2305 }
2306
2307 spin_unlock(&block_group->tree_lock);
2308
2309 if (bytes >= minlen) {
2310 int update_ret;
2311 update_ret = btrfs_update_reserved_bytes(block_group,
2312 bytes, 1, 1);
2313
2314 ret = btrfs_error_discard_extent(fs_info->extent_root,
2315 start,
2316 bytes,
2317 &actually_trimmed);
2318
2319 btrfs_add_free_space(block_group,
2320 start, bytes);
2321 if (!update_ret)
2322 btrfs_update_reserved_bytes(block_group,
2323 bytes, 0, 1);
2324
2325 if (ret)
2326 break;
2327 *trimmed += actually_trimmed;
2328 }
2329 start += bytes;
2330 bytes = 0;
2331
2332 if (fatal_signal_pending(current)) {
2333 ret = -ERESTARTSYS;
2334 break;
2335 }
2336
2337 cond_resched();
2338 }
2339
2340 return ret;
2341}
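btrfs_trim_block_group() is the engine behind FITRIM on btrfs: it walks the free-space entries in [start, end), discards anything at least minlen long through btrfs_error_discard_extent(), and re-adds the space afterwards. From userspace the whole path is exercised like this (a sketch assuming a btrfs filesystem mounted at /mnt/btrfs):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>           /* FITRIM, struct fstrim_range */

    int main(void)
    {
            struct fstrim_range range = {
                    .start  = 0,
                    .len    = UINT64_MAX,   /* whole filesystem */
                    .minlen = 64 * 1024,    /* skip runs under 64KiB */
            };
            int fd = open("/mnt/btrfs", O_RDONLY); /* any path on the fs */

            if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");
                    return 1;
            }
            /* the kernel writes the trimmed byte count back into len */
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            close(fd);
            return 0;
    }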
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e49ca5c321b5..65c3b935289f 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -68,4 +68,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
68int btrfs_return_cluster_to_free_space( 68int btrfs_return_cluster_to_free_space(
69 struct btrfs_block_group_cache *block_group, 69 struct btrfs_block_group_cache *block_group,
70 struct btrfs_free_cluster *cluster); 70 struct btrfs_free_cluster *cluster);
71int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
72 u64 *trimmed, u64 start, u64 end, u64 minlen);
71#endif 73#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb5909172..c05a08f4c411 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -30,7 +30,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
30 int slot; 30 int slot;
31 31
32 path = btrfs_alloc_path(); 32 path = btrfs_alloc_path();
33 BUG_ON(!path); 33 if (!path)
34 return -ENOMEM;
34 35
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID; 36 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1; 37 search_key.type = -1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a3798a3aa0d2..fcd66b6a8086 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
50#include "tree-log.h" 50#include "tree-log.h"
51#include "compression.h" 51#include "compression.h"
52#include "locking.h" 52#include "locking.h"
53#include "free-space-cache.h"
53 54
54struct btrfs_iget_args { 55struct btrfs_iget_args {
55 u64 ino; 56 u64 ino;
@@ -70,6 +71,7 @@ static struct kmem_cache *btrfs_inode_cachep;
70struct kmem_cache *btrfs_trans_handle_cachep; 71struct kmem_cache *btrfs_trans_handle_cachep;
71struct kmem_cache *btrfs_transaction_cachep; 72struct kmem_cache *btrfs_transaction_cachep;
72struct kmem_cache *btrfs_path_cachep; 73struct kmem_cache *btrfs_path_cachep;
74struct kmem_cache *btrfs_free_space_cachep;
73 75
74#define S_SHIFT 12 76#define S_SHIFT 12
75static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,7 +84,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
82 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
83}; 85};
84 86
85static void btrfs_truncate(struct inode *inode); 87static int btrfs_setsize(struct inode *inode, loff_t newsize);
88static int btrfs_truncate(struct inode *inode);
86static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 89static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
87static noinline int cow_file_range(struct inode *inode, 90static noinline int cow_file_range(struct inode *inode,
88 struct page *locked_page, 91 struct page *locked_page,
@@ -90,13 +93,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 93 unsigned long *nr_written, int unlock);
91 94
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 95static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 96 struct inode *inode, struct inode *dir,
97 const struct qstr *qstr)
94{ 98{
95 int err; 99 int err;
96 100
97 err = btrfs_init_acl(trans, inode, dir); 101 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 102 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 103 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 104 return err;
101} 105}
102 106
@@ -108,6 +112,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
108static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 112static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root, struct inode *inode, 113 struct btrfs_root *root, struct inode *inode,
110 u64 start, size_t size, size_t compressed_size, 114 u64 start, size_t size, size_t compressed_size,
115 int compress_type,
111 struct page **compressed_pages) 116 struct page **compressed_pages)
112{ 117{
113 struct btrfs_key key; 118 struct btrfs_key key;
@@ -122,12 +127,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 127 size_t cur_size = size;
123 size_t datasize; 128 size_t datasize;
124 unsigned long offset; 129 unsigned long offset;
125 int use_compress = 0;
126 130
127 if (compressed_size && compressed_pages) { 131 if (compressed_size && compressed_pages)
128 use_compress = 1;
129 cur_size = compressed_size; 132 cur_size = compressed_size;
130 }
131 133
132 path = btrfs_alloc_path(); 134 path = btrfs_alloc_path();
133 if (!path) 135 if (!path)
@@ -159,7 +161,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 161 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 ptr = btrfs_file_extent_inline_start(ei); 162 ptr = btrfs_file_extent_inline_start(ei);
161 163
162 if (use_compress) { 164 if (compress_type != BTRFS_COMPRESS_NONE) {
163 struct page *cpage; 165 struct page *cpage;
164 int i = 0; 166 int i = 0;
165 while (compressed_size > 0) { 167 while (compressed_size > 0) {
@@ -176,7 +178,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 compressed_size -= cur_size; 178 compressed_size -= cur_size;
177 } 179 }
178 btrfs_set_file_extent_compression(leaf, ei, 180 btrfs_set_file_extent_compression(leaf, ei,
179 BTRFS_COMPRESS_ZLIB); 181 compress_type);
180 } else { 182 } else {
181 page = find_get_page(inode->i_mapping, 183 page = find_get_page(inode->i_mapping,
182 start >> PAGE_CACHE_SHIFT); 184 start >> PAGE_CACHE_SHIFT);
@@ -217,7 +219,7 @@ fail:
217static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 219static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
218 struct btrfs_root *root, 220 struct btrfs_root *root,
219 struct inode *inode, u64 start, u64 end, 221 struct inode *inode, u64 start, u64 end,
220 size_t compressed_size, 222 size_t compressed_size, int compress_type,
221 struct page **compressed_pages) 223 struct page **compressed_pages)
222{ 224{
223 u64 isize = i_size_read(inode); 225 u64 isize = i_size_read(inode);
@@ -250,7 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
250 inline_len = min_t(u64, isize, actual_end); 252 inline_len = min_t(u64, isize, actual_end);
251 ret = insert_inline_extent(trans, root, inode, start, 253 ret = insert_inline_extent(trans, root, inode, start,
252 inline_len, compressed_size, 254 inline_len, compressed_size,
253 compressed_pages); 255 compress_type, compressed_pages);
254 BUG_ON(ret); 256 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start); 257 btrfs_delalloc_release_metadata(inode, end + 1 - start);
256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 258 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
@@ -263,6 +265,7 @@ struct async_extent {
263 u64 compressed_size; 265 u64 compressed_size;
264 struct page **pages; 266 struct page **pages;
265 unsigned long nr_pages; 267 unsigned long nr_pages;
268 int compress_type;
266 struct list_head list; 269 struct list_head list;
267}; 270};
268 271
@@ -280,16 +283,19 @@ static noinline int add_async_extent(struct async_cow *cow,
280 u64 start, u64 ram_size, 283 u64 start, u64 ram_size,
281 u64 compressed_size, 284 u64 compressed_size,
282 struct page **pages, 285 struct page **pages,
283 unsigned long nr_pages) 286 unsigned long nr_pages,
287 int compress_type)
284{ 288{
285 struct async_extent *async_extent; 289 struct async_extent *async_extent;
286 290
287 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 291 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
292 BUG_ON(!async_extent);
288 async_extent->start = start; 293 async_extent->start = start;
289 async_extent->ram_size = ram_size; 294 async_extent->ram_size = ram_size;
290 async_extent->compressed_size = compressed_size; 295 async_extent->compressed_size = compressed_size;
291 async_extent->pages = pages; 296 async_extent->pages = pages;
292 async_extent->nr_pages = nr_pages; 297 async_extent->nr_pages = nr_pages;
298 async_extent->compress_type = compress_type;
293 list_add_tail(&async_extent->list, &cow->extents); 299 list_add_tail(&async_extent->list, &cow->extents);
294 return 0; 300 return 0;
295} 301}
@@ -332,6 +338,7 @@ static noinline int compress_file_range(struct inode *inode,
332 unsigned long max_uncompressed = 128 * 1024; 338 unsigned long max_uncompressed = 128 * 1024;
333 int i; 339 int i;
334 int will_compress; 340 int will_compress;
341 int compress_type = root->fs_info->compress_type;
335 342
336 actual_end = min_t(u64, isize, end + 1); 343 actual_end = min_t(u64, isize, end + 1);
337again: 344again:
@@ -377,16 +384,22 @@ again:
377 */ 384 */
378 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 385 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
379 (btrfs_test_opt(root, COMPRESS) || 386 (btrfs_test_opt(root, COMPRESS) ||
380 (BTRFS_I(inode)->force_compress))) { 387 (BTRFS_I(inode)->force_compress) ||
388 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
381 WARN_ON(pages); 389 WARN_ON(pages);
382 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 390 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
391 BUG_ON(!pages);
392
393 if (BTRFS_I(inode)->force_compress)
394 compress_type = BTRFS_I(inode)->force_compress;
383 395
384 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 396 ret = btrfs_compress_pages(compress_type,
385 total_compressed, pages, 397 inode->i_mapping, start,
386 nr_pages, &nr_pages_ret, 398 total_compressed, pages,
387 &total_in, 399 nr_pages, &nr_pages_ret,
388 &total_compressed, 400 &total_in,
389 max_compressed); 401 &total_compressed,
402 max_compressed);
390 403
391 if (!ret) { 404 if (!ret) {
392 unsigned long offset = total_compressed & 405 unsigned long offset = total_compressed &
@@ -408,7 +421,7 @@ again:
408 } 421 }
409 if (start == 0) { 422 if (start == 0) {
410 trans = btrfs_join_transaction(root, 1); 423 trans = btrfs_join_transaction(root, 1);
411 BUG_ON(!trans); 424 BUG_ON(IS_ERR(trans));
412 btrfs_set_trans_block_group(trans, inode); 425 btrfs_set_trans_block_group(trans, inode);
413 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 426 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
414 427
@@ -418,12 +431,13 @@ again:
418 * to make an uncompressed inline extent. 431 * to make an uncompressed inline extent.
419 */ 432 */
420 ret = cow_file_range_inline(trans, root, inode, 433 ret = cow_file_range_inline(trans, root, inode,
421 start, end, 0, NULL); 434 start, end, 0, 0, NULL);
422 } else { 435 } else {
423 /* try making a compressed inline extent */ 436 /* try making a compressed inline extent */
424 ret = cow_file_range_inline(trans, root, inode, 437 ret = cow_file_range_inline(trans, root, inode,
425 start, end, 438 start, end,
426 total_compressed, pages); 439 total_compressed,
440 compress_type, pages);
427 } 441 }
428 if (ret == 0) { 442 if (ret == 0) {
429 /* 443 /*
@@ -493,7 +507,8 @@ again:
493 * and will submit them to the elevator. 507 * and will submit them to the elevator.
494 */ 508 */
495 add_async_extent(async_cow, start, num_bytes, 509 add_async_extent(async_cow, start, num_bytes,
496 total_compressed, pages, nr_pages_ret); 510 total_compressed, pages, nr_pages_ret,
511 compress_type);
497 512
498 if (start + num_bytes < end) { 513 if (start + num_bytes < end) {
499 start += num_bytes; 514 start += num_bytes;
@@ -515,7 +530,8 @@ cleanup_and_bail_uncompressed:
515 __set_page_dirty_nobuffers(locked_page); 530 __set_page_dirty_nobuffers(locked_page);
516 /* unlocked later on in the async handlers */ 531 /* unlocked later on in the async handlers */
517 } 532 }
518 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 533 add_async_extent(async_cow, start, end - start + 1,
534 0, NULL, 0, BTRFS_COMPRESS_NONE);
519 *num_added += 1; 535 *num_added += 1;
520 } 536 }
521 537
@@ -602,6 +618,7 @@ retry:
602 GFP_NOFS); 618 GFP_NOFS);
603 619
604 trans = btrfs_join_transaction(root, 1); 620 trans = btrfs_join_transaction(root, 1);
621 BUG_ON(IS_ERR(trans));
605 ret = btrfs_reserve_extent(trans, root, 622 ret = btrfs_reserve_extent(trans, root,
606 async_extent->compressed_size, 623 async_extent->compressed_size,
607 async_extent->compressed_size, 624 async_extent->compressed_size,
@@ -633,6 +650,7 @@ retry:
633 async_extent->ram_size - 1, 0); 650 async_extent->ram_size - 1, 0);
634 651
635 em = alloc_extent_map(GFP_NOFS); 652 em = alloc_extent_map(GFP_NOFS);
653 BUG_ON(!em);
636 em->start = async_extent->start; 654 em->start = async_extent->start;
637 em->len = async_extent->ram_size; 655 em->len = async_extent->ram_size;
638 em->orig_start = em->start; 656 em->orig_start = em->start;
@@ -640,6 +658,7 @@ retry:
640 em->block_start = ins.objectid; 658 em->block_start = ins.objectid;
641 em->block_len = ins.offset; 659 em->block_len = ins.offset;
642 em->bdev = root->fs_info->fs_devices->latest_bdev; 660 em->bdev = root->fs_info->fs_devices->latest_bdev;
661 em->compress_type = async_extent->compress_type;
643 set_bit(EXTENT_FLAG_PINNED, &em->flags); 662 set_bit(EXTENT_FLAG_PINNED, &em->flags);
644 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 663 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
645 664
@@ -656,11 +675,13 @@ retry:
656 async_extent->ram_size - 1, 0); 675 async_extent->ram_size - 1, 0);
657 } 676 }
658 677
659 ret = btrfs_add_ordered_extent(inode, async_extent->start, 678 ret = btrfs_add_ordered_extent_compress(inode,
660 ins.objectid, 679 async_extent->start,
661 async_extent->ram_size, 680 ins.objectid,
662 ins.offset, 681 async_extent->ram_size,
663 BTRFS_ORDERED_COMPRESSED); 682 ins.offset,
683 BTRFS_ORDERED_COMPRESSED,
684 async_extent->compress_type);
664 BUG_ON(ret); 685 BUG_ON(ret);
665 686
666 /* 687 /*
@@ -758,7 +779,7 @@ static noinline int cow_file_range(struct inode *inode,
758 779
759 BUG_ON(root == root->fs_info->tree_root); 780 BUG_ON(root == root->fs_info->tree_root);
760 trans = btrfs_join_transaction(root, 1); 781 trans = btrfs_join_transaction(root, 1);
761 BUG_ON(!trans); 782 BUG_ON(IS_ERR(trans));
762 btrfs_set_trans_block_group(trans, inode); 783 btrfs_set_trans_block_group(trans, inode);
763 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 784 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
764 785
@@ -770,7 +791,7 @@ static noinline int cow_file_range(struct inode *inode,
770 if (start == 0) { 791 if (start == 0) {
771 /* lets try to make an inline extent */ 792 /* lets try to make an inline extent */
772 ret = cow_file_range_inline(trans, root, inode, 793 ret = cow_file_range_inline(trans, root, inode,
773 start, end, 0, NULL); 794 start, end, 0, 0, NULL);
774 if (ret == 0) { 795 if (ret == 0) {
775 extent_clear_unlock_delalloc(inode, 796 extent_clear_unlock_delalloc(inode,
776 &BTRFS_I(inode)->io_tree, 797 &BTRFS_I(inode)->io_tree,
@@ -806,6 +827,7 @@ static noinline int cow_file_range(struct inode *inode,
806 BUG_ON(ret); 827 BUG_ON(ret);
807 828
808 em = alloc_extent_map(GFP_NOFS); 829 em = alloc_extent_map(GFP_NOFS);
830 BUG_ON(!em);
809 em->start = start; 831 em->start = start;
810 em->orig_start = em->start; 832 em->orig_start = em->start;
811 ram_size = ins.offset; 833 ram_size = ins.offset;
@@ -1036,7 +1058,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1036 } else { 1058 } else {
1037 trans = btrfs_join_transaction(root, 1); 1059 trans = btrfs_join_transaction(root, 1);
1038 } 1060 }
1039 BUG_ON(!trans); 1061 BUG_ON(IS_ERR(trans));
1040 1062
1041 cow_start = (u64)-1; 1063 cow_start = (u64)-1;
1042 cur_offset = start; 1064 cur_offset = start;
@@ -1155,6 +1177,7 @@ out_check:
1155 struct extent_map_tree *em_tree; 1177 struct extent_map_tree *em_tree;
1156 em_tree = &BTRFS_I(inode)->extent_tree; 1178 em_tree = &BTRFS_I(inode)->extent_tree;
1157 em = alloc_extent_map(GFP_NOFS); 1179 em = alloc_extent_map(GFP_NOFS);
1180 BUG_ON(!em);
1158 em->start = cur_offset; 1181 em->start = cur_offset;
1159 em->orig_start = em->start; 1182 em->orig_start = em->start;
1160 em->len = num_bytes; 1183 em->len = num_bytes;
@@ -1236,7 +1259,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1236 ret = run_delalloc_nocow(inode, locked_page, start, end, 1259 ret = run_delalloc_nocow(inode, locked_page, start, end,
1237 page_started, 0, nr_written); 1260 page_started, 0, nr_written);
1238 else if (!btrfs_test_opt(root, COMPRESS) && 1261 else if (!btrfs_test_opt(root, COMPRESS) &&
1239 !(BTRFS_I(inode)->force_compress)) 1262 !(BTRFS_I(inode)->force_compress) &&
1263 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
1240 ret = cow_file_range(inode, locked_page, start, end, 1264 ret = cow_file_range(inode, locked_page, start, end,
1241 page_started, nr_written, 1); 1265 page_started, nr_written, 1);
1242 else 1266 else
@@ -1443,8 +1467,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1443 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1467 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1444 return btrfs_submit_compressed_read(inode, bio, 1468 return btrfs_submit_compressed_read(inode, bio,
1445 mirror_num, bio_flags); 1469 mirror_num, bio_flags);
1446 } else if (!skip_sum) 1470 } else if (!skip_sum) {
1447 btrfs_lookup_bio_sums(root, inode, bio, NULL); 1471 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1472 if (ret)
1473 return ret;
1474 }
1448 goto mapit; 1475 goto mapit;
1449 } else if (!skip_sum) { 1476 } else if (!skip_sum) {
1450 /* csum items have already been cloned */ 1477 /* csum items have already been cloned */
@@ -1544,6 +1571,7 @@ out:
1544out_page: 1571out_page:
1545 unlock_page(page); 1572 unlock_page(page);
1546 page_cache_release(page); 1573 page_cache_release(page);
1574 kfree(fixup);
1547} 1575}
1548 1576
1549/* 1577/*
@@ -1670,7 +1698,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1670 struct btrfs_ordered_extent *ordered_extent = NULL; 1698 struct btrfs_ordered_extent *ordered_extent = NULL;
1671 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1699 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1672 struct extent_state *cached_state = NULL; 1700 struct extent_state *cached_state = NULL;
1673 int compressed = 0; 1701 int compress_type = 0;
1674 int ret; 1702 int ret;
1675 bool nolock = false; 1703 bool nolock = false;
1676 1704
@@ -1690,7 +1718,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1690 trans = btrfs_join_transaction_nolock(root, 1); 1718 trans = btrfs_join_transaction_nolock(root, 1);
1691 else 1719 else
1692 trans = btrfs_join_transaction(root, 1); 1720 trans = btrfs_join_transaction(root, 1);
1693 BUG_ON(!trans); 1721 BUG_ON(IS_ERR(trans));
1694 btrfs_set_trans_block_group(trans, inode); 1722 btrfs_set_trans_block_group(trans, inode);
1695 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1723 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1696 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
@@ -1707,13 +1735,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1707 trans = btrfs_join_transaction_nolock(root, 1); 1735 trans = btrfs_join_transaction_nolock(root, 1);
1708 else 1736 else
1709 trans = btrfs_join_transaction(root, 1); 1737 trans = btrfs_join_transaction(root, 1);
1738 BUG_ON(IS_ERR(trans));
1710 btrfs_set_trans_block_group(trans, inode); 1739 btrfs_set_trans_block_group(trans, inode);
1711 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1740 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1712 1741
1713 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1742 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1714 compressed = 1; 1743 compress_type = ordered_extent->compress_type;
1715 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1744 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1716 BUG_ON(compressed); 1745 BUG_ON(compress_type);
1717 ret = btrfs_mark_extent_written(trans, inode, 1746 ret = btrfs_mark_extent_written(trans, inode,
1718 ordered_extent->file_offset, 1747 ordered_extent->file_offset,
1719 ordered_extent->file_offset + 1748 ordered_extent->file_offset +
@@ -1727,7 +1756,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1727 ordered_extent->disk_len, 1756 ordered_extent->disk_len,
1728 ordered_extent->len, 1757 ordered_extent->len,
1729 ordered_extent->len, 1758 ordered_extent->len,
1730 compressed, 0, 0, 1759 compress_type, 0, 0,
1731 BTRFS_FILE_EXTENT_REG); 1760 BTRFS_FILE_EXTENT_REG);
1732 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1761 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1733 ordered_extent->file_offset, 1762 ordered_extent->file_offset,
@@ -1741,9 +1770,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1741 add_pending_csums(trans, inode, ordered_extent->file_offset, 1770 add_pending_csums(trans, inode, ordered_extent->file_offset,
1742 &ordered_extent->list); 1771 &ordered_extent->list);
1743 1772
1744 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1773 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1745 ret = btrfs_update_inode(trans, root, inode); 1774 if (!ret) {
1746 BUG_ON(ret); 1775 ret = btrfs_update_inode(trans, root, inode);
1776 BUG_ON(ret);
1777 }
1778 ret = 0;
1747out: 1779out:
1748 if (nolock) { 1780 if (nolock) {
1749 if (trans) 1781 if (trans)
@@ -1765,6 +1797,8 @@ out:
1765static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1797static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1766 struct extent_state *state, int uptodate) 1798 struct extent_state *state, int uptodate)
1767{ 1799{
1800 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1801
1768 ClearPagePrivate2(page); 1802 ClearPagePrivate2(page);
1769 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1803 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1770} 1804}
@@ -1829,6 +1863,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1829 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1863 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1830 logical = em->block_start; 1864 logical = em->block_start;
1831 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1865 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1866 extent_set_compress_type(&failrec->bio_flags,
1867 em->compress_type);
1832 } 1868 }
1833 failrec->logical = logical; 1869 failrec->logical = logical;
1834 free_extent_map(em); 1870 free_extent_map(em);
@@ -1873,10 +1909,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1873 else 1909 else
1874 rw = READ; 1910 rw = READ;
1875 1911
1876 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1912 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1877 failrec->last_mirror, 1913 failrec->last_mirror,
1878 failrec->bio_flags, 0); 1914 failrec->bio_flags, 0);
1879 return 0; 1915 return ret;
1880} 1916}
1881 1917
1882/* 1918/*
@@ -1892,7 +1928,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1892 1928
1893 private = 0; 1929 private = 0;
1894 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1930 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1895 (u64)-1, 1, EXTENT_DIRTY)) { 1931 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1896 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1932 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1897 start, &private_failure); 1933 start, &private_failure);
1898 if (ret == 0) { 1934 if (ret == 0) {
@@ -2188,8 +2224,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2188 insert = 1; 2224 insert = 1;
2189#endif 2225#endif
2190 insert = 1; 2226 insert = 1;
2191 } else {
2192 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2193 } 2227 }
2194 2228
2195 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2229 if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2260,7 +2294,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2260 * this cleans up any orphans that may be left on the list from the last use 2294 * this cleans up any orphans that may be left on the list from the last use
2261 * of this root. 2295 * of this root.
2262 */ 2296 */
2263void btrfs_orphan_cleanup(struct btrfs_root *root) 2297int btrfs_orphan_cleanup(struct btrfs_root *root)
2264{ 2298{
2265 struct btrfs_path *path; 2299 struct btrfs_path *path;
2266 struct extent_buffer *leaf; 2300 struct extent_buffer *leaf;
@@ -2270,10 +2304,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2270 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2304 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2271 2305
2272 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2306 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2273 return; 2307 return 0;
2274 2308
2275 path = btrfs_alloc_path(); 2309 path = btrfs_alloc_path();
2276 BUG_ON(!path); 2310 if (!path) {
2311 ret = -ENOMEM;
2312 goto out;
2313 }
2277 path->reada = -1; 2314 path->reada = -1;
2278 2315
2279 key.objectid = BTRFS_ORPHAN_OBJECTID; 2316 key.objectid = BTRFS_ORPHAN_OBJECTID;
@@ -2282,18 +2319,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2282 2319
2283 while (1) { 2320 while (1) {
2284 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2321 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2285 if (ret < 0) { 2322 if (ret < 0)
2286 printk(KERN_ERR "Error searching slot for orphan: %d" 2323 goto out;
2287 "\n", ret);
2288 break;
2289 }
2290 2324
2291 /* 2325 /*
2292 * if ret == 0 means we found what we were searching for, which 2326 * if ret == 0 means we found what we were searching for, which
2293 * is weird, but possible, so only screw with path if we didnt 2327 * is weird, but possible, so only screw with path if we didn't
2294 * find the key and see if we have stuff that matches 2328 * find the key and see if we have stuff that matches
2295 */ 2329 */
2296 if (ret > 0) { 2330 if (ret > 0) {
2331 ret = 0;
2297 if (path->slots[0] == 0) 2332 if (path->slots[0] == 0)
2298 break; 2333 break;
2299 path->slots[0]--; 2334 path->slots[0]--;
@@ -2321,7 +2356,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2321 found_key.type = BTRFS_INODE_ITEM_KEY; 2356 found_key.type = BTRFS_INODE_ITEM_KEY;
2322 found_key.offset = 0; 2357 found_key.offset = 0;
2323 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2358 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2324 BUG_ON(IS_ERR(inode)); 2359 if (IS_ERR(inode)) {
2360 ret = PTR_ERR(inode);
2361 goto out;
2362 }
2325 2363
2326 /* 2364 /*
2327 * add this inode to the orphan list so btrfs_orphan_del does 2365 * add this inode to the orphan list so btrfs_orphan_del does
@@ -2339,6 +2377,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2339 */ 2377 */
2340 if (is_bad_inode(inode)) { 2378 if (is_bad_inode(inode)) {
2341 trans = btrfs_start_transaction(root, 0); 2379 trans = btrfs_start_transaction(root, 0);
2380 if (IS_ERR(trans)) {
2381 ret = PTR_ERR(trans);
2382 goto out;
2383 }
2342 btrfs_orphan_del(trans, inode); 2384 btrfs_orphan_del(trans, inode);
2343 btrfs_end_transaction(trans, root); 2385 btrfs_end_transaction(trans, root);
2344 iput(inode); 2386 iput(inode);
@@ -2347,17 +2389,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2347 2389
2348 /* if we have links, this was a truncate, let's do that */ 2390 /* if we have links, this was a truncate, let's do that */
2349 if (inode->i_nlink) { 2391 if (inode->i_nlink) {
2392 if (!S_ISREG(inode->i_mode)) {
2393 WARN_ON(1);
2394 iput(inode);
2395 continue;
2396 }
2350 nr_truncate++; 2397 nr_truncate++;
2351 btrfs_truncate(inode); 2398 ret = btrfs_truncate(inode);
2352 } else { 2399 } else {
2353 nr_unlink++; 2400 nr_unlink++;
2354 } 2401 }
2355 2402
2356 /* this will do delete_inode and everything for us */ 2403 /* this will do delete_inode and everything for us */
2357 iput(inode); 2404 iput(inode);
2405 if (ret)
2406 goto out;
2358 } 2407 }
2359 btrfs_free_path(path);
2360
2361 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2408 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2362 2409
2363 if (root->orphan_block_rsv) 2410 if (root->orphan_block_rsv)
@@ -2366,13 +2413,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2366 2413
2367 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2414 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2368 trans = btrfs_join_transaction(root, 1); 2415 trans = btrfs_join_transaction(root, 1);
2369 btrfs_end_transaction(trans, root); 2416 if (!IS_ERR(trans))
2417 btrfs_end_transaction(trans, root);
2370 } 2418 }
2371 2419
2372 if (nr_unlink) 2420 if (nr_unlink)
2373 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2421 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2374 if (nr_truncate) 2422 if (nr_truncate)
2375 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2423 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2424
2425out:
2426 if (ret)
2427 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2428 btrfs_free_path(path);
2429 return ret;
2376} 2430}
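
The rework above turns a void function peppered with BUG_ON() into one that funnels every failure through a single out: label, frees the path there, and reports the error once. A minimal userspace sketch of that shape (illustrative names, not the kernel API):

```c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int cleanup_orphans(int nitems)
{
	int *list = NULL;
	int ret = 0;
	int i;

	list = malloc(sizeof(*list) * (size_t)nitems);
	if (!list) {
		ret = -ENOMEM;
		goto out;
	}
	for (i = 0; i < nitems; i++) {
		list[i] = i;		/* stand-in for per-orphan work */
		if (list[i] < 0) {	/* stand-in for a per-item failure */
			ret = -EIO;
			goto out;
		}
	}
out:
	if (ret)
		fprintf(stderr, "could not do orphan cleanup %d\n", ret);
	free(list);	/* free(NULL) is safe, like btrfs_free_path(NULL) */
	return ret;
}

int main(void)
{
	return cleanup_orphans(4);
}
```
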
2377 2431
2378/* 2432/*
@@ -2539,6 +2593,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2539 struct btrfs_inode_item *item, 2593 struct btrfs_inode_item *item,
2540 struct inode *inode) 2594 struct inode *inode)
2541{ 2595{
2596 if (!leaf->map_token)
2597 map_private_extent_buffer(leaf, (unsigned long)item,
2598 sizeof(struct btrfs_inode_item),
2599 &leaf->map_token, &leaf->kaddr,
2600 &leaf->map_start, &leaf->map_len,
2601 KM_USER1);
2602
2542 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2603 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2543 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2604 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2544 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2605 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2567,6 +2628,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2567 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2628 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2568 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2629 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2569 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2630 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2631
2632 if (leaf->map_token) {
2633 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2634 leaf->map_token = NULL;
2635 }
2570} 2636}
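
The two hunks above bracket the run of btrfs_set_inode_*() setters with one map_private_extent_buffer()/unmap_extent_buffer() pair, so the item is mapped once instead of once per field. A rough userspace analogue of amortizing a map across several writes (map_region()/unmap_region() are hypothetical stand-ins):

```c
#include <assert.h>

struct item { unsigned int uid, gid; unsigned long long size; };

/* hypothetical stand-ins for map_private_extent_buffer()/unmap_extent_buffer() */
static void *map_region(void *base, unsigned long off) { return (char *)base + off; }
static void unmap_region(void *token) { (void)token; }

static void fill_item(void *buf, unsigned long off)
{
	/* one map for the whole item instead of one per setter */
	struct item *it = map_region(buf, off);

	it->uid = 1000;
	it->gid = 1000;
	it->size = 4096;
	unmap_region(it);
}

int main(void)
{
	struct item items[2];

	fill_item(items, sizeof(struct item));
	assert(items[1].size == 4096);
	return 0;
}
```
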
2571 2637
2572/* 2638/*
@@ -2611,10 +2677,10 @@ failed:
2611 * recovery code. It remove a link in a directory with a given name, and 2677 * recovery code. It remove a link in a directory with a given name, and
2612 * also drops the back refs in the inode to the directory 2678 * also drops the back refs in the inode to the directory
2613 */ 2679 */
2614int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2680static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2615 struct btrfs_root *root, 2681 struct btrfs_root *root,
2616 struct inode *dir, struct inode *inode, 2682 struct inode *dir, struct inode *inode,
2617 const char *name, int name_len) 2683 const char *name, int name_len)
2618{ 2684{
2619 struct btrfs_path *path; 2685 struct btrfs_path *path;
2620 int ret = 0; 2686 int ret = 0;
@@ -2626,7 +2692,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2626 path = btrfs_alloc_path(); 2692 path = btrfs_alloc_path();
2627 if (!path) { 2693 if (!path) {
2628 ret = -ENOMEM; 2694 ret = -ENOMEM;
2629 goto err; 2695 goto out;
2630 } 2696 }
2631 2697
2632 path->leave_spinning = 1; 2698 path->leave_spinning = 1;
@@ -2686,12 +2752,25 @@ err:
2686 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2752 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2687 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2753 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2688 btrfs_update_inode(trans, root, dir); 2754 btrfs_update_inode(trans, root, dir);
2689 btrfs_drop_nlink(inode);
2690 ret = btrfs_update_inode(trans, root, inode);
2691out: 2755out:
2692 return ret; 2756 return ret;
2693} 2757}
2694 2758
2759int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2760 struct btrfs_root *root,
2761 struct inode *dir, struct inode *inode,
2762 const char *name, int name_len)
2763{
2764 int ret;
2765 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2766 if (!ret) {
2767 btrfs_drop_nlink(inode);
2768 ret = btrfs_update_inode(trans, root, inode);
2769 }
2770 return ret;
2771}
2772
2773
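
Splitting btrfs_unlink_inode() this way keeps the nlink bookkeeping in the public wrapper, so the rename path later in this patch can call the bare helper without first inflating the link count just to have it dropped again. A runnable toy version of the split (names illustrative):

```c
#include <stdio.h>

struct node { int nlink; };

/* stand-in for btrfs_update_inode() */
static int update_node(struct node *n) { (void)n; return 0; }

/* stand-in for __btrfs_unlink_inode(): directory-entry work only */
static int __unlink_entry(struct node *n) { (void)n; return 0; }

/* public wrapper adds the nlink bookkeeping, like btrfs_unlink_inode() */
static int unlink_entry(struct node *n)
{
	int ret = __unlink_entry(n);

	if (!ret) {
		n->nlink--;		/* btrfs_drop_nlink() */
		ret = update_node(n);
	}
	return ret;
}

int main(void)
{
	struct node n = { .nlink = 2 };

	unlink_entry(&n);
	printf("nlink now %d\n", n.nlink);	/* prints: nlink now 1 */
	return 0;
}
```
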
2695/* helper to check if there is any shared block in the path */ 2774/* helper to check if there is any shared block in the path */
2696static int check_path_shared(struct btrfs_root *root, 2775static int check_path_shared(struct btrfs_root *root,
2697 struct btrfs_path *path) 2776 struct btrfs_path *path)
@@ -2699,9 +2778,10 @@ static int check_path_shared(struct btrfs_root *root,
2699 struct extent_buffer *eb; 2778 struct extent_buffer *eb;
2700 int level; 2779 int level;
2701 u64 refs = 1; 2780 u64 refs = 1;
2702 int uninitialized_var(ret);
2703 2781
2704 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2782 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2783 int ret;
2784
2705 if (!path->nodes[level]) 2785 if (!path->nodes[level])
2706 break; 2786 break;
2707 eb = path->nodes[level]; 2787 eb = path->nodes[level];
@@ -2712,7 +2792,7 @@ static int check_path_shared(struct btrfs_root *root,
2712 if (refs > 1) 2792 if (refs > 1)
2713 return 1; 2793 return 1;
2714 } 2794 }
2715 return ret; /* XXX callers? */ 2795 return 0;
2716} 2796}
2717 2797
2718/* 2798/*
@@ -3512,7 +3592,13 @@ out:
3512 return ret; 3592 return ret;
3513} 3593}
3514 3594
3515int btrfs_cont_expand(struct inode *inode, loff_t size) 3595/*
3596 * This function puts in dummy file extents for the area we're creating a hole
3597 * for. So if we are truncating this file to a larger size, we need to insert
3598 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
3599 * the range between oldsize and size.
3600 */
3601int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3516{ 3602{
3517 struct btrfs_trans_handle *trans; 3603 struct btrfs_trans_handle *trans;
3518 struct btrfs_root *root = BTRFS_I(inode)->root; 3604 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3520,7 +3606,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3520 struct extent_map *em = NULL; 3606 struct extent_map *em = NULL;
3521 struct extent_state *cached_state = NULL; 3607 struct extent_state *cached_state = NULL;
3522 u64 mask = root->sectorsize - 1; 3608 u64 mask = root->sectorsize - 1;
3523 u64 hole_start = (inode->i_size + mask) & ~mask; 3609 u64 hole_start = (oldsize + mask) & ~mask;
3524 u64 block_end = (size + mask) & ~mask; 3610 u64 block_end = (size + mask) & ~mask;
3525 u64 last_byte; 3611 u64 last_byte;
3526 u64 cur_offset; 3612 u64 cur_offset;
@@ -3565,13 +3651,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3565 err = btrfs_drop_extents(trans, inode, cur_offset, 3651 err = btrfs_drop_extents(trans, inode, cur_offset,
3566 cur_offset + hole_size, 3652 cur_offset + hole_size,
3567 &hint_byte, 1); 3653 &hint_byte, 1);
3568 BUG_ON(err); 3654 if (err)
3655 break;
3569 3656
3570 err = btrfs_insert_file_extent(trans, root, 3657 err = btrfs_insert_file_extent(trans, root,
3571 inode->i_ino, cur_offset, 0, 3658 inode->i_ino, cur_offset, 0,
3572 0, hole_size, 0, hole_size, 3659 0, hole_size, 0, hole_size,
3573 0, 0, 0); 3660 0, 0, 0);
3574 BUG_ON(err); 3661 if (err)
3662 break;
3575 3663
3576 btrfs_drop_extent_cache(inode, hole_start, 3664 btrfs_drop_extent_cache(inode, hole_start,
3577 last_byte - 1, 0); 3665 last_byte - 1, 0);
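
The hole boundaries here come from the usual power-of-two rounding: with mask == sectorsize - 1, (x + mask) & ~mask rounds x up to the next sector, so hole_start and block_end always land on sector boundaries. A quick standalone check of that arithmetic:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t mask = sectorsize - 1;
	uint64_t oldsize = 10000, newsize = 20000;

	uint64_t hole_start = (oldsize + mask) & ~mask;	/* round up: 12288 */
	uint64_t block_end  = (newsize + mask) & ~mask;	/* round up: 20480 */

	assert(hole_start == 12288 && block_end == 20480);
	return 0;
}
```
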
@@ -3591,94 +3679,58 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3591 return err; 3679 return err;
3592} 3680}
3593 3681
3594static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) 3682static int btrfs_setsize(struct inode *inode, loff_t newsize)
3595{ 3683{
3596 struct btrfs_root *root = BTRFS_I(inode)->root; 3684 loff_t oldsize = i_size_read(inode);
3597 struct btrfs_trans_handle *trans;
3598 unsigned long nr;
3599 int ret; 3685 int ret;
3600 3686
3601 if (attr->ia_size == inode->i_size) 3687 if (newsize == oldsize)
3602 return 0; 3688 return 0;
3603 3689
3604 if (attr->ia_size > inode->i_size) { 3690 if (newsize > oldsize) {
3605 unsigned long limit; 3691 i_size_write(inode, newsize);
3606 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 3692 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3607 if (attr->ia_size > inode->i_sb->s_maxbytes) 3693 truncate_pagecache(inode, oldsize, newsize);
3608 return -EFBIG; 3694 ret = btrfs_cont_expand(inode, oldsize, newsize);
3609 if (limit != RLIM_INFINITY && attr->ia_size > limit) {
3610 send_sig(SIGXFSZ, current, 0);
3611 return -EFBIG;
3612 }
3613 }
3614
3615 trans = btrfs_start_transaction(root, 5);
3616 if (IS_ERR(trans))
3617 return PTR_ERR(trans);
3618
3619 btrfs_set_trans_block_group(trans, inode);
3620
3621 ret = btrfs_orphan_add(trans, inode);
3622 BUG_ON(ret);
3623
3624 nr = trans->blocks_used;
3625 btrfs_end_transaction(trans, root);
3626 btrfs_btree_balance_dirty(root, nr);
3627
3628 if (attr->ia_size > inode->i_size) {
3629 ret = btrfs_cont_expand(inode, attr->ia_size);
3630 if (ret) { 3695 if (ret) {
3631 btrfs_truncate(inode); 3696 btrfs_setsize(inode, oldsize);
3632 return ret; 3697 return ret;
3633 } 3698 }
3634 3699
3635 i_size_write(inode, attr->ia_size); 3700 mark_inode_dirty(inode);
3636 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3701 } else {
3637 3702
3638 trans = btrfs_start_transaction(root, 0); 3703 /*
3639 BUG_ON(IS_ERR(trans)); 3704 * We're truncating a file that used to have good data down to
3640 btrfs_set_trans_block_group(trans, inode); 3705 * zero. Make sure it gets into the ordered flush list so that
3641 trans->block_rsv = root->orphan_block_rsv; 3706 * any new writes get down to disk quickly.
3642 BUG_ON(!trans->block_rsv); 3707 */
3708 if (newsize == 0)
3709 BTRFS_I(inode)->ordered_data_close = 1;
3643 3710
3644 ret = btrfs_update_inode(trans, root, inode); 3711 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3645 BUG_ON(ret); 3712 truncate_setsize(inode, newsize);
3646 if (inode->i_nlink > 0) { 3713 ret = btrfs_truncate(inode);
3647 ret = btrfs_orphan_del(trans, inode);
3648 BUG_ON(ret);
3649 }
3650 nr = trans->blocks_used;
3651 btrfs_end_transaction(trans, root);
3652 btrfs_btree_balance_dirty(root, nr);
3653 return 0;
3654 } 3714 }
3655 3715
3656 /* 3716 return ret;
3657 * We're truncating a file that used to have good data down to
3658 * zero. Make sure it gets into the ordered flush list so that
3659 * any new writes get down to disk quickly.
3660 */
3661 if (attr->ia_size == 0)
3662 BTRFS_I(inode)->ordered_data_close = 1;
3663
3664 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3665 ret = vmtruncate(inode, attr->ia_size);
3666 BUG_ON(ret);
3667
3668 return 0;
3669} 3717}
3670 3718
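
btrfs_setsize() now has two clearly separated paths: grow (write the new i_size, expand, and roll back to oldsize on failure) and shrink (set the size, then truncate). A compressed userspace sketch of that control flow, with stand-in helpers:

```c
#include <stdio.h>

static long cur_size = 1000;

static int expand(long newsize) { (void)newsize; return -1; /* pretend it failed */ }
static int truncate_to(long newsize) { cur_size = newsize; return 0; }

static int setsize(long newsize)
{
	long oldsize = cur_size;

	if (newsize == oldsize)
		return 0;

	if (newsize > oldsize) {
		cur_size = newsize;		/* i_size_write() */
		if (expand(newsize)) {
			cur_size = oldsize;	/* roll back, as the patch does */
			return -1;
		}
		return 0;
	}
	return truncate_to(newsize);		/* truncate_setsize() + truncate */
}

int main(void)
{
	printf("%d, size=%ld\n", setsize(2000), cur_size);	/* -1, size=1000 */
	return 0;
}
```
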
3671static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3719static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3672{ 3720{
3673 struct inode *inode = dentry->d_inode; 3721 struct inode *inode = dentry->d_inode;
3722 struct btrfs_root *root = BTRFS_I(inode)->root;
3674 int err; 3723 int err;
3675 3724
3725 if (btrfs_root_readonly(root))
3726 return -EROFS;
3727
3676 err = inode_change_ok(inode, attr); 3728 err = inode_change_ok(inode, attr);
3677 if (err) 3729 if (err)
3678 return err; 3730 return err;
3679 3731
3680 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3732 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3681 err = btrfs_setattr_size(inode, attr); 3733 err = btrfs_setsize(inode, attr->ia_size);
3682 if (err) 3734 if (err)
3683 return err; 3735 return err;
3684 } 3736 }
@@ -3701,6 +3753,8 @@ void btrfs_evict_inode(struct inode *inode)
3701 unsigned long nr; 3753 unsigned long nr;
3702 int ret; 3754 int ret;
3703 3755
3756 trace_btrfs_inode_evict(inode);
3757
3704 truncate_inode_pages(&inode->i_data, 0); 3758 truncate_inode_pages(&inode->i_data, 0);
3705 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3759 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3706 root == root->fs_info->tree_root)) 3760 root == root->fs_info->tree_root))
@@ -4043,7 +4097,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4043 BTRFS_I(inode)->root = root; 4097 BTRFS_I(inode)->root = root;
4044 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4098 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4045 btrfs_read_locked_inode(inode); 4099 btrfs_read_locked_inode(inode);
4046
4047 inode_tree_add(inode); 4100 inode_tree_add(inode);
4048 unlock_new_inode(inode); 4101 unlock_new_inode(inode);
4049 if (new) 4102 if (new)
@@ -4115,11 +4168,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4115 } 4168 }
4116 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4169 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4117 4170
4118 if (root != sub_root) { 4171 if (!IS_ERR(inode) && root != sub_root) {
4119 down_read(&root->fs_info->cleanup_work_sem); 4172 down_read(&root->fs_info->cleanup_work_sem);
4120 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4173 if (!(inode->i_sb->s_flags & MS_RDONLY))
4121 btrfs_orphan_cleanup(sub_root); 4174 ret = btrfs_orphan_cleanup(sub_root);
4122 up_read(&root->fs_info->cleanup_work_sem); 4175 up_read(&root->fs_info->cleanup_work_sem);
4176 if (ret)
4177 inode = ERR_PTR(ret);
4123 } 4178 }
4124 4179
4125 return inode; 4180 return inode;
@@ -4167,10 +4222,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4167 struct btrfs_key found_key; 4222 struct btrfs_key found_key;
4168 struct btrfs_path *path; 4223 struct btrfs_path *path;
4169 int ret; 4224 int ret;
4170 u32 nritems;
4171 struct extent_buffer *leaf; 4225 struct extent_buffer *leaf;
4172 int slot; 4226 int slot;
4173 int advance;
4174 unsigned char d_type; 4227 unsigned char d_type;
4175 int over = 0; 4228 int over = 0;
4176 u32 di_cur; 4229 u32 di_cur;
@@ -4213,27 +4266,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4213 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4266 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4214 if (ret < 0) 4267 if (ret < 0)
4215 goto err; 4268 goto err;
4216 advance = 0;
4217 4269
4218 while (1) { 4270 while (1) {
4219 leaf = path->nodes[0]; 4271 leaf = path->nodes[0];
4220 nritems = btrfs_header_nritems(leaf);
4221 slot = path->slots[0]; 4272 slot = path->slots[0];
4222 if (advance || slot >= nritems) { 4273 if (slot >= btrfs_header_nritems(leaf)) {
4223 if (slot >= nritems - 1) { 4274 ret = btrfs_next_leaf(root, path);
4224 ret = btrfs_next_leaf(root, path); 4275 if (ret < 0)
4225 if (ret) 4276 goto err;
4226 break; 4277 else if (ret > 0)
4227 leaf = path->nodes[0]; 4278 break;
4228 nritems = btrfs_header_nritems(leaf); 4279 continue;
4229 slot = path->slots[0];
4230 } else {
4231 slot++;
4232 path->slots[0]++;
4233 }
4234 } 4280 }
4235 4281
4236 advance = 1;
4237 item = btrfs_item_nr(leaf, slot); 4282 item = btrfs_item_nr(leaf, slot);
4238 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4283 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4239 4284
@@ -4242,7 +4287,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4242 if (btrfs_key_type(&found_key) != key_type) 4287 if (btrfs_key_type(&found_key) != key_type)
4243 break; 4288 break;
4244 if (found_key.offset < filp->f_pos) 4289 if (found_key.offset < filp->f_pos)
4245 continue; 4290 goto next;
4246 4291
4247 filp->f_pos = found_key.offset; 4292 filp->f_pos = found_key.offset;
4248 4293
@@ -4253,6 +4298,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4253 while (di_cur < di_total) { 4298 while (di_cur < di_total) {
4254 struct btrfs_key location; 4299 struct btrfs_key location;
4255 4300
4301 if (verify_dir_item(root, leaf, di))
4302 break;
4303
4256 name_len = btrfs_dir_name_len(leaf, di); 4304 name_len = btrfs_dir_name_len(leaf, di);
4257 if (name_len <= sizeof(tmp_name)) { 4305 if (name_len <= sizeof(tmp_name)) {
4258 name_ptr = tmp_name; 4306 name_ptr = tmp_name;
@@ -4292,6 +4340,8 @@ skip:
4292 di_cur += di_len; 4340 di_cur += di_len;
4293 di = (struct btrfs_dir_item *)((char *)di + di_len); 4341 di = (struct btrfs_dir_item *)((char *)di + di_len);
4294 } 4342 }
4343next:
4344 path->slots[0]++;
4295 } 4345 }
4296 4346
4297 /* Reached end of directory/root. Bump pos past the last item. */ 4347 /* Reached end of directory/root. Bump pos past the last item. */
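
The readdir loop above drops the nritems/advance bookkeeping: the slot is bumped once at the bottom (the new next: label) and btrfs_next_leaf() is called only when the slot runs off the current leaf. The same iteration shape over two toy "leaves":

```c
#include <stdio.h>

#define LEAF_ITEMS 3
static int leaves[2][LEAF_ITEMS] = { {1, 2, 3}, {4, 5, 6} };

int main(void)
{
	int leaf = 0, slot = 0;

	while (1) {
		if (slot >= LEAF_ITEMS) {	/* ran off this leaf */
			if (++leaf >= 2)	/* next_leaf() returns >0: done */
				break;
			slot = 0;
			continue;
		}
		printf("%d\n", leaves[leaf][slot]);
		slot++;				/* the new 'next:' bump */
	}
	return 0;
}
```
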
@@ -4328,6 +4378,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4328 trans = btrfs_join_transaction_nolock(root, 1); 4378 trans = btrfs_join_transaction_nolock(root, 1);
4329 else 4379 else
4330 trans = btrfs_join_transaction(root, 1); 4380 trans = btrfs_join_transaction(root, 1);
4381 if (IS_ERR(trans))
4382 return PTR_ERR(trans);
4331 btrfs_set_trans_block_group(trans, inode); 4383 btrfs_set_trans_block_group(trans, inode);
4332 if (nolock) 4384 if (nolock)
4333 ret = btrfs_end_transaction_nolock(trans, root); 4385 ret = btrfs_end_transaction_nolock(trans, root);
@@ -4353,6 +4405,7 @@ void btrfs_dirty_inode(struct inode *inode)
4353 return; 4405 return;
4354 4406
4355 trans = btrfs_join_transaction(root, 1); 4407 trans = btrfs_join_transaction(root, 1);
4408 BUG_ON(IS_ERR(trans));
4356 btrfs_set_trans_block_group(trans, inode); 4409 btrfs_set_trans_block_group(trans, inode);
4357 4410
4358 ret = btrfs_update_inode(trans, root, inode); 4411 ret = btrfs_update_inode(trans, root, inode);
@@ -4481,12 +4534,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4481 BUG_ON(!path); 4534 BUG_ON(!path);
4482 4535
4483 inode = new_inode(root->fs_info->sb); 4536 inode = new_inode(root->fs_info->sb);
4484 if (!inode) 4537 if (!inode) {
4538 btrfs_free_path(path);
4485 return ERR_PTR(-ENOMEM); 4539 return ERR_PTR(-ENOMEM);
4540 }
4486 4541
4487 if (dir) { 4542 if (dir) {
4543 trace_btrfs_inode_request(dir);
4544
4488 ret = btrfs_set_inode_index(dir, index); 4545 ret = btrfs_set_inode_index(dir, index);
4489 if (ret) { 4546 if (ret) {
4547 btrfs_free_path(path);
4490 iput(inode); 4548 iput(inode);
4491 return ERR_PTR(ret); 4549 return ERR_PTR(ret);
4492 } 4550 }
@@ -4553,12 +4611,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4553 if ((mode & S_IFREG)) { 4611 if ((mode & S_IFREG)) {
4554 if (btrfs_test_opt(root, NODATASUM)) 4612 if (btrfs_test_opt(root, NODATASUM))
4555 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4613 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4556 if (btrfs_test_opt(root, NODATACOW)) 4614 if (btrfs_test_opt(root, NODATACOW) ||
4615 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4557 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4616 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4558 } 4617 }
4559 4618
4560 insert_inode_hash(inode); 4619 insert_inode_hash(inode);
4561 inode_tree_add(inode); 4620 inode_tree_add(inode);
4621
4622 trace_btrfs_inode_new(inode);
4623
4562 return inode; 4624 return inode;
4563fail: 4625fail:
4564 if (dir) 4626 if (dir)
@@ -4673,7 +4735,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4673 if (IS_ERR(inode)) 4735 if (IS_ERR(inode))
4674 goto out_unlock; 4736 goto out_unlock;
4675 4737
4676 err = btrfs_init_inode_security(trans, inode, dir); 4738 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4677 if (err) { 4739 if (err) {
4678 drop_inode = 1; 4740 drop_inode = 1;
4679 goto out_unlock; 4741 goto out_unlock;
@@ -4734,7 +4796,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4734 if (IS_ERR(inode)) 4796 if (IS_ERR(inode))
4735 goto out_unlock; 4797 goto out_unlock;
4736 4798
4737 err = btrfs_init_inode_security(trans, inode, dir); 4799 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4738 if (err) { 4800 if (err) {
4739 drop_inode = 1; 4801 drop_inode = 1;
4740 goto out_unlock; 4802 goto out_unlock;
@@ -4775,30 +4837,31 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4775 int err; 4837 int err;
4776 int drop_inode = 0; 4838 int drop_inode = 0;
4777 4839
4778 if (inode->i_nlink == 0)
4779 return -ENOENT;
4780
4781 /* do not allow sys_link's with other subvols of the same device */ 4840 /* do not allow sys_link's with other subvols of the same device */
4782 if (root->objectid != BTRFS_I(inode)->root->objectid) 4841 if (root->objectid != BTRFS_I(inode)->root->objectid)
4783 return -EPERM; 4842 return -EXDEV;
4784 4843
4785 btrfs_inc_nlink(inode); 4844 if (inode->i_nlink == ~0U)
4786 inode->i_ctime = CURRENT_TIME; 4845 return -EMLINK;
4787 4846
4788 err = btrfs_set_inode_index(dir, &index); 4847 err = btrfs_set_inode_index(dir, &index);
4789 if (err) 4848 if (err)
4790 goto fail; 4849 goto fail;
4791 4850
4792 /* 4851 /*
4793 * 1 item for inode ref 4852 * 2 items for inode and inode ref
4794 * 2 items for dir items 4853 * 2 items for dir items
4854 * 1 item for parent inode
4795 */ 4855 */
4796 trans = btrfs_start_transaction(root, 3); 4856 trans = btrfs_start_transaction(root, 5);
4797 if (IS_ERR(trans)) { 4857 if (IS_ERR(trans)) {
4798 err = PTR_ERR(trans); 4858 err = PTR_ERR(trans);
4799 goto fail; 4859 goto fail;
4800 } 4860 }
4801 4861
4862 btrfs_inc_nlink(inode);
4863 inode->i_ctime = CURRENT_TIME;
4864
4802 btrfs_set_trans_block_group(trans, dir); 4865 btrfs_set_trans_block_group(trans, dir);
4803 ihold(inode); 4866 ihold(inode);
4804 4867
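
Two things change in btrfs_link(): the link count is saturated against ~0U (returning -EMLINK instead of wrapping), and the nlink bump now happens only after the transaction has started, so a failed start cannot leave the count inflated. The ordering, as a runnable toy:

```c
#include <errno.h>
#include <stdio.h>

static unsigned int nlink = 1;

static int start_transaction(void) { return 0; /* could fail with -ENOSPC */ }

static int do_link(void)
{
	int ret;

	if (nlink == ~0U)
		return -EMLINK;	/* counter would wrap */

	ret = start_transaction();
	if (ret)
		return ret;	/* nlink untouched on failure */

	nlink++;		/* only bumped once we're committed */
	return 0;
}

int main(void)
{
	printf("%d, nlink=%u\n", do_link(), nlink);	/* 0, nlink=2 */
	return 0;
}
```
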
@@ -4862,7 +4925,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4862 4925
4863 drop_on_err = 1; 4926 drop_on_err = 1;
4864 4927
4865 err = btrfs_init_inode_security(trans, inode, dir); 4928 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4866 if (err) 4929 if (err)
4867 goto out_fail; 4930 goto out_fail;
4868 4931
@@ -4928,8 +4991,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4928 size_t max_size; 4991 size_t max_size;
4929 unsigned long inline_size; 4992 unsigned long inline_size;
4930 unsigned long ptr; 4993 unsigned long ptr;
4994 int compress_type;
4931 4995
4932 WARN_ON(pg_offset != 0); 4996 WARN_ON(pg_offset != 0);
4997 compress_type = btrfs_file_extent_compression(leaf, item);
4933 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4998 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4934 inline_size = btrfs_file_extent_inline_item_len(leaf, 4999 inline_size = btrfs_file_extent_inline_item_len(leaf,
4935 btrfs_item_nr(leaf, path->slots[0])); 5000 btrfs_item_nr(leaf, path->slots[0]));
@@ -4939,8 +5004,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4939 read_extent_buffer(leaf, tmp, ptr, inline_size); 5004 read_extent_buffer(leaf, tmp, ptr, inline_size);
4940 5005
4941 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 5006 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4942 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 5007 ret = btrfs_decompress(compress_type, tmp, page,
4943 inline_size, max_size); 5008 extent_offset, inline_size, max_size);
4944 if (ret) { 5009 if (ret) {
4945 char *kaddr = kmap_atomic(page, KM_USER0); 5010 char *kaddr = kmap_atomic(page, KM_USER0);
4946 unsigned long copy_size = min_t(u64, 5011 unsigned long copy_size = min_t(u64,
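
btrfs_decompress() replaces the direct zlib call by dispatching on the compress_type stored in the file extent item, which is what lets LZO slot in without touching callers like this one. A sketch of that kind of dispatch (userspace stand-ins, not the kernel's compression ops table):

```c
#include <stdio.h>

enum { COMPRESS_NONE, COMPRESS_ZLIB, COMPRESS_LZO, COMPRESS_LAST };

static int zlib_decompress(const void *in, void *out) { (void)in; (void)out; return 0; }
static int lzo_decompress(const void *in, void *out)  { (void)in; (void)out; return 0; }

/* one slot per codec; adding a codec does not touch the callers */
static int (*decompress_op[COMPRESS_LAST])(const void *, void *) = {
	[COMPRESS_ZLIB] = zlib_decompress,
	[COMPRESS_LZO]  = lzo_decompress,
};

static int decompress(int type, const void *in, void *out)
{
	if (type <= COMPRESS_NONE || type >= COMPRESS_LAST || !decompress_op[type])
		return -1;
	return decompress_op[type](in, out);
}

int main(void)
{
	char out[8] = {0};

	printf("%d\n", decompress(COMPRESS_LZO, "x", out));	/* prints 0 */
	return 0;
}
```
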
@@ -4982,7 +5047,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4982 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5047 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4983 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5048 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4984 struct btrfs_trans_handle *trans = NULL; 5049 struct btrfs_trans_handle *trans = NULL;
4985 int compressed; 5050 int compress_type;
4986 5051
4987again: 5052again:
4988 read_lock(&em_tree->lock); 5053 read_lock(&em_tree->lock);
@@ -5041,7 +5106,7 @@ again:
5041 5106
5042 found_type = btrfs_file_extent_type(leaf, item); 5107 found_type = btrfs_file_extent_type(leaf, item);
5043 extent_start = found_key.offset; 5108 extent_start = found_key.offset;
5044 compressed = btrfs_file_extent_compression(leaf, item); 5109 compress_type = btrfs_file_extent_compression(leaf, item);
5045 if (found_type == BTRFS_FILE_EXTENT_REG || 5110 if (found_type == BTRFS_FILE_EXTENT_REG ||
5046 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5111 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5047 extent_end = extent_start + 5112 extent_end = extent_start +
@@ -5087,8 +5152,9 @@ again:
5087 em->block_start = EXTENT_MAP_HOLE; 5152 em->block_start = EXTENT_MAP_HOLE;
5088 goto insert; 5153 goto insert;
5089 } 5154 }
5090 if (compressed) { 5155 if (compress_type != BTRFS_COMPRESS_NONE) {
5091 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5156 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5157 em->compress_type = compress_type;
5092 em->block_start = bytenr; 5158 em->block_start = bytenr;
5093 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5159 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5094 item); 5160 item);
@@ -5122,12 +5188,14 @@ again:
5122 em->len = (copy_size + root->sectorsize - 1) & 5188 em->len = (copy_size + root->sectorsize - 1) &
5123 ~((u64)root->sectorsize - 1); 5189 ~((u64)root->sectorsize - 1);
5124 em->orig_start = EXTENT_MAP_INLINE; 5190 em->orig_start = EXTENT_MAP_INLINE;
5125 if (compressed) 5191 if (compress_type) {
5126 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5192 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5193 em->compress_type = compress_type;
5194 }
5127 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5195 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5128 if (create == 0 && !PageUptodate(page)) { 5196 if (create == 0 && !PageUptodate(page)) {
5129 if (btrfs_file_extent_compression(leaf, item) == 5197 if (btrfs_file_extent_compression(leaf, item) !=
5130 BTRFS_COMPRESS_ZLIB) { 5198 BTRFS_COMPRESS_NONE) {
5131 ret = uncompress_inline(path, inode, page, 5199 ret = uncompress_inline(path, inode, page,
5132 pg_offset, 5200 pg_offset,
5133 extent_offset, item); 5201 extent_offset, item);
@@ -5152,6 +5220,8 @@ again:
5152 em = NULL; 5220 em = NULL;
5153 btrfs_release_path(root, path); 5221 btrfs_release_path(root, path);
5154 trans = btrfs_join_transaction(root, 1); 5222 trans = btrfs_join_transaction(root, 1);
5223 if (IS_ERR(trans))
5224 return ERR_CAST(trans);
5155 goto again; 5225 goto again;
5156 } 5226 }
5157 map = kmap(page); 5227 map = kmap(page);
@@ -5161,7 +5231,7 @@ again:
5161 btrfs_mark_buffer_dirty(leaf); 5231 btrfs_mark_buffer_dirty(leaf);
5162 } 5232 }
5163 set_extent_uptodate(io_tree, em->start, 5233 set_extent_uptodate(io_tree, em->start,
5164 extent_map_end(em) - 1, GFP_NOFS); 5234 extent_map_end(em) - 1, NULL, GFP_NOFS);
5165 goto insert; 5235 goto insert;
5166 } else { 5236 } else {
5167 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5237 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5228,6 +5298,9 @@ insert:
5228 } 5298 }
5229 write_unlock(&em_tree->lock); 5299 write_unlock(&em_tree->lock);
5230out: 5300out:
5301
5302 trace_btrfs_get_extent(root, em);
5303
5231 if (path) 5304 if (path)
5232 btrfs_free_path(path); 5305 btrfs_free_path(path);
5233 if (trans) { 5306 if (trans) {
@@ -5242,22 +5315,157 @@ out:
5242 return em; 5315 return em;
5243} 5316}
5244 5317
5318struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5319 size_t pg_offset, u64 start, u64 len,
5320 int create)
5321{
5322 struct extent_map *em;
5323 struct extent_map *hole_em = NULL;
5324 u64 range_start = start;
5325 u64 end;
5326 u64 found;
5327 u64 found_end;
5328 int err = 0;
5329
5330 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5331 if (IS_ERR(em))
5332 return em;
5333 if (em) {
5334 /*
5335 * if our em maps to a hole, there might
5336 * actually be delalloc bytes behind it
5337 */
5338 if (em->block_start != EXTENT_MAP_HOLE)
5339 return em;
5340 else
5341 hole_em = em;
5342 }
5343
5344 /* check to see if we've wrapped (len == -1 or similar) */
5345 end = start + len;
5346 if (end < start)
5347 end = (u64)-1;
5348 else
5349 end -= 1;
5350
5351 em = NULL;
5352
5353 /* ok, we didn't find anything, let's look for delalloc */
5354 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5355 end, len, EXTENT_DELALLOC, 1);
5356 found_end = range_start + found;
5357 if (found_end < range_start)
5358 found_end = (u64)-1;
5359
5360 /*
5361 * we didn't find anything useful, return
5362 * the original results from get_extent()
5363 */
5364 if (range_start > end || found_end <= start) {
5365 em = hole_em;
5366 hole_em = NULL;
5367 goto out;
5368 }
5369
5370 /* adjust the range_start to make sure it doesn't
5371 * go backwards from the start they passed in
5372 */
5373 range_start = max(start, range_start);
5374 found = found_end - range_start;
5375
5376 if (found > 0) {
5377 u64 hole_start = start;
5378 u64 hole_len = len;
5379
5380 em = alloc_extent_map(GFP_NOFS);
5381 if (!em) {
5382 err = -ENOMEM;
5383 goto out;
5384 }
5385 /*
5386 * when btrfs_get_extent can't find anything it
5387 * returns one huge hole
5388 *
5389 * make sure what it found really fits our range, and
5390 * adjust to make sure it is based on the start from
5391 * the caller
5392 */
5393 if (hole_em) {
5394 u64 calc_end = extent_map_end(hole_em);
5395
5396 if (calc_end <= start || (hole_em->start > end)) {
5397 free_extent_map(hole_em);
5398 hole_em = NULL;
5399 } else {
5400 hole_start = max(hole_em->start, start);
5401 hole_len = calc_end - hole_start;
5402 }
5403 }
5404 em->bdev = NULL;
5405 if (hole_em && range_start > hole_start) {
5406 /* our hole starts before our delalloc, so we
5407 * have to return just the parts of the hole
5408 * that go until the delalloc starts
5409 */
5410 em->len = min(hole_len,
5411 range_start - hole_start);
5412 em->start = hole_start;
5413 em->orig_start = hole_start;
5414 /*
5415 * don't adjust block start at all,
5416 * it is fixed at EXTENT_MAP_HOLE
5417 */
5418 em->block_start = hole_em->block_start;
5419 em->block_len = hole_len;
5420 } else {
5421 em->start = range_start;
5422 em->len = found;
5423 em->orig_start = range_start;
5424 em->block_start = EXTENT_MAP_DELALLOC;
5425 em->block_len = found;
5426 }
5427 } else if (hole_em) {
5428 return hole_em;
5429 }
5430out:
5431
5432 free_extent_map(hole_em);
5433 if (err) {
5434 free_extent_map(em);
5435 return ERR_PTR(err);
5436 }
5437 return em;
5438}
5439
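
The clamping in btrfs_get_extent_fiemap() guards two edges: the delalloc search can report a range that began before the requested start, and range_start + found can wrap a u64, so the end is saturated to (u64)-1. A standalone check of that arithmetic:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t start = 8192;
	uint64_t range_start = 4096;	/* delalloc began before our start */
	uint64_t found = 16384;

	uint64_t found_end = range_start + found;
	if (found_end < range_start)	/* wrapped */
		found_end = (uint64_t)-1;

	/* max(start, range_start): never report bytes before the caller's start */
	range_start = range_start > start ? range_start : start;
	found = found_end - range_start;

	assert(range_start == 8192 && found == 12288);
	return 0;
}
```
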
5245static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5440static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5441 struct extent_map *em,
5246 u64 start, u64 len) 5442 u64 start, u64 len)
5247{ 5443{
5248 struct btrfs_root *root = BTRFS_I(inode)->root; 5444 struct btrfs_root *root = BTRFS_I(inode)->root;
5249 struct btrfs_trans_handle *trans; 5445 struct btrfs_trans_handle *trans;
5250 struct extent_map *em;
5251 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5446 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5252 struct btrfs_key ins; 5447 struct btrfs_key ins;
5253 u64 alloc_hint; 5448 u64 alloc_hint;
5254 int ret; 5449 int ret;
5450 bool insert = false;
5255 5451
5256 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5452 /*
5453 * Ok if the extent map we looked up is a hole and is for the exact
5454 * range we want, there is no reason to allocate a new one, however if
5455 * it is not right then we need to free this one and drop the cache for
5456 * our range.
5457 */
5458 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5459 em->len != len) {
5460 free_extent_map(em);
5461 em = NULL;
5462 insert = true;
5463 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5464 }
5257 5465
5258 trans = btrfs_join_transaction(root, 0); 5466 trans = btrfs_join_transaction(root, 0);
5259 if (!trans) 5467 if (IS_ERR(trans))
5260 return ERR_PTR(-ENOMEM); 5468 return ERR_CAST(trans);
5261 5469
5262 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5470 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5263 5471
@@ -5269,10 +5477,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5269 goto out; 5477 goto out;
5270 } 5478 }
5271 5479
5272 em = alloc_extent_map(GFP_NOFS);
5273 if (!em) { 5480 if (!em) {
5274 em = ERR_PTR(-ENOMEM); 5481 em = alloc_extent_map(GFP_NOFS);
5275 goto out; 5482 if (!em) {
5483 em = ERR_PTR(-ENOMEM);
5484 goto out;
5485 }
5276 } 5486 }
5277 5487
5278 em->start = start; 5488 em->start = start;
@@ -5282,9 +5492,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5282 em->block_start = ins.objectid; 5492 em->block_start = ins.objectid;
5283 em->block_len = ins.offset; 5493 em->block_len = ins.offset;
5284 em->bdev = root->fs_info->fs_devices->latest_bdev; 5494 em->bdev = root->fs_info->fs_devices->latest_bdev;
5495
5496 /*
5497 * We need to do this because if we're using the original em we searched
5498 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5499 */
5500 em->flags = 0;
5285 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5501 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5286 5502
5287 while (1) { 5503 while (insert) {
5288 write_lock(&em_tree->lock); 5504 write_lock(&em_tree->lock);
5289 ret = add_extent_mapping(em_tree, em); 5505 ret = add_extent_mapping(em_tree, em);
5290 write_unlock(&em_tree->lock); 5506 write_unlock(&em_tree->lock);
@@ -5481,7 +5697,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5481 * while we look for nocow cross refs 5697 * while we look for nocow cross refs
5482 */ 5698 */
5483 trans = btrfs_join_transaction(root, 0); 5699 trans = btrfs_join_transaction(root, 0);
5484 if (!trans) 5700 if (IS_ERR(trans))
5485 goto must_cow; 5701 goto must_cow;
5486 5702
5487 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5703 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5502,8 +5718,7 @@ must_cow:
5502 * it above 5718 * it above
5503 */ 5719 */
5504 len = bh_result->b_size; 5720 len = bh_result->b_size;
5505 free_extent_map(em); 5721 em = btrfs_new_extent_direct(inode, em, start, len);
5506 em = btrfs_new_extent_direct(inode, start, len);
5507 if (IS_ERR(em)) 5722 if (IS_ERR(em))
5508 return PTR_ERR(em); 5723 return PTR_ERR(em);
5509 len = min(len, em->len - (start - em->start)); 5724 len = min(len, em->len - (start - em->start));
@@ -5589,6 +5804,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5589 5804
5590 kfree(dip->csums); 5805 kfree(dip->csums);
5591 kfree(dip); 5806 kfree(dip);
5807
5808 /* If we had a csum failure make sure to clear the uptodate flag */
5809 if (err)
5810 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5592 dio_end_io(bio, err); 5811 dio_end_io(bio, err);
5593} 5812}
5594 5813
@@ -5616,7 +5835,7 @@ again:
5616 BUG_ON(!ordered); 5835 BUG_ON(!ordered);
5617 5836
5618 trans = btrfs_join_transaction(root, 1); 5837 trans = btrfs_join_transaction(root, 1);
5619 if (!trans) { 5838 if (IS_ERR(trans)) {
5620 err = -ENOMEM; 5839 err = -ENOMEM;
5621 goto out; 5840 goto out;
5622 } 5841 }
@@ -5662,8 +5881,10 @@ again:
5662 } 5881 }
5663 5882
5664 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5883 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5665 btrfs_ordered_update_i_size(inode, 0, ordered); 5884 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5666 btrfs_update_inode(trans, root, inode); 5885 if (!ret)
5886 btrfs_update_inode(trans, root, inode);
5887 ret = 0;
5667out_unlock: 5888out_unlock:
5668 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5889 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5669 ordered->file_offset + ordered->len - 1, 5890 ordered->file_offset + ordered->len - 1,
@@ -5690,6 +5911,10 @@ out_done:
5690 5911
5691 kfree(dip->csums); 5912 kfree(dip->csums);
5692 kfree(dip); 5913 kfree(dip);
5914
5915 /* If we had an error make sure to clear the uptodate flag */
5916 if (err)
5917 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5693 dio_end_io(bio, err); 5918 dio_end_io(bio, err);
5694} 5919}
5695 5920
@@ -5745,7 +5970,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5745 5970
5746static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5971static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5747 int rw, u64 file_offset, int skip_sum, 5972 int rw, u64 file_offset, int skip_sum,
5748 u32 *csums) 5973 u32 *csums, int async_submit)
5749{ 5974{
5750 int write = rw & REQ_WRITE; 5975 int write = rw & REQ_WRITE;
5751 struct btrfs_root *root = BTRFS_I(inode)->root; 5976 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5756,18 +5981,33 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5756 if (ret) 5981 if (ret)
5757 goto err; 5982 goto err;
5758 5983
5759 if (write && !skip_sum) { 5984 if (skip_sum)
5985 goto map;
5986
5987 if (write && async_submit) {
5760 ret = btrfs_wq_submit_bio(root->fs_info, 5988 ret = btrfs_wq_submit_bio(root->fs_info,
5761 inode, rw, bio, 0, 0, 5989 inode, rw, bio, 0, 0,
5762 file_offset, 5990 file_offset,
5763 __btrfs_submit_bio_start_direct_io, 5991 __btrfs_submit_bio_start_direct_io,
5764 __btrfs_submit_bio_done); 5992 __btrfs_submit_bio_done);
5765 goto err; 5993 goto err;
5766 } else if (!skip_sum) 5994 } else if (write) {
5767 btrfs_lookup_bio_sums_dio(root, inode, bio, 5995 /*
5996 * If we aren't doing async submit, calculate the csum of the
5997 * bio now.
5998 */
5999 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6000 if (ret)
6001 goto err;
6002 } else if (!skip_sum) {
6003 ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5768 file_offset, csums); 6004 file_offset, csums);
6005 if (ret)
6006 goto err;
6007 }
5769 6008
5770 ret = btrfs_map_bio(root, rw, bio, 0, 1); 6009map:
6010 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5771err: 6011err:
5772 bio_put(bio); 6012 bio_put(bio);
5773 return ret; 6013 return ret;
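
The rewritten __btrfs_submit_dio_bio() is effectively a four-way decision: skip_sum maps immediately, async writes push checksumming to the workqueue, sync writes checksum inline, and reads look their csums up. The ladder, reduced to a table you can poke at (illustrative only):

```c
#include <stdio.h>

static const char *choose(int write, int skip_sum, int async_submit)
{
	if (skip_sum)
		return "map";
	if (write && async_submit)
		return "workqueue csum, then map";
	if (write)
		return "inline csum, then map";
	return "lookup csums, then map";
}

int main(void)
{
	printf("%s\n", choose(1, 0, 0));	/* inline csum, then map */
	printf("%s\n", choose(0, 0, 0));	/* lookup csums, then map */
	return 0;
}
```
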
@@ -5789,13 +6029,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5789 int nr_pages = 0; 6029 int nr_pages = 0;
5790 u32 *csums = dip->csums; 6030 u32 *csums = dip->csums;
5791 int ret = 0; 6031 int ret = 0;
5792 6032 int async_submit = 0;
5793 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6033 int write = rw & REQ_WRITE;
5794 if (!bio)
5795 return -ENOMEM;
5796 bio->bi_private = dip;
5797 bio->bi_end_io = btrfs_end_dio_bio;
5798 atomic_inc(&dip->pending_bios);
5799 6034
5800 map_length = orig_bio->bi_size; 6035 map_length = orig_bio->bi_size;
5801 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6036 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -5805,6 +6040,19 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5805 return -EIO; 6040 return -EIO;
5806 } 6041 }
5807 6042
6043 if (map_length >= orig_bio->bi_size) {
6044 bio = orig_bio;
6045 goto submit;
6046 }
6047
6048 async_submit = 1;
6049 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6050 if (!bio)
6051 return -ENOMEM;
6052 bio->bi_private = dip;
6053 bio->bi_end_io = btrfs_end_dio_bio;
6054 atomic_inc(&dip->pending_bios);
6055
5808 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6056 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5809 if (unlikely(map_length < submit_len + bvec->bv_len || 6057 if (unlikely(map_length < submit_len + bvec->bv_len ||
5810 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6058 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -5818,14 +6066,15 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5818 atomic_inc(&dip->pending_bios); 6066 atomic_inc(&dip->pending_bios);
5819 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6067 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5820 file_offset, skip_sum, 6068 file_offset, skip_sum,
5821 csums); 6069 csums, async_submit);
5822 if (ret) { 6070 if (ret) {
5823 bio_put(bio); 6071 bio_put(bio);
5824 atomic_dec(&dip->pending_bios); 6072 atomic_dec(&dip->pending_bios);
5825 goto out_err; 6073 goto out_err;
5826 } 6074 }
5827 6075
 5828 if (!skip_sum) 6076 /* Writes use the ordered csums */
6077 if (!write && !skip_sum)
5829 csums = csums + nr_pages; 6078 csums = csums + nr_pages;
5830 start_sector += submit_len >> 9; 6079 start_sector += submit_len >> 9;
5831 file_offset += submit_len; 6080 file_offset += submit_len;
@@ -5854,8 +6103,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5854 } 6103 }
5855 } 6104 }
5856 6105
6106submit:
5857 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6107 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5858 csums); 6108 csums, async_submit);
5859 if (!ret) 6109 if (!ret)
5860 return 0; 6110 return 0;
5861 6111
@@ -5893,9 +6143,11 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5893 } 6143 }
5894 dip->csums = NULL; 6144 dip->csums = NULL;
5895 6145
 5896 if (!skip_sum) { 6146 /* Writes use the ordered csum stuff, so we don't need dip->csums */
6147 if (!write && !skip_sum) {
5897 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6148 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5898 if (!dip->csums) { 6149 if (!dip->csums) {
6150 kfree(dip);
5899 ret = -ENOMEM; 6151 ret = -ENOMEM;
5900 goto free_ordered; 6152 goto free_ordered;
5901 } 6153 }
@@ -5948,6 +6200,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5948 unsigned long nr_segs) 6200 unsigned long nr_segs)
5949{ 6201{
5950 int seg; 6202 int seg;
6203 int i;
5951 size_t size; 6204 size_t size;
5952 unsigned long addr; 6205 unsigned long addr;
5953 unsigned blocksize_mask = root->sectorsize - 1; 6206 unsigned blocksize_mask = root->sectorsize - 1;
@@ -5962,8 +6215,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5962 addr = (unsigned long)iov[seg].iov_base; 6215 addr = (unsigned long)iov[seg].iov_base;
5963 size = iov[seg].iov_len; 6216 size = iov[seg].iov_len;
5964 end += size; 6217 end += size;
5965 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6218 if ((addr & blocksize_mask) || (size & blocksize_mask))
5966 goto out; 6219 goto out;
6220
6221 /* If this is a write we don't need to check anymore */
6222 if (rw & WRITE)
6223 continue;
6224
6225 /*
6226 * Check to make sure we don't have duplicate iov_base's in this
 6227 * iovec; if so, return -EINVAL, otherwise we'll get csum errors
6228 * when reading back.
6229 */
6230 for (i = seg + 1; i < nr_segs; i++) {
6231 if (iov[seg].iov_base == iov[i].iov_base)
6232 goto out;
6233 }
5967 } 6234 }
5968 retval = 0; 6235 retval = 0;
5969out: 6236out:
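
The duplicate check added above is a plain O(n^2) scan over the iovec; for direct reads, two segments sharing an iov_base would have their checksum verification stomp on each other. Extracted as a standalone helper:

```c
#include <stdio.h>
#include <sys/uio.h>

static int has_dup_base(const struct iovec *iov, unsigned long nr_segs)
{
	unsigned long seg, i;

	for (seg = 0; seg < nr_segs; seg++)
		for (i = seg + 1; i < nr_segs; i++)
			if (iov[seg].iov_base == iov[i].iov_base)
				return 1;
	return 0;
}

int main(void)
{
	char buf[8];
	struct iovec iov[2] = {
		{ .iov_base = buf, .iov_len = 4 },
		{ .iov_base = buf, .iov_len = 4 },	/* duplicate base */
	};

	printf("%d\n", has_dup_base(iov, 2));	/* prints 1 */
	return 0;
}
```
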
@@ -6064,7 +6331,7 @@ out:
6064static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6331static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6065 __u64 start, __u64 len) 6332 __u64 start, __u64 len)
6066{ 6333{
6067 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 6334 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6068} 6335}
6069 6336
6070int btrfs_readpage(struct file *file, struct page *page) 6337int btrfs_readpage(struct file *file, struct page *page)
@@ -6314,28 +6581,42 @@ out:
6314 return ret; 6581 return ret;
6315} 6582}
6316 6583
6317static void btrfs_truncate(struct inode *inode) 6584static int btrfs_truncate(struct inode *inode)
6318{ 6585{
6319 struct btrfs_root *root = BTRFS_I(inode)->root; 6586 struct btrfs_root *root = BTRFS_I(inode)->root;
6320 int ret; 6587 int ret;
6588 int err = 0;
6321 struct btrfs_trans_handle *trans; 6589 struct btrfs_trans_handle *trans;
6322 unsigned long nr; 6590 unsigned long nr;
6323 u64 mask = root->sectorsize - 1; 6591 u64 mask = root->sectorsize - 1;
6324 6592
6325 if (!S_ISREG(inode->i_mode)) {
6326 WARN_ON(1);
6327 return;
6328 }
6329
6330 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6593 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6331 if (ret) 6594 if (ret)
6332 return; 6595 return ret;
6333 6596
6334 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6597 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6335 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6598 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6336 6599
6600 trans = btrfs_start_transaction(root, 5);
6601 if (IS_ERR(trans))
6602 return PTR_ERR(trans);
6603
6604 btrfs_set_trans_block_group(trans, inode);
6605
6606 ret = btrfs_orphan_add(trans, inode);
6607 if (ret) {
6608 btrfs_end_transaction(trans, root);
6609 return ret;
6610 }
6611
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /* Now start a transaction for the truncate */
6337 trans = btrfs_start_transaction(root, 0); 6617 trans = btrfs_start_transaction(root, 0);
6338 BUG_ON(IS_ERR(trans)); 6618 if (IS_ERR(trans))
6619 return PTR_ERR(trans);
6339 btrfs_set_trans_block_group(trans, inode); 6620 btrfs_set_trans_block_group(trans, inode);
6340 trans->block_rsv = root->orphan_block_rsv; 6621 trans->block_rsv = root->orphan_block_rsv;
6341 6622
@@ -6362,29 +6643,38 @@ static void btrfs_truncate(struct inode *inode)
6362 while (1) { 6643 while (1) {
6363 if (!trans) { 6644 if (!trans) {
6364 trans = btrfs_start_transaction(root, 0); 6645 trans = btrfs_start_transaction(root, 0);
6365 BUG_ON(IS_ERR(trans)); 6646 if (IS_ERR(trans))
6647 return PTR_ERR(trans);
6366 btrfs_set_trans_block_group(trans, inode); 6648 btrfs_set_trans_block_group(trans, inode);
6367 trans->block_rsv = root->orphan_block_rsv; 6649 trans->block_rsv = root->orphan_block_rsv;
6368 } 6650 }
6369 6651
6370 ret = btrfs_block_rsv_check(trans, root, 6652 ret = btrfs_block_rsv_check(trans, root,
6371 root->orphan_block_rsv, 0, 5); 6653 root->orphan_block_rsv, 0, 5);
6372 if (ret) { 6654 if (ret == -EAGAIN) {
6373 BUG_ON(ret != -EAGAIN);
6374 ret = btrfs_commit_transaction(trans, root); 6655 ret = btrfs_commit_transaction(trans, root);
6375 BUG_ON(ret); 6656 if (ret)
6657 return ret;
6376 trans = NULL; 6658 trans = NULL;
6377 continue; 6659 continue;
6660 } else if (ret) {
6661 err = ret;
6662 break;
6378 } 6663 }
6379 6664
6380 ret = btrfs_truncate_inode_items(trans, root, inode, 6665 ret = btrfs_truncate_inode_items(trans, root, inode,
6381 inode->i_size, 6666 inode->i_size,
6382 BTRFS_EXTENT_DATA_KEY); 6667 BTRFS_EXTENT_DATA_KEY);
6383 if (ret != -EAGAIN) 6668 if (ret != -EAGAIN) {
6669 err = ret;
6384 break; 6670 break;
6671 }
6385 6672
6386 ret = btrfs_update_inode(trans, root, inode); 6673 ret = btrfs_update_inode(trans, root, inode);
6387 BUG_ON(ret); 6674 if (ret) {
6675 err = ret;
6676 break;
6677 }
6388 6678
6389 nr = trans->blocks_used; 6679 nr = trans->blocks_used;
6390 btrfs_end_transaction(trans, root); 6680 btrfs_end_transaction(trans, root);
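
The truncate loop now treats -EAGAIN from the reservation check as "commit and retry" and any other error as fatal, instead of BUG_ON(). The control flow, boiled down to a runnable skeleton (error values and helpers are stand-ins):

```c
#include <errno.h>
#include <stdio.h>

static int attempts;

static int reserve_space(void) { return attempts++ < 2 ? -EAGAIN : 0; }
static int truncate_some_items(void) { return 0; /* 0: done, -EAGAIN: more */ }

int main(void)
{
	int ret, err = 0;

	while (1) {
		ret = reserve_space();
		if (ret == -EAGAIN)
			continue;	/* commit the transaction and retry */
		else if (ret) {
			err = ret;	/* hard failure, stop */
			break;
		}

		ret = truncate_some_items();
		if (ret != -EAGAIN) {
			err = ret;	/* 0 or a hard error */
			break;
		}
		/* end transaction, balance dirty pages, loop again */
	}
	printf("err=%d after %d reservation attempts\n", err, attempts);
	return 0;
}
```
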
@@ -6394,16 +6684,27 @@ static void btrfs_truncate(struct inode *inode)
6394 6684
6395 if (ret == 0 && inode->i_nlink > 0) { 6685 if (ret == 0 && inode->i_nlink > 0) {
6396 ret = btrfs_orphan_del(trans, inode); 6686 ret = btrfs_orphan_del(trans, inode);
6397 BUG_ON(ret); 6687 if (ret)
6688 err = ret;
6689 } else if (ret && inode->i_nlink > 0) {
6690 /*
6691 * Failed to do the truncate, remove us from the in memory
6692 * orphan list.
6693 */
6694 ret = btrfs_orphan_del(NULL, inode);
6398 } 6695 }
6399 6696
6400 ret = btrfs_update_inode(trans, root, inode); 6697 ret = btrfs_update_inode(trans, root, inode);
6401 BUG_ON(ret); 6698 if (ret && !err)
6699 err = ret;
6402 6700
6403 nr = trans->blocks_used; 6701 nr = trans->blocks_used;
6404 ret = btrfs_end_transaction_throttle(trans, root); 6702 ret = btrfs_end_transaction_throttle(trans, root);
6405 BUG_ON(ret); 6703 if (ret && !err)
6704 err = ret;
6406 btrfs_btree_balance_dirty(root, nr); 6705 btrfs_btree_balance_dirty(root, nr);
6706
6707 return err;
6407} 6708}
6408 6709
6409/* 6710/*
@@ -6470,14 +6771,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6470 ei->index_cnt = (u64)-1; 6771 ei->index_cnt = (u64)-1;
6471 ei->last_unlink_trans = 0; 6772 ei->last_unlink_trans = 0;
6472 6773
6473 spin_lock_init(&ei->accounting_lock);
6474 atomic_set(&ei->outstanding_extents, 0); 6774 atomic_set(&ei->outstanding_extents, 0);
6475 ei->reserved_extents = 0; 6775 atomic_set(&ei->reserved_extents, 0);
6476 6776
6477 ei->ordered_data_close = 0; 6777 ei->ordered_data_close = 0;
6478 ei->orphan_meta_reserved = 0; 6778 ei->orphan_meta_reserved = 0;
6479 ei->dummy_inode = 0; 6779 ei->dummy_inode = 0;
6480 ei->force_compress = 0; 6780 ei->force_compress = BTRFS_COMPRESS_NONE;
6481 6781
6482 inode = &ei->vfs_inode; 6782 inode = &ei->vfs_inode;
6483 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6783 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -6508,7 +6808,7 @@ void btrfs_destroy_inode(struct inode *inode)
6508 WARN_ON(!list_empty(&inode->i_dentry)); 6808 WARN_ON(!list_empty(&inode->i_dentry));
6509 WARN_ON(inode->i_data.nrpages); 6809 WARN_ON(inode->i_data.nrpages);
6510 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6810 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6511 WARN_ON(BTRFS_I(inode)->reserved_extents); 6811 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
6512 6812
6513 /* 6813 /*
6514 * This can happen where we create an inode, but somebody else also 6814 * This can happen where we create an inode, but somebody else also
@@ -6600,6 +6900,8 @@ void btrfs_destroy_cachep(void)
6600 kmem_cache_destroy(btrfs_transaction_cachep); 6900 kmem_cache_destroy(btrfs_transaction_cachep);
6601 if (btrfs_path_cachep) 6901 if (btrfs_path_cachep)
6602 kmem_cache_destroy(btrfs_path_cachep); 6902 kmem_cache_destroy(btrfs_path_cachep);
6903 if (btrfs_free_space_cachep)
6904 kmem_cache_destroy(btrfs_free_space_cachep);
6603} 6905}
6604 6906
6605int btrfs_init_cachep(void) 6907int btrfs_init_cachep(void)
@@ -6628,6 +6930,12 @@ int btrfs_init_cachep(void)
6628 if (!btrfs_path_cachep) 6930 if (!btrfs_path_cachep)
6629 goto fail; 6931 goto fail;
6630 6932
6933 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
6934 sizeof(struct btrfs_free_space), 0,
6935 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6936 if (!btrfs_free_space_cachep)
6937 goto fail;
6938
6631 return 0; 6939 return 0;
6632fail: 6940fail:
6633 btrfs_destroy_cachep(); 6941 btrfs_destroy_cachep();
@@ -6646,6 +6954,26 @@ static int btrfs_getattr(struct vfsmount *mnt,
6646 return 0; 6954 return 0;
6647} 6955}
6648 6956
6957/*
 6958 * If a file is moved, it will inherit the COW and compression flags of the new
6959 * directory.
6960 */
6961static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6962{
6963 struct btrfs_inode *b_dir = BTRFS_I(dir);
6964 struct btrfs_inode *b_inode = BTRFS_I(inode);
6965
6966 if (b_dir->flags & BTRFS_INODE_NODATACOW)
6967 b_inode->flags |= BTRFS_INODE_NODATACOW;
6968 else
6969 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6970
6971 if (b_dir->flags & BTRFS_INODE_COMPRESS)
6972 b_inode->flags |= BTRFS_INODE_COMPRESS;
6973 else
6974 b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6975}
6976
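
fixup_inode_flags() copies each flag with an explicit if/else pair; the same effect for any set of inherited bits is a two-mask expression. A sketch (flag values here are made up, not btrfs's):

```c
#include <assert.h>

#define FL_NODATACOW 0x1u
#define FL_COMPRESS  0x2u
#define FL_INHERIT   (FL_NODATACOW | FL_COMPRESS)

/* keep the inode's other bits, take the inherited bits from the dir */
static unsigned int fixup_flags(unsigned int dir_flags, unsigned int inode_flags)
{
	return (inode_flags & ~FL_INHERIT) | (dir_flags & FL_INHERIT);
}

int main(void)
{
	/* dir compresses, inode had nodatacow: inode ends up compress-only */
	assert(fixup_flags(FL_COMPRESS, FL_NODATACOW) == FL_COMPRESS);
	return 0;
}
```
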
6649static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 6977static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6650 struct inode *new_dir, struct dentry *new_dentry) 6978 struct inode *new_dir, struct dentry *new_dentry)
6651{ 6979{
@@ -6694,8 +7022,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6694 * should cover the worst case number of items we'll modify. 7022 * should cover the worst case number of items we'll modify.
6695 */ 7023 */
6696 trans = btrfs_start_transaction(root, 20); 7024 trans = btrfs_start_transaction(root, 20);
6697 if (IS_ERR(trans)) 7025 if (IS_ERR(trans)) {
6698 return PTR_ERR(trans); 7026 ret = PTR_ERR(trans);
7027 goto out_notrans;
7028 }
6699 7029
6700 btrfs_set_trans_block_group(trans, new_dir); 7030 btrfs_set_trans_block_group(trans, new_dir);
6701 7031
@@ -6748,11 +7078,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6748 old_dentry->d_name.name, 7078 old_dentry->d_name.name,
6749 old_dentry->d_name.len); 7079 old_dentry->d_name.len);
6750 } else { 7080 } else {
6751 btrfs_inc_nlink(old_dentry->d_inode); 7081 ret = __btrfs_unlink_inode(trans, root, old_dir,
6752 ret = btrfs_unlink_inode(trans, root, old_dir, 7082 old_dentry->d_inode,
6753 old_dentry->d_inode, 7083 old_dentry->d_name.name,
6754 old_dentry->d_name.name, 7084 old_dentry->d_name.len);
6755 old_dentry->d_name.len); 7085 if (!ret)
7086 ret = btrfs_update_inode(trans, root, old_inode);
6756 } 7087 }
6757 BUG_ON(ret); 7088 BUG_ON(ret);
6758 7089
@@ -6779,6 +7110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6779 } 7110 }
6780 } 7111 }
6781 7112
7113 fixup_inode_flags(new_dir, old_inode);
7114
6782 ret = btrfs_add_link(trans, new_dir, old_inode, 7115 ret = btrfs_add_link(trans, new_dir, old_inode,
6783 new_dentry->d_name.name, 7116 new_dentry->d_name.name,
6784 new_dentry->d_name.len, 0, index); 7117 new_dentry->d_name.len, 0, index);
@@ -6792,7 +7125,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6792 } 7125 }
6793out_fail: 7126out_fail:
6794 btrfs_end_transaction_throttle(trans, root); 7127 btrfs_end_transaction_throttle(trans, root);
6795 7128out_notrans:
6796 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 7129 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
6797 up_read(&root->fs_info->subvol_sem); 7130 up_read(&root->fs_info->subvol_sem);
6798 7131
@@ -6944,7 +7277,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6944 if (IS_ERR(inode)) 7277 if (IS_ERR(inode))
6945 goto out_unlock; 7278 goto out_unlock;
6946 7279
6947 err = btrfs_init_inode_security(trans, inode, dir); 7280 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6948 if (err) { 7281 if (err) {
6949 drop_inode = 1; 7282 drop_inode = 1;
6950 goto out_unlock; 7283 goto out_unlock;
@@ -7098,116 +7431,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
7098 min_size, actual_len, alloc_hint, trans); 7431 min_size, actual_len, alloc_hint, trans);
7099} 7432}
7100 7433
7101static long btrfs_fallocate(struct inode *inode, int mode,
7102 loff_t offset, loff_t len)
7103{
7104 struct extent_state *cached_state = NULL;
7105 u64 cur_offset;
7106 u64 last_byte;
7107 u64 alloc_start;
7108 u64 alloc_end;
7109 u64 alloc_hint = 0;
7110 u64 locked_end;
7111 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
7112 struct extent_map *em;
7113 int ret;
7114
7115 alloc_start = offset & ~mask;
7116 alloc_end = (offset + len + mask) & ~mask;
7117
7118 /* We only support the FALLOC_FL_KEEP_SIZE mode */
7119 if (mode && (mode != FALLOC_FL_KEEP_SIZE))
7120 return -EOPNOTSUPP;
7121
7122 /*
7123 * wait for ordered IO before we have any locks. We'll loop again
7124 * below with the locks held.
7125 */
7126 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
7127
7128 mutex_lock(&inode->i_mutex);
7129 ret = inode_newsize_ok(inode, alloc_end);
7130 if (ret)
7131 goto out;
7132
7133 if (alloc_start > inode->i_size) {
7134 ret = btrfs_cont_expand(inode, alloc_start);
7135 if (ret)
7136 goto out;
7137 }
7138
7139 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
7140 if (ret)
7141 goto out;
7142
7143 locked_end = alloc_end - 1;
7144 while (1) {
7145 struct btrfs_ordered_extent *ordered;
7146
7147 /* the extent lock is ordered inside the running
7148 * transaction
7149 */
7150 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
7151 locked_end, 0, &cached_state, GFP_NOFS);
7152 ordered = btrfs_lookup_first_ordered_extent(inode,
7153 alloc_end - 1);
7154 if (ordered &&
7155 ordered->file_offset + ordered->len > alloc_start &&
7156 ordered->file_offset < alloc_end) {
7157 btrfs_put_ordered_extent(ordered);
7158 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7159 alloc_start, locked_end,
7160 &cached_state, GFP_NOFS);
7161 /*
7162 * we can't wait on the range with the transaction
7163 * running or with the extent lock held
7164 */
7165 btrfs_wait_ordered_range(inode, alloc_start,
7166 alloc_end - alloc_start);
7167 } else {
7168 if (ordered)
7169 btrfs_put_ordered_extent(ordered);
7170 break;
7171 }
7172 }
7173
7174 cur_offset = alloc_start;
7175 while (1) {
7176 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
7177 alloc_end - cur_offset, 0);
7178 BUG_ON(IS_ERR(em) || !em);
7179 last_byte = min(extent_map_end(em), alloc_end);
7180 last_byte = (last_byte + mask) & ~mask;
7181 if (em->block_start == EXTENT_MAP_HOLE ||
7182 (cur_offset >= inode->i_size &&
7183 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7184 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
7185 last_byte - cur_offset,
7186 1 << inode->i_blkbits,
7187 offset + len,
7188 &alloc_hint);
7189 if (ret < 0) {
7190 free_extent_map(em);
7191 break;
7192 }
7193 }
7194 free_extent_map(em);
7195
7196 cur_offset = last_byte;
7197 if (cur_offset >= alloc_end) {
7198 ret = 0;
7199 break;
7200 }
7201 }
7202 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7203 &cached_state, GFP_NOFS);
7204
7205 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7206out:
7207 mutex_unlock(&inode->i_mutex);
7208 return ret;
7209}
7210
7211static int btrfs_set_page_dirty(struct page *page) 7434static int btrfs_set_page_dirty(struct page *page)
7212{ 7435{
7213 return __set_page_dirty_nobuffers(page); 7436 return __set_page_dirty_nobuffers(page);
@@ -7215,6 +7438,10 @@ static int btrfs_set_page_dirty(struct page *page)
7215 7438
7216static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) 7439static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7217{ 7440{
7441 struct btrfs_root *root = BTRFS_I(inode)->root;
7442
7443 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7444 return -EROFS;
7218 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7445 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7219 return -EACCES; 7446 return -EACCES;
7220 return generic_permission(inode, mask, flags, btrfs_check_acl); 7447 return generic_permission(inode, mask, flags, btrfs_check_acl);
@@ -7286,7 +7513,6 @@ static const struct address_space_operations btrfs_aops = {
7286 .writepage = btrfs_writepage, 7513 .writepage = btrfs_writepage,
7287 .writepages = btrfs_writepages, 7514 .writepages = btrfs_writepages,
7288 .readpages = btrfs_readpages, 7515 .readpages = btrfs_readpages,
7289 .sync_page = block_sync_page,
7290 .direct_IO = btrfs_direct_IO, 7516 .direct_IO = btrfs_direct_IO,
7291 .invalidatepage = btrfs_invalidatepage, 7517 .invalidatepage = btrfs_invalidatepage,
7292 .releasepage = btrfs_releasepage, 7518 .releasepage = btrfs_releasepage,
@@ -7302,7 +7528,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
7302}; 7528};
7303 7529
7304static const struct inode_operations btrfs_file_inode_operations = { 7530static const struct inode_operations btrfs_file_inode_operations = {
7305 .truncate = btrfs_truncate,
7306 .getattr = btrfs_getattr, 7531 .getattr = btrfs_getattr,
7307 .setattr = btrfs_setattr, 7532 .setattr = btrfs_setattr,
7308 .setxattr = btrfs_setxattr, 7533 .setxattr = btrfs_setxattr,
@@ -7310,7 +7535,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7310 .listxattr = btrfs_listxattr, 7535 .listxattr = btrfs_listxattr,
7311 .removexattr = btrfs_removexattr, 7536 .removexattr = btrfs_removexattr,
7312 .permission = btrfs_permission, 7537 .permission = btrfs_permission,
7313 .fallocate = btrfs_fallocate,
7314 .fiemap = btrfs_fiemap, 7538 .fiemap = btrfs_fiemap,
7315}; 7539};
7316static const struct inode_operations btrfs_special_inode_operations = { 7540static const struct inode_operations btrfs_special_inode_operations = {
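The effect of fixup_inode_flags() above is visible from user space: a file
rename()d into a directory that carries NOCOW (or COMPRESS) picks the flag up
from its new parent. A minimal check might look like the sketch below; the
paths are placeholders and it assumes FS_NOCOW_FL is exported through
<linux/fs.h> as elsewhere in this series (illustration only, not part of the
patch):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	unsigned int flags = 0;
	int fd;

	/* move the file under a directory that has NOCOW set */
	rename("/mnt/btrfs/file", "/mnt/btrfs/nocow-dir/file");

	fd = open("/mnt/btrfs/nocow-dir/file", O_RDONLY);
	if (fd < 0)
		return 1;
	ioctl(fd, FS_IOC_GETFLAGS, &flags);
	printf("NOCOW is %s\n", (flags & FS_NOCOW_FL) ? "set" : "clear");
	return 0;
}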
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..ffb48d6c5433 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,6 +40,7 @@
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h>
43#include "compat.h" 44#include "compat.h"
44#include "ctree.h" 45#include "ctree.h"
45#include "disk-io.h" 46#include "disk-io.h"
@@ -138,6 +139,24 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
138 return 0; 139 return 0;
139} 140}
140 141
142static int check_flags(unsigned int flags)
143{
144 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
145 FS_NOATIME_FL | FS_NODUMP_FL | \
146 FS_SYNC_FL | FS_DIRSYNC_FL | \
147 FS_NOCOMP_FL | FS_COMPR_FL | \
148 FS_NOCOW_FL | FS_COW_FL))
149 return -EOPNOTSUPP;
150
151 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
152 return -EINVAL;
153
154 if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL))
155 return -EINVAL;
156
157 return 0;
158}
159
141static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 160static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142{ 161{
143 struct inode *inode = file->f_path.dentry->d_inode; 162 struct inode *inode = file->f_path.dentry->d_inode;
@@ -147,15 +166,17 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 unsigned int flags, oldflags; 166 unsigned int flags, oldflags;
148 int ret; 167 int ret;
149 168
169 if (btrfs_root_readonly(root))
170 return -EROFS;
171
150 if (copy_from_user(&flags, arg, sizeof(flags))) 172 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT; 173 return -EFAULT;
152 174
153 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 175 ret = check_flags(flags);
154 FS_NOATIME_FL | FS_NODUMP_FL | \ 176 if (ret)
155 FS_SYNC_FL | FS_DIRSYNC_FL)) 177 return ret;
156 return -EOPNOTSUPP;
157 178
158 if (!is_owner_or_cap(inode)) 179 if (!inode_owner_or_capable(inode))
159 return -EACCES; 180 return -EACCES;
160 181
161 mutex_lock(&inode->i_mutex); 182 mutex_lock(&inode->i_mutex);
@@ -198,9 +219,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
198 else 219 else
199 ip->flags &= ~BTRFS_INODE_DIRSYNC; 220 ip->flags &= ~BTRFS_INODE_DIRSYNC;
200 221
222 /*
223 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
224 * flag may be changed automatically if compression code won't make
225 * things smaller.
226 */
227 if (flags & FS_NOCOMP_FL) {
228 ip->flags &= ~BTRFS_INODE_COMPRESS;
229 ip->flags |= BTRFS_INODE_NOCOMPRESS;
230 } else if (flags & FS_COMPR_FL) {
231 ip->flags |= BTRFS_INODE_COMPRESS;
232 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
233 }
234 if (flags & FS_NOCOW_FL)
235 ip->flags |= BTRFS_INODE_NODATACOW;
236 else if (flags & FS_COW_FL)
237 ip->flags &= ~BTRFS_INODE_NODATACOW;
201 238
202 trans = btrfs_join_transaction(root, 1); 239 trans = btrfs_join_transaction(root, 1);
203 BUG_ON(!trans); 240 BUG_ON(IS_ERR(trans));
204 241
205 ret = btrfs_update_inode(trans, root, inode); 242 ret = btrfs_update_inode(trans, root, inode);
206 BUG_ON(ret); 243 BUG_ON(ret);
@@ -210,9 +247,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
210 btrfs_end_transaction(trans, root); 247 btrfs_end_transaction(trans, root);
211 248
212 mnt_drop_write(file->f_path.mnt); 249 mnt_drop_write(file->f_path.mnt);
250
251 ret = 0;
213 out_unlock: 252 out_unlock:
214 mutex_unlock(&inode->i_mutex); 253 mutex_unlock(&inode->i_mutex);
215 return 0; 254 return ret;
216} 255}
217 256
218static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 257static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
@@ -222,6 +261,49 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
222 return put_user(inode->i_generation, arg); 261 return put_user(inode->i_generation, arg);
223} 262}
224 263
264static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
265{
266 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
267 struct btrfs_fs_info *fs_info = root->fs_info;
268 struct btrfs_device *device;
269 struct request_queue *q;
270 struct fstrim_range range;
271 u64 minlen = ULLONG_MAX;
272 u64 num_devices = 0;
273 int ret;
274
275 if (!capable(CAP_SYS_ADMIN))
276 return -EPERM;
277
278 mutex_lock(&fs_info->fs_devices->device_list_mutex);
279 list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
280 if (!device->bdev)
281 continue;
282 q = bdev_get_queue(device->bdev);
283 if (blk_queue_discard(q)) {
284 num_devices++;
285 minlen = min((u64)q->limits.discard_granularity,
286 minlen);
287 }
288 }
289 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
290 if (!num_devices)
291 return -EOPNOTSUPP;
292
293 if (copy_from_user(&range, arg, sizeof(range)))
294 return -EFAULT;
295
296 range.minlen = max(range.minlen, minlen);
297 ret = btrfs_trim_fs(root, &range);
298 if (ret < 0)
299 return ret;
300
301 if (copy_to_user(arg, &range, sizeof(range)))
302 return -EFAULT;
303
304 return 0;
305}
306
225static noinline int create_subvol(struct btrfs_root *root, 307static noinline int create_subvol(struct btrfs_root *root,
226 struct dentry *dentry, 308 struct dentry *dentry,
227 char *name, int namelen, 309 char *name, int namelen,
@@ -291,6 +373,10 @@ static noinline int create_subvol(struct btrfs_root *root,
291 inode_item->nbytes = cpu_to_le64(root->leafsize); 373 inode_item->nbytes = cpu_to_le64(root->leafsize);
292 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 374 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
293 375
376 root_item.flags = 0;
377 root_item.byte_limit = 0;
378 inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
379
294 btrfs_set_root_bytenr(&root_item, leaf->start); 380 btrfs_set_root_bytenr(&root_item, leaf->start);
295 btrfs_set_root_generation(&root_item, trans->transid); 381 btrfs_set_root_generation(&root_item, trans->transid);
296 btrfs_set_root_level(&root_item, 0); 382 btrfs_set_root_level(&root_item, 0);
@@ -360,7 +446,8 @@ fail:
360} 446}
361 447
362static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 448static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
363 char *name, int namelen, u64 *async_transid) 449 char *name, int namelen, u64 *async_transid,
450 bool readonly)
364{ 451{
365 struct inode *inode; 452 struct inode *inode;
366 struct dentry *parent; 453 struct dentry *parent;
@@ -378,6 +465,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
378 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 465 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 pending_snapshot->dentry = dentry; 466 pending_snapshot->dentry = dentry;
380 pending_snapshot->root = root; 467 pending_snapshot->root = root;
468 pending_snapshot->readonly = readonly;
381 469
382 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 470 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
383 if (IS_ERR(trans)) { 471 if (IS_ERR(trans)) {
@@ -404,7 +492,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
404 if (ret) 492 if (ret)
405 goto fail; 493 goto fail;
406 494
407 btrfs_orphan_cleanup(pending_snapshot->snap); 495 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
496 if (ret)
497 goto fail;
408 498
409 parent = dget_parent(dentry); 499 parent = dget_parent(dentry);
410 inode = btrfs_lookup_dentry(parent->d_inode, dentry); 500 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
@@ -509,7 +599,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
509static noinline int btrfs_mksubvol(struct path *parent, 599static noinline int btrfs_mksubvol(struct path *parent,
510 char *name, int namelen, 600 char *name, int namelen,
511 struct btrfs_root *snap_src, 601 struct btrfs_root *snap_src,
512 u64 *async_transid) 602 u64 *async_transid, bool readonly)
513{ 603{
514 struct inode *dir = parent->dentry->d_inode; 604 struct inode *dir = parent->dentry->d_inode;
515 struct dentry *dentry; 605 struct dentry *dentry;
@@ -541,7 +631,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
541 631
542 if (snap_src) { 632 if (snap_src) {
543 error = create_snapshot(snap_src, dentry, 633 error = create_snapshot(snap_src, dentry,
544 name, namelen, async_transid); 634 name, namelen, async_transid, readonly);
545 } else { 635 } else {
546 error = create_subvol(BTRFS_I(dir)->root, dentry, 636 error = create_subvol(BTRFS_I(dir)->root, dentry,
547 name, namelen, async_transid); 637 name, namelen, async_transid);
@@ -638,9 +728,11 @@ static int btrfs_defrag_file(struct file *file,
638 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 728 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
639 struct btrfs_ordered_extent *ordered; 729 struct btrfs_ordered_extent *ordered;
640 struct page *page; 730 struct page *page;
731 struct btrfs_super_block *disk_super;
641 unsigned long last_index; 732 unsigned long last_index;
642 unsigned long ra_pages = root->fs_info->bdi.ra_pages; 733 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
643 unsigned long total_read = 0; 734 unsigned long total_read = 0;
735 u64 features;
644 u64 page_start; 736 u64 page_start;
645 u64 page_end; 737 u64 page_end;
646 u64 last_len = 0; 738 u64 last_len = 0;
@@ -648,6 +740,14 @@ static int btrfs_defrag_file(struct file *file,
648 u64 defrag_end = 0; 740 u64 defrag_end = 0;
649 unsigned long i; 741 unsigned long i;
650 int ret; 742 int ret;
743 int compress_type = BTRFS_COMPRESS_ZLIB;
744
745 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
746 if (range->compress_type > BTRFS_COMPRESS_TYPES)
747 return -EINVAL;
748 if (range->compress_type)
749 compress_type = range->compress_type;
750 }
651 751
652 if (inode->i_size == 0) 752 if (inode->i_size == 0)
653 return 0; 753 return 0;
@@ -683,7 +783,7 @@ static int btrfs_defrag_file(struct file *file,
683 total_read++; 783 total_read++;
684 mutex_lock(&inode->i_mutex); 784 mutex_lock(&inode->i_mutex);
685 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 785 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
686 BTRFS_I(inode)->force_compress = 1; 786 BTRFS_I(inode)->force_compress = compress_type;
687 787
688 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 788 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
689 if (ret) 789 if (ret)
@@ -781,10 +881,17 @@ loop_unlock:
781 atomic_dec(&root->fs_info->async_submit_draining); 881 atomic_dec(&root->fs_info->async_submit_draining);
782 882
783 mutex_lock(&inode->i_mutex); 883 mutex_lock(&inode->i_mutex);
784 BTRFS_I(inode)->force_compress = 0; 884 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
785 mutex_unlock(&inode->i_mutex); 885 mutex_unlock(&inode->i_mutex);
786 } 886 }
787 887
888 disk_super = &root->fs_info->super_copy;
889 features = btrfs_super_incompat_flags(disk_super);
890 if (range->compress_type == BTRFS_COMPRESS_LZO) {
891 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
892 btrfs_set_super_incompat_flags(disk_super, features);
893 }
894
788 return 0; 895 return 0;
789 896
790err_reservations: 897err_reservations:
@@ -885,6 +992,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
885 992
886 if (new_size > old_size) { 993 if (new_size > old_size) {
887 trans = btrfs_start_transaction(root, 0); 994 trans = btrfs_start_transaction(root, 0);
995 if (IS_ERR(trans)) {
996 ret = PTR_ERR(trans);
997 goto out_unlock;
998 }
888 ret = btrfs_grow_device(trans, device, new_size); 999 ret = btrfs_grow_device(trans, device, new_size);
889 btrfs_commit_transaction(trans, root); 1000 btrfs_commit_transaction(trans, root);
890 } else { 1001 } else {
@@ -901,7 +1012,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
901 char *name, 1012 char *name,
902 unsigned long fd, 1013 unsigned long fd,
903 int subvol, 1014 int subvol,
904 u64 *transid) 1015 u64 *transid,
1016 bool readonly)
905{ 1017{
906 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 1018 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
907 struct file *src_file; 1019 struct file *src_file;
@@ -919,7 +1031,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
919 1031
920 if (subvol) { 1032 if (subvol) {
921 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1033 ret = btrfs_mksubvol(&file->f_path, name, namelen,
922 NULL, transid); 1034 NULL, transid, readonly);
923 } else { 1035 } else {
924 struct inode *src_inode; 1036 struct inode *src_inode;
925 src_file = fget(fd); 1037 src_file = fget(fd);
@@ -938,7 +1050,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
938 } 1050 }
939 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1051 ret = btrfs_mksubvol(&file->f_path, name, namelen,
940 BTRFS_I(src_inode)->root, 1052 BTRFS_I(src_inode)->root,
941 transid); 1053 transid, readonly);
942 fput(src_file); 1054 fput(src_file);
943 } 1055 }
944out: 1056out:
@@ -946,61 +1058,145 @@ out:
946} 1058}
947 1059
948static noinline int btrfs_ioctl_snap_create(struct file *file, 1060static noinline int btrfs_ioctl_snap_create(struct file *file,
949 void __user *arg, int subvol, 1061 void __user *arg, int subvol)
950 int v2)
951{ 1062{
952 struct btrfs_ioctl_vol_args *vol_args = NULL; 1063 struct btrfs_ioctl_vol_args *vol_args;
953 struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
954 char *name;
955 u64 fd;
956 int ret; 1064 int ret;
957 1065
958 if (v2) { 1066 vol_args = memdup_user(arg, sizeof(*vol_args));
959 u64 transid = 0; 1067 if (IS_ERR(vol_args))
960 u64 *ptr = NULL; 1068 return PTR_ERR(vol_args);
1069 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
961 1070
962 vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); 1071 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
963 if (IS_ERR(vol_args_v2)) 1072 vol_args->fd, subvol,
964 return PTR_ERR(vol_args_v2); 1073 NULL, false);
965 1074
966 if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { 1075 kfree(vol_args);
967 ret = -EINVAL; 1076 return ret;
968 goto out; 1077}
969 }
970
971 name = vol_args_v2->name;
972 fd = vol_args_v2->fd;
973 vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
974 1078
975 if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) 1079static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
976 ptr = &transid; 1080 void __user *arg, int subvol)
1081{
1082 struct btrfs_ioctl_vol_args_v2 *vol_args;
1083 int ret;
1084 u64 transid = 0;
1085 u64 *ptr = NULL;
1086 bool readonly = false;
977 1087
978 ret = btrfs_ioctl_snap_create_transid(file, name, fd, 1088 vol_args = memdup_user(arg, sizeof(*vol_args));
979 subvol, ptr); 1089 if (IS_ERR(vol_args))
1090 return PTR_ERR(vol_args);
1091 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
980 1092
981 if (ret == 0 && ptr && 1093 if (vol_args->flags &
982 copy_to_user(arg + 1094 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
983 offsetof(struct btrfs_ioctl_vol_args_v2, 1095 ret = -EOPNOTSUPP;
984 transid), ptr, sizeof(*ptr))) 1096 goto out;
985 ret = -EFAULT;
986 } else {
987 vol_args = memdup_user(arg, sizeof(*vol_args));
988 if (IS_ERR(vol_args))
989 return PTR_ERR(vol_args);
990 name = vol_args->name;
991 fd = vol_args->fd;
992 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
993
994 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
995 subvol, NULL);
996 } 1097 }
1098
1099 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1100 ptr = &transid;
1101 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1102 readonly = true;
1103
1104 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1105 vol_args->fd, subvol,
1106 ptr, readonly);
1107
1108 if (ret == 0 && ptr &&
1109 copy_to_user(arg +
1110 offsetof(struct btrfs_ioctl_vol_args_v2,
1111 transid), ptr, sizeof(*ptr)))
1112 ret = -EFAULT;
997out: 1113out:
998 kfree(vol_args); 1114 kfree(vol_args);
999 kfree(vol_args_v2); 1115 return ret;
1116}
1117
1118static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1119 void __user *arg)
1120{
1121 struct inode *inode = fdentry(file)->d_inode;
1122 struct btrfs_root *root = BTRFS_I(inode)->root;
1123 int ret = 0;
1124 u64 flags = 0;
1125
1126 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1127 return -EINVAL;
1128
1129 down_read(&root->fs_info->subvol_sem);
1130 if (btrfs_root_readonly(root))
1131 flags |= BTRFS_SUBVOL_RDONLY;
1132 up_read(&root->fs_info->subvol_sem);
1133
1134 if (copy_to_user(arg, &flags, sizeof(flags)))
1135 ret = -EFAULT;
1000 1136
1001 return ret; 1137 return ret;
1002} 1138}
1003 1139
1140static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1141 void __user *arg)
1142{
1143 struct inode *inode = fdentry(file)->d_inode;
1144 struct btrfs_root *root = BTRFS_I(inode)->root;
1145 struct btrfs_trans_handle *trans;
1146 u64 root_flags;
1147 u64 flags;
1148 int ret = 0;
1149
1150 if (root->fs_info->sb->s_flags & MS_RDONLY)
1151 return -EROFS;
1152
1153 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1154 return -EINVAL;
1155
1156 if (copy_from_user(&flags, arg, sizeof(flags)))
1157 return -EFAULT;
1158
1159 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1160 return -EINVAL;
1161
1162 if (flags & ~BTRFS_SUBVOL_RDONLY)
1163 return -EOPNOTSUPP;
1164
1165 if (!inode_owner_or_capable(inode))
1166 return -EACCES;
1167
1168 down_write(&root->fs_info->subvol_sem);
1169
1170 /* nothing to do */
1171 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1172 goto out;
1173
1174 root_flags = btrfs_root_flags(&root->root_item);
1175 if (flags & BTRFS_SUBVOL_RDONLY)
1176 btrfs_set_root_flags(&root->root_item,
1177 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1178 else
1179 btrfs_set_root_flags(&root->root_item,
1180 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1181
1182 trans = btrfs_start_transaction(root, 1);
1183 if (IS_ERR(trans)) {
1184 ret = PTR_ERR(trans);
1185 goto out_reset;
1186 }
1187
1188 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1189 &root->root_key, &root->root_item);
1190
1191 btrfs_commit_transaction(trans, root);
1192out_reset:
1193 if (ret)
1194 btrfs_set_root_flags(&root->root_item, root_flags);
1195out:
1196 up_write(&root->fs_info->subvol_sem);
1197 return ret;
1198}
1199
1004/* 1200/*
1005 * helper to check if the subvolume references other subvolumes 1201 * helper to check if the subvolume references other subvolumes
1006 */ 1202 */
@@ -1509,6 +1705,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1509 struct btrfs_ioctl_defrag_range_args *range; 1705 struct btrfs_ioctl_defrag_range_args *range;
1510 int ret; 1706 int ret;
1511 1707
1708 if (btrfs_root_readonly(root))
1709 return -EROFS;
1710
1512 ret = mnt_want_write(file->f_path.mnt); 1711 ret = mnt_want_write(file->f_path.mnt);
1513 if (ret) 1712 if (ret)
1514 return ret; 1713 return ret;
@@ -1637,6 +1836,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1637 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 1836 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1638 return -EINVAL; 1837 return -EINVAL;
1639 1838
1839 if (btrfs_root_readonly(root))
1840 return -EROFS;
1841
1640 ret = mnt_want_write(file->f_path.mnt); 1842 ret = mnt_want_write(file->f_path.mnt);
1641 if (ret) 1843 if (ret)
1642 return ret; 1844 return ret;
@@ -1788,7 +1990,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1788 1990
1789 memcpy(&new_key, &key, sizeof(new_key)); 1991 memcpy(&new_key, &key, sizeof(new_key));
1790 new_key.objectid = inode->i_ino; 1992 new_key.objectid = inode->i_ino;
1791 new_key.offset = key.offset + destoff - off; 1993 if (off <= key.offset)
1994 new_key.offset = key.offset + destoff - off;
1995 else
1996 new_key.offset = destoff;
1792 1997
1793 trans = btrfs_start_transaction(root, 1); 1998 trans = btrfs_start_transaction(root, 1);
1794 if (IS_ERR(trans)) { 1999 if (IS_ERR(trans)) {
@@ -1958,6 +2163,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1958 if (file->private_data) 2163 if (file->private_data)
1959 goto out; 2164 goto out;
1960 2165
2166 ret = -EROFS;
2167 if (btrfs_root_readonly(root))
2168 goto out;
2169
1961 ret = mnt_want_write(file->f_path.mnt); 2170 ret = mnt_want_write(file->f_path.mnt);
1962 if (ret) 2171 if (ret)
1963 goto out; 2172 goto out;
@@ -1968,7 +2177,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
1968 2177
1969 ret = -ENOMEM; 2178 ret = -ENOMEM;
1970 trans = btrfs_start_ioctl_transaction(root, 0); 2179 trans = btrfs_start_ioctl_transaction(root, 0);
1971 if (!trans) 2180 if (IS_ERR(trans))
1972 goto out_drop; 2181 goto out_drop;
1973 2182
1974 file->private_data = trans; 2183 file->private_data = trans;
@@ -2024,9 +2233,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2024 path->leave_spinning = 1; 2233 path->leave_spinning = 1;
2025 2234
2026 trans = btrfs_start_transaction(root, 1); 2235 trans = btrfs_start_transaction(root, 1);
2027 if (!trans) { 2236 if (IS_ERR(trans)) {
2028 btrfs_free_path(path); 2237 btrfs_free_path(path);
2029 return -ENOMEM; 2238 return PTR_ERR(trans);
2030 } 2239 }
2031 2240
2032 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2241 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2078,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2078 struct btrfs_ioctl_space_info space; 2287 struct btrfs_ioctl_space_info space;
2079 struct btrfs_ioctl_space_info *dest; 2288 struct btrfs_ioctl_space_info *dest;
2080 struct btrfs_ioctl_space_info *dest_orig; 2289 struct btrfs_ioctl_space_info *dest_orig;
2081 struct btrfs_ioctl_space_info *user_dest; 2290 struct btrfs_ioctl_space_info __user *user_dest;
2082 struct btrfs_space_info *info; 2291 struct btrfs_space_info *info;
2083 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 2292 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2084 BTRFS_BLOCK_GROUP_SYSTEM, 2293 BTRFS_BLOCK_GROUP_SYSTEM,
@@ -2087,7 +2296,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2087 int num_types = 4; 2296 int num_types = 4;
2088 int alloc_size; 2297 int alloc_size;
2089 int ret = 0; 2298 int ret = 0;
2090 int slot_count = 0; 2299 u64 slot_count = 0;
2091 int i, c; 2300 int i, c;
2092 2301
2093 if (copy_from_user(&space_args, 2302 if (copy_from_user(&space_args,
@@ -2126,7 +2335,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2126 goto out; 2335 goto out;
2127 } 2336 }
2128 2337
2129 slot_count = min_t(int, space_args.space_slots, slot_count); 2338 slot_count = min_t(u64, space_args.space_slots, slot_count);
2130 2339
2131 alloc_size = sizeof(*dest) * slot_count; 2340 alloc_size = sizeof(*dest) * slot_count;
2132 2341
@@ -2146,6 +2355,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2146 for (i = 0; i < num_types; i++) { 2355 for (i = 0; i < num_types; i++) {
2147 struct btrfs_space_info *tmp; 2356 struct btrfs_space_info *tmp;
2148 2357
2358 if (!slot_count)
2359 break;
2360
2149 info = NULL; 2361 info = NULL;
2150 rcu_read_lock(); 2362 rcu_read_lock();
2151 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 2363 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2167,7 +2379,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2167 memcpy(dest, &space, sizeof(space)); 2379 memcpy(dest, &space, sizeof(space));
2168 dest++; 2380 dest++;
2169 space_args.total_spaces++; 2381 space_args.total_spaces++;
2382 slot_count--;
2170 } 2383 }
2384 if (!slot_count)
2385 break;
2171 } 2386 }
2172 up_read(&info->groups_sem); 2387 up_read(&info->groups_sem);
2173 } 2388 }
@@ -2218,10 +2433,17 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
2218 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 2433 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2219 struct btrfs_trans_handle *trans; 2434 struct btrfs_trans_handle *trans;
2220 u64 transid; 2435 u64 transid;
2436 int ret;
2221 2437
2222 trans = btrfs_start_transaction(root, 0); 2438 trans = btrfs_start_transaction(root, 0);
2439 if (IS_ERR(trans))
2440 return PTR_ERR(trans);
2223 transid = trans->transid; 2441 transid = trans->transid;
2224 btrfs_commit_transaction_async(trans, root, 0); 2442 ret = btrfs_commit_transaction_async(trans, root, 0);
2443 if (ret) {
2444 btrfs_end_transaction(trans, root);
2445 return ret;
2446 }
2225 2447
2226 if (argp) 2448 if (argp)
2227 if (copy_to_user(argp, &transid, sizeof(transid))) 2449 if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -2256,14 +2478,20 @@ long btrfs_ioctl(struct file *file, unsigned int
2256 return btrfs_ioctl_setflags(file, argp); 2478 return btrfs_ioctl_setflags(file, argp);
2257 case FS_IOC_GETVERSION: 2479 case FS_IOC_GETVERSION:
2258 return btrfs_ioctl_getversion(file, argp); 2480 return btrfs_ioctl_getversion(file, argp);
2481 case FITRIM:
2482 return btrfs_ioctl_fitrim(file, argp);
2259 case BTRFS_IOC_SNAP_CREATE: 2483 case BTRFS_IOC_SNAP_CREATE:
2260 return btrfs_ioctl_snap_create(file, argp, 0, 0); 2484 return btrfs_ioctl_snap_create(file, argp, 0);
2261 case BTRFS_IOC_SNAP_CREATE_V2: 2485 case BTRFS_IOC_SNAP_CREATE_V2:
2262 return btrfs_ioctl_snap_create(file, argp, 0, 1); 2486 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2263 case BTRFS_IOC_SUBVOL_CREATE: 2487 case BTRFS_IOC_SUBVOL_CREATE:
2264 return btrfs_ioctl_snap_create(file, argp, 1, 0); 2488 return btrfs_ioctl_snap_create(file, argp, 1);
2265 case BTRFS_IOC_SNAP_DESTROY: 2489 case BTRFS_IOC_SNAP_DESTROY:
2266 return btrfs_ioctl_snap_destroy(file, argp); 2490 return btrfs_ioctl_snap_destroy(file, argp);
2491 case BTRFS_IOC_SUBVOL_GETFLAGS:
2492 return btrfs_ioctl_subvol_getflags(file, argp);
2493 case BTRFS_IOC_SUBVOL_SETFLAGS:
2494 return btrfs_ioctl_subvol_setflags(file, argp);
2267 case BTRFS_IOC_DEFAULT_SUBVOL: 2495 case BTRFS_IOC_DEFAULT_SUBVOL:
2268 return btrfs_ioctl_default_subvol(file, argp); 2496 return btrfs_ioctl_default_subvol(file, argp);
2269 case BTRFS_IOC_DEFRAG: 2497 case BTRFS_IOC_DEFRAG:
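The new btrfs_ioctl_fitrim() handler above is driven through the generic
FITRIM interface: user space passes an fstrim_range, the kernel raises minlen
to at least the smallest discard granularity among the discard-capable
devices, and writes the trimmed byte count back into the range. A minimal
caller could look like this (sketch only; the mount point is a placeholder):

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* trim the whole filesystem */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n",
		       (unsigned long long)range.len);
	return 0;
}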
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
34 35
35#define BTRFS_SUBVOL_NAME_MAX 4039 36#define BTRFS_SUBVOL_NAME_MAX 4039
36struct btrfs_ioctl_vol_args_v2 { 37struct btrfs_ioctl_vol_args_v2 {
@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
133 */ 134 */
134 __u32 extent_thresh; 135 __u32 extent_thresh;
135 136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
136 /* spare for later */ 144 /* spare for later */
137 __u32 unused[5]; 145 __u32 unused[4];
138}; 146};
139 147
140struct btrfs_ioctl_space_info { 148struct btrfs_ioctl_space_info {
@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
193#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
194#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
195 struct btrfs_ioctl_vol_args_v2) 203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
196#endif 206#endif
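With BTRFS_SUBVOL_RDONLY and the two new SUBVOL_*FLAGS ioctls defined above, a
read-only snapshot can be created in one step and flipped later. A hedged
sketch of the create side, with hypothetical paths and no error handling
beyond the ioctl itself:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* btrfs_ioctl_vol_args_v2 and friends */

int make_ro_snapshot(void)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int src, dst, ret;

	src = open("/mnt/btrfs/subvol", O_RDONLY);	/* snapshot source */
	dst = open("/mnt/btrfs", O_RDONLY);		/* destination dir */
	if (src < 0 || dst < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.fd = src;
	args.flags = BTRFS_SUBVOL_RDONLY;	/* read-only from birth */
	strncpy(args.name, "snap-ro", BTRFS_SUBVOL_NAME_MAX);

	ret = ioctl(dst, BTRFS_IOC_SNAP_CREATE_V2, &args);
	close(src);
	close(dst);
	return ret;
}

Making the snapshot writable again is then a matter of passing a __u64 of 0
(or BTRFS_SUBVOL_RDONLY to go the other way) to BTRFS_IOC_SUBVOL_SETFLAGS on a
file descriptor opened on the subvolume root.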
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..a178f5ebea78
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,427 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
 34 void *buf; /* where decompressed data goes */
 35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
 144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
 174 * Note: if fewer than 4 bytes are left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283 bool may_late_unmap, need_unmap;
284
285 data_in = kmap(pages_in[0]);
286 tot_len = read_compress_length(data_in);
287
288 tot_in = LZO_LEN;
289 in_offset = LZO_LEN;
290 tot_len = min_t(size_t, srclen, tot_len);
291 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
292
293 tot_out = 0;
294 pg_offset = 0;
295
296 while (tot_in < tot_len) {
297 in_len = read_compress_length(data_in + in_offset);
298 in_page_bytes_left -= LZO_LEN;
299 in_offset += LZO_LEN;
300 tot_in += LZO_LEN;
301
302 tot_in += in_len;
303 working_bytes = in_len;
304 may_late_unmap = need_unmap = false;
305
306 /* fast path: avoid using the working buffer */
307 if (in_page_bytes_left >= in_len) {
308 buf = data_in + in_offset;
309 bytes = in_len;
310 may_late_unmap = true;
311 goto cont;
312 }
313
314 /* copy bytes from the pages into the working buffer */
315 buf = workspace->cbuf;
316 buf_offset = 0;
317 while (working_bytes) {
318 bytes = min(working_bytes, in_page_bytes_left);
319
320 memcpy(buf + buf_offset, data_in + in_offset, bytes);
321 buf_offset += bytes;
322cont:
323 working_bytes -= bytes;
324 in_page_bytes_left -= bytes;
325 in_offset += bytes;
326
327 /* check if we need to pick another page */
328 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
329 || in_page_bytes_left == 0) {
330 tot_in += in_page_bytes_left;
331
332 if (working_bytes == 0 && tot_in >= tot_len)
333 break;
334
335 if (page_in_index + 1 >= total_pages_in) {
336 ret = -1;
337 goto done;
338 }
339
340 if (may_late_unmap)
341 need_unmap = true;
342 else
343 kunmap(pages_in[page_in_index]);
344
345 data_in = kmap(pages_in[++page_in_index]);
346
347 in_page_bytes_left = PAGE_CACHE_SIZE;
348 in_offset = 0;
349 }
350 }
351
352 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
353 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
354 &out_len);
355 if (need_unmap)
356 kunmap(pages_in[page_in_index - 1]);
357 if (ret != LZO_E_OK) {
358 printk(KERN_WARNING "btrfs decompress failed\n");
359 ret = -1;
360 break;
361 }
362
363 buf_start = tot_out;
364 tot_out += out_len;
365
366 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
367 tot_out, disk_start,
368 bvec, vcnt,
369 &page_out_index, &pg_offset);
370 if (ret2 == 0)
371 break;
372 }
373done:
374 kunmap(pages_in[page_in_index]);
375 return ret;
376}
377
378static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
379 struct page *dest_page,
380 unsigned long start_byte,
381 size_t srclen, size_t destlen)
382{
383 struct workspace *workspace = list_entry(ws, struct workspace, list);
384 size_t in_len;
385 size_t out_len;
386 size_t tot_len;
387 int ret = 0;
388 char *kaddr;
389 unsigned long bytes;
390
391 BUG_ON(srclen < LZO_LEN);
392
393 tot_len = read_compress_length(data_in);
394 data_in += LZO_LEN;
395
396 in_len = read_compress_length(data_in);
397 data_in += LZO_LEN;
398
399 out_len = PAGE_CACHE_SIZE;
400 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
401 if (ret != LZO_E_OK) {
402 printk(KERN_WARNING "btrfs decompress failed!\n");
403 ret = -1;
404 goto out;
405 }
406
407 if (out_len < start_byte) {
408 ret = -1;
409 goto out;
410 }
411
412 bytes = min_t(unsigned long, destlen, out_len - start_byte);
413
414 kaddr = kmap_atomic(dest_page, KM_USER0);
415 memcpy(kaddr, workspace->buf + start_byte, bytes);
416 kunmap_atomic(kaddr, KM_USER0);
417out:
418 return ret;
419}
420
421struct btrfs_compress_op btrfs_lzo_compress = {
422 .alloc_workspace = lzo_alloc_workspace,
423 .free_workspace = lzo_free_workspace,
424 .compress_pages = lzo_compress_pages,
425 .decompress_biovec = lzo_decompress_biovec,
426 .decompress = lzo_decompress,
427};
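For reference, the format lzo_compress_pages() writes is: a 4-byte
little-endian total length for the whole compressed extent, followed by
segments of a 4-byte little-endian compressed length plus that many payload
bytes. A segment header is never split across a page boundary; when fewer
than LZO_LEN bytes remain in a page they are zero-padded and counted in the
total. The sketch below walks the headers of an extent that has been copied
into one contiguous buffer; it deliberately ignores that page padding, so it
is an illustration of the framing, not a reusable parser:

#include <stdint.h>
#include <stdio.h>

#define LZO_LEN 4

static uint32_t read_le32(const unsigned char *p)
{
	return p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* print the segment layout of one compressed extent */
static void walk_lzo_extent(const unsigned char *buf)
{
	uint32_t tot_len = read_le32(buf);	/* includes all headers */
	uint32_t off = LZO_LEN;

	while (off < tot_len) {
		uint32_t seg = read_le32(buf + off);

		printf("segment at %u: %u compressed bytes\n", off, seg);
		off += LZO_LEN + seg;
	}
}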
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ae7737e352c9..a1c940425307 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio) 175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -201,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
201 INIT_LIST_HEAD(&entry->list); 202 INIT_LIST_HEAD(&entry->list);
202 INIT_LIST_HEAD(&entry->root_extent_list); 203 INIT_LIST_HEAD(&entry->root_extent_list);
203 204
205 trace_btrfs_ordered_extent_add(inode, entry);
206
204 spin_lock(&tree->lock); 207 spin_lock(&tree->lock);
205 node = tree_insert(&tree->tree, file_offset, 208 node = tree_insert(&tree->tree, file_offset,
206 &entry->rb_node); 209 &entry->rb_node);
@@ -220,14 +223,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 223 u64 start, u64 len, u64 disk_len, int type)
221{ 224{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 225 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0); 226 disk_len, type, 0,
227 BTRFS_COMPRESS_NONE);
224} 228}
225 229
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 230int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 231 u64 start, u64 len, u64 disk_len, int type)
228{ 232{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 233 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1); 234 disk_len, type, 1,
235 BTRFS_COMPRESS_NONE);
236}
237
238int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
239 u64 start, u64 len, u64 disk_len,
240 int type, int compress_type)
241{
242 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
243 disk_len, type, 0,
244 compress_type);
231} 245}
232 246
233/* 247/*
@@ -375,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
375 struct list_head *cur; 389 struct list_head *cur;
376 struct btrfs_ordered_sum *sum; 390 struct btrfs_ordered_sum *sum;
377 391
392 trace_btrfs_ordered_extent_put(entry->inode, entry);
393
378 if (atomic_dec_and_test(&entry->refs)) { 394 if (atomic_dec_and_test(&entry->refs)) {
379 while (!list_empty(&entry->list)) { 395 while (!list_empty(&entry->list)) {
380 cur = entry->list.next; 396 cur = entry->list.next;
@@ -408,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
408 spin_lock(&root->fs_info->ordered_extent_lock); 424 spin_lock(&root->fs_info->ordered_extent_lock);
409 list_del_init(&entry->root_extent_list); 425 list_del_init(&entry->root_extent_list);
410 426
427 trace_btrfs_ordered_extent_remove(inode, entry);
428
411 /* 429 /*
412 * we have no more ordered extents for this inode and 430 * we have no more ordered extents for this inode and
413 * no dirty pages. We can safely remove it from the 431 * no dirty pages. We can safely remove it from the
@@ -573,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
573 u64 start = entry->file_offset; 591 u64 start = entry->file_offset;
574 u64 end = start + entry->len - 1; 592 u64 end = start + entry->len - 1;
575 593
594 trace_btrfs_ordered_extent_start(inode, entry);
595
576 /* 596 /*
577 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
578 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 61dca83119dd..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
148 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
149int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
150 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
151int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
152 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
153 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
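The new btrfs_add_ordered_extent_compress() variant exists so the delalloc
path can record which algorithm produced an extent; roughly, the compressed
write-out in inode.c is expected to call it along these lines (a sketch of
the call shape, not the exact patch hunk):

	ret = btrfs_add_ordered_extent_compress(inode,
				async_extent->start,	/* file offset */
				ins.objectid,		/* disk start */
				async_extent->ram_size,	/* uncompressed len */
				ins.offset,		/* compressed len */
				BTRFS_ORDERED_COMPRESSED,
				async_extent->compress_type);

The stored compress_type is later used when the ordered extent completes and
the on-disk file extent item is inserted, so the extent records which
algorithm to apply at read time.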
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7e..199a80134312 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1157 new_node->bytenr = dest->node->start; 1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level; 1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest; 1159 new_node->lowest = node->lowest;
1160 new_node->checked = 1;
1160 new_node->root = dest; 1161 new_node->root = dest;
1161 1162
1162 if (!node->lowest) { 1163 if (!node->lowest) {
@@ -1723,6 +1724,7 @@ again:
1723 1724
1724 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1725 old_ptr_gen); 1726 old_ptr_gen);
1727 BUG_ON(!eb);
1726 btrfs_tree_lock(eb); 1728 btrfs_tree_lock(eb);
1727 if (cow) { 1729 if (cow) {
1728 ret = btrfs_cow_block(trans, dest, eb, parent, 1730 ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -2028,6 +2030,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2028 2030
2029 while (1) { 2031 while (1) {
2030 trans = btrfs_start_transaction(root, 0); 2032 trans = btrfs_start_transaction(root, 0);
2033 BUG_ON(IS_ERR(trans));
2031 trans->block_rsv = rc->block_rsv; 2034 trans->block_rsv = rc->block_rsv;
2032 2035
2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2036 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2147,6 +2150,12 @@ again:
2147 } 2150 }
2148 2151
2149 trans = btrfs_join_transaction(rc->extent_root, 1); 2152 trans = btrfs_join_transaction(rc->extent_root, 1);
2153 if (IS_ERR(trans)) {
2154 if (!err)
2155 btrfs_block_rsv_release(rc->extent_root,
2156 rc->block_rsv, num_bytes);
2157 return PTR_ERR(trans);
2158 }
2150 2159
2151 if (!err) { 2160 if (!err) {
2152 if (num_bytes != rc->merging_rsv_size) { 2161 if (num_bytes != rc->merging_rsv_size) {
@@ -2337,7 +2346,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2337 root = next->root; 2346 root = next->root;
2338 BUG_ON(!root); 2347 BUG_ON(!root);
2339 2348
 2340 /* no other choice for non-refernce counted tree */ 2349 /* no other choice for non-reference counted tree */
2341 if (!root->ref_cows) 2350 if (!root->ref_cows)
2342 return root; 2351 return root;
2343 2352
@@ -2505,6 +2514,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2505 blocksize = btrfs_level_size(root, node->level); 2514 blocksize = btrfs_level_size(root, node->level);
2506 generation = btrfs_node_ptr_generation(upper->eb, slot); 2515 generation = btrfs_node_ptr_generation(upper->eb, slot);
2507 eb = read_tree_block(root, bytenr, blocksize, generation); 2516 eb = read_tree_block(root, bytenr, blocksize, generation);
2517 if (!eb) {
2518 err = -EIO;
2519 goto next;
2520 }
2508 btrfs_tree_lock(eb); 2521 btrfs_tree_lock(eb);
2509 btrfs_set_lock_blocking(eb); 2522 btrfs_set_lock_blocking(eb);
2510 2523
@@ -2662,6 +2675,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2662 BUG_ON(block->key_ready); 2675 BUG_ON(block->key_ready);
2663 eb = read_tree_block(rc->extent_root, block->bytenr, 2676 eb = read_tree_block(rc->extent_root, block->bytenr,
2664 block->key.objectid, block->key.offset); 2677 block->key.objectid, block->key.offset);
2678 BUG_ON(!eb);
2665 WARN_ON(btrfs_header_level(eb) != block->level); 2679 WARN_ON(btrfs_header_level(eb) != block->level);
2666 if (block->level == 0) 2680 if (block->level == 0)
2667 btrfs_item_key_to_cpu(eb, &block->key, 0); 2681 btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -3222,6 +3236,7 @@ truncate:
3222 trans = btrfs_join_transaction(root, 0); 3236 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) { 3237 if (IS_ERR(trans)) {
3224 btrfs_free_path(path); 3238 btrfs_free_path(path);
3239 ret = PTR_ERR(trans);
3225 goto out; 3240 goto out;
3226 } 3241 }
3227 3242
@@ -3628,6 +3643,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3628 set_reloc_control(rc); 3643 set_reloc_control(rc);
3629 3644
3630 trans = btrfs_join_transaction(rc->extent_root, 1); 3645 trans = btrfs_join_transaction(rc->extent_root, 1);
3646 BUG_ON(IS_ERR(trans));
3631 btrfs_commit_transaction(trans, rc->extent_root); 3647 btrfs_commit_transaction(trans, rc->extent_root);
3632 return 0; 3648 return 0;
3633} 3649}
@@ -3644,6 +3660,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3644 u32 item_size; 3660 u32 item_size;
3645 int ret; 3661 int ret;
3646 int err = 0; 3662 int err = 0;
3663 int progress = 0;
3647 3664
3648 path = btrfs_alloc_path(); 3665 path = btrfs_alloc_path();
3649 if (!path) 3666 if (!path)
@@ -3656,8 +3673,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3656 } 3673 }
3657 3674
3658 while (1) { 3675 while (1) {
3676 progress++;
3659 trans = btrfs_start_transaction(rc->extent_root, 0); 3677 trans = btrfs_start_transaction(rc->extent_root, 0);
3660 3678 BUG_ON(IS_ERR(trans));
3679restart:
3661 if (update_backref_cache(trans, &rc->backref_cache)) { 3680 if (update_backref_cache(trans, &rc->backref_cache)) {
3662 btrfs_end_transaction(trans, rc->extent_root); 3681 btrfs_end_transaction(trans, rc->extent_root);
3663 continue; 3682 continue;
@@ -3770,6 +3789,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3770 } 3789 }
3771 } 3790 }
3772 } 3791 }
3792 if (trans && progress && err == -ENOSPC) {
3793 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3794 rc->block_group->flags);
3795 if (ret == 0) {
3796 err = 0;
3797 progress = 0;
3798 goto restart;
3799 }
3800 }
3773 3801
3774 btrfs_release_path(rc->extent_root, path); 3802 btrfs_release_path(rc->extent_root, path);
3775 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3803 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
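
The progress counter added above gives the relocation loop one escape hatch: if it made any forward progress before hitting -ENOSPC, it force-allocates one more chunk with the same flags as the block group being relocated (a new DATA chunk when relocating a data group, for instance), clears the error, resets progress and jumps back to the restart label under the still-open transaction. A second -ENOSPC with no progress in between is then treated as final.
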
@@ -3804,7 +3832,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3804 3832
3805 /* get rid of pinned extents */ 3833 /* get rid of pinned extents */
3806 trans = btrfs_join_transaction(rc->extent_root, 1); 3834 trans = btrfs_join_transaction(rc->extent_root, 1);
3807 btrfs_commit_transaction(trans, rc->extent_root); 3835 if (IS_ERR(trans))
3836 err = PTR_ERR(trans);
3837 else
3838 btrfs_commit_transaction(trans, rc->extent_root);
3808out_free: 3839out_free:
3809 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3840 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3810 btrfs_free_path(path); 3841 btrfs_free_path(path);
@@ -4022,6 +4053,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4022 int ret; 4053 int ret;
4023 4054
4024 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4055 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4056 BUG_ON(IS_ERR(trans));
4025 4057
4026 memset(&root->root_item.drop_progress, 0, 4058 memset(&root->root_item.drop_progress, 0,
4027 sizeof(root->root_item.drop_progress)); 4059 sizeof(root->root_item.drop_progress));
@@ -4125,6 +4157,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4125 set_reloc_control(rc); 4157 set_reloc_control(rc);
4126 4158
4127 trans = btrfs_join_transaction(rc->extent_root, 1); 4159 trans = btrfs_join_transaction(rc->extent_root, 1);
4160 if (IS_ERR(trans)) {
4161 unset_reloc_control(rc);
4162 err = PTR_ERR(trans);
4163 goto out_free;
4164 }
4128 4165
4129 rc->merge_reloc_tree = 1; 4166 rc->merge_reloc_tree = 1;
4130 4167
@@ -4154,9 +4191,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4154 unset_reloc_control(rc); 4191 unset_reloc_control(rc);
4155 4192
4156 trans = btrfs_join_transaction(rc->extent_root, 1); 4193 trans = btrfs_join_transaction(rc->extent_root, 1);
4157 btrfs_commit_transaction(trans, rc->extent_root); 4194 if (IS_ERR(trans))
4158out: 4195 err = PTR_ERR(trans);
4196 else
4197 btrfs_commit_transaction(trans, rc->extent_root);
4198out_free:
4159 kfree(rc); 4199 kfree(rc);
4200out:
4160 while (!list_empty(&reloc_roots)) { 4201 while (!list_empty(&reloc_roots)) {
4161 reloc_root = list_entry(reloc_roots.next, 4202 reloc_root = list_entry(reloc_roots.next,
4162 struct btrfs_root, root_list); 4203 struct btrfs_root, root_list);
@@ -4174,7 +4215,7 @@ out:
4174 if (IS_ERR(fs_root)) 4215 if (IS_ERR(fs_root))
4175 err = PTR_ERR(fs_root); 4216 err = PTR_ERR(fs_root);
4176 else 4217 else
4177 btrfs_orphan_cleanup(fs_root); 4218 err = btrfs_orphan_cleanup(fs_root);
4178 } 4219 }
4179 return err; 4220 return err;
4180} 4221}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6a1086e83ffc..6928bff62daa 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -88,7 +88,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
88 search_key.offset = (u64)-1; 88 search_key.offset = (u64)-1;
89 89
90 path = btrfs_alloc_path(); 90 path = btrfs_alloc_path();
91 BUG_ON(!path); 91 if (!path)
92 return -ENOMEM;
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 93 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0) 94 if (ret < 0)
94 goto out; 95 goto out;
@@ -332,7 +333,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
332 struct extent_buffer *leaf; 333 struct extent_buffer *leaf;
333 334
334 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
335 BUG_ON(!path); 336 if (!path)
337 return -ENOMEM;
336 ret = btrfs_search_slot(trans, root, key, path, -1, 1); 338 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
337 if (ret < 0) 339 if (ret < 0)
338 goto out; 340 goto out;
@@ -471,3 +473,21 @@ again:
471 btrfs_free_path(path); 473 btrfs_free_path(path);
472 return 0; 474 return 0;
473} 475}
476
477/*
478 * Old btrfs versions forget to init root_item->flags and root_item->byte_limit
479 * for subvolumes. To work around this problem, we steal a bit from
480 * root_item->inode_item->flags, and use it to indicate if those fields
481 * have been properly initialized.
482 */
483void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
484{
485 u64 inode_flags = le64_to_cpu(root_item->inode.flags);
486
487 if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
488 inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
489 root_item->inode.flags = cpu_to_le64(inode_flags);
490 root_item->flags = 0;
491 root_item->byte_limit = 0;
492 }
493}
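
A minimal sketch of the intended calling pattern, matching how the snapshot path in transaction.c uses this helper later in this patch (the surrounding names are from that caller):

	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);
	/* root_item->flags and root_item->byte_limit are now trustworthy */
	root_flags = btrfs_root_flags(new_root_item);
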
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 22acdaa78ce1..0ac712efcdf2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,8 +52,95 @@
52#include "export.h" 52#include "export.h"
53#include "compression.h" 53#include "compression.h"
54 54
55#define CREATE_TRACE_POINTS
56#include <trace/events/btrfs.h>
57
55static const struct super_operations btrfs_super_ops; 58static const struct super_operations btrfs_super_ops;
56 59
60static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
61 char nbuf[16])
62{
63 char *errstr = NULL;
64
65 switch (errno) {
66 case -EIO:
67 errstr = "IO failure";
68 break;
69 case -ENOMEM:
70 errstr = "Out of memory";
71 break;
72 case -EROFS:
73 errstr = "Readonly filesystem";
74 break;
75 default:
76 if (nbuf) {
77 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
78 errstr = nbuf;
79 }
80 break;
81 }
82
83 return errstr;
84}
85
86static void __save_error_info(struct btrfs_fs_info *fs_info)
87{
88 /*
89 * today we only save the error info in RAM; long term we'll
90 * also send it down to the disk
91 */
92 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
93}
94
95/* NOTE:
96 * We defer the write_super work to unmount time in order to avoid a
97 * deadlock, since umount holds all the locks.
98 */
99static void save_error_info(struct btrfs_fs_info *fs_info)
100{
101 __save_error_info(fs_info);
102}
103
104/* btrfs handles errors by forcing the filesystem readonly */
105static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
106{
107 struct super_block *sb = fs_info->sb;
108
109 if (sb->s_flags & MS_RDONLY)
110 return;
111
112 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
113 sb->s_flags |= MS_RDONLY;
114 printk(KERN_INFO "btrfs is forced readonly\n");
115 }
116}
117
118/*
119 * __btrfs_std_error decodes expected errors from the caller and
120 * invokes the appropriate error response.
121 */
122void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
123 unsigned int line, int errno)
124{
125 struct super_block *sb = fs_info->sb;
126 char nbuf[16];
127 const char *errstr;
128
129 /*
130 * Special case: if the error is EROFS, and we're already
131 * under MS_RDONLY, then it is safe here.
132 */
133 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
134 return;
135
136 errstr = btrfs_decode_error(fs_info, errno, nbuf);
137 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
138 sb->s_id, function, line, errstr);
139 save_error_info(fs_info);
140
141 btrfs_handle_error(fs_info);
142}
143
57static void btrfs_put_super(struct super_block *sb) 144static void btrfs_put_super(struct super_block *sb)
58{ 145{
59 struct btrfs_root *root = btrfs_sb(sb); 146 struct btrfs_root *root = btrfs_sb(sb);
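
Call sites are not expected to invoke __btrfs_std_error() above directly. The ctree.h half of this series wraps it in a macro along the following lines (a sketch, assuming the usual __func__/__LINE__ capture), so a caller only passes the fs_info and the errno it wants decoded:

	#define btrfs_std_error(fs_info, errno)				\
	do {								\
		if ((errno))						\
			__btrfs_std_error((fs_info), __func__,		\
					  __LINE__, (errno));		\
	} while (0)
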
@@ -69,9 +156,10 @@ enum {
69 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 156 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 157 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 158 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 159 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, 160 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
74 Opt_user_subvol_rm_allowed, 161 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
162 Opt_enospc_debug, Opt_subvolrootid, Opt_err,
75}; 163};
76 164
77static match_table_t tokens = { 165static match_table_t tokens = {
@@ -86,7 +174,9 @@ static match_table_t tokens = {
86 {Opt_alloc_start, "alloc_start=%s"}, 174 {Opt_alloc_start, "alloc_start=%s"},
87 {Opt_thread_pool, "thread_pool=%d"}, 175 {Opt_thread_pool, "thread_pool=%d"},
88 {Opt_compress, "compress"}, 176 {Opt_compress, "compress"},
177 {Opt_compress_type, "compress=%s"},
89 {Opt_compress_force, "compress-force"}, 178 {Opt_compress_force, "compress-force"},
179 {Opt_compress_force_type, "compress-force=%s"},
90 {Opt_ssd, "ssd"}, 180 {Opt_ssd, "ssd"},
91 {Opt_ssd_spread, "ssd_spread"}, 181 {Opt_ssd_spread, "ssd_spread"},
92 {Opt_nossd, "nossd"}, 182 {Opt_nossd, "nossd"},
@@ -98,6 +188,8 @@ static match_table_t tokens = {
98 {Opt_space_cache, "space_cache"}, 188 {Opt_space_cache, "space_cache"},
99 {Opt_clear_cache, "clear_cache"}, 189 {Opt_clear_cache, "clear_cache"},
100 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 190 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
191 {Opt_enospc_debug, "enospc_debug"},
192 {Opt_subvolrootid, "subvolrootid=%d"},
101 {Opt_err, NULL}, 193 {Opt_err, NULL},
102}; 194};
103 195
@@ -112,6 +204,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
112 char *p, *num, *orig; 204 char *p, *num, *orig;
113 int intarg; 205 int intarg;
114 int ret = 0; 206 int ret = 0;
207 char *compress_type;
208 bool compress_force = false;
115 209
116 if (!options) 210 if (!options)
117 return 0; 211 return 0;
@@ -139,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
139 break; 233 break;
140 case Opt_subvol: 234 case Opt_subvol:
141 case Opt_subvolid: 235 case Opt_subvolid:
236 case Opt_subvolrootid:
142 case Opt_device: 237 case Opt_device:
143 /* 238 /*
144 * These are parsed by btrfs_parse_early_options 239 * These are parsed by btrfs_parse_early_options
@@ -154,14 +249,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, NODATACOW); 249 btrfs_set_opt(info->mount_opt, NODATACOW);
155 btrfs_set_opt(info->mount_opt, NODATASUM); 250 btrfs_set_opt(info->mount_opt, NODATASUM);
156 break; 251 break;
157 case Opt_compress:
158 printk(KERN_INFO "btrfs: use compression\n");
159 btrfs_set_opt(info->mount_opt, COMPRESS);
160 break;
161 case Opt_compress_force: 252 case Opt_compress_force:
162 printk(KERN_INFO "btrfs: forcing compression\n"); 253 case Opt_compress_force_type:
163 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 254 compress_force = true;
255 case Opt_compress:
256 case Opt_compress_type:
257 if (token == Opt_compress ||
258 token == Opt_compress_force ||
259 strcmp(args[0].from, "zlib") == 0) {
260 compress_type = "zlib";
261 info->compress_type = BTRFS_COMPRESS_ZLIB;
262 } else if (strcmp(args[0].from, "lzo") == 0) {
263 compress_type = "lzo";
264 info->compress_type = BTRFS_COMPRESS_LZO;
265 } else {
266 ret = -EINVAL;
267 goto out;
268 }
269
164 btrfs_set_opt(info->mount_opt, COMPRESS); 270 btrfs_set_opt(info->mount_opt, COMPRESS);
271 if (compress_force) {
272 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
273 pr_info("btrfs: force %s compression\n",
274 compress_type);
275 } else
276 pr_info("btrfs: use %s compression\n",
277 compress_type);
165 break; 278 break;
166 case Opt_ssd: 279 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 280 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
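
In practice, "mount -o compress" and "mount -o compress=zlib" are now equivalent, "mount -o compress=lzo" selects the LZO support added elsewhere in this patch, and an unrecognized type such as compress=foo fails the mount with -EINVAL instead of being silently ignored.
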
@@ -252,6 +365,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
252 case Opt_user_subvol_rm_allowed: 365 case Opt_user_subvol_rm_allowed:
253 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 366 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
254 break; 367 break;
368 case Opt_enospc_debug:
369 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
370 break;
255 case Opt_err: 371 case Opt_err:
256 printk(KERN_INFO "btrfs: unrecognized mount option " 372 printk(KERN_INFO "btrfs: unrecognized mount option "
257 "'%s'\n", p); 373 "'%s'\n", p);
@@ -274,10 +390,10 @@ out:
274 */ 390 */
275static int btrfs_parse_early_options(const char *options, fmode_t flags, 391static int btrfs_parse_early_options(const char *options, fmode_t flags,
276 void *holder, char **subvol_name, u64 *subvol_objectid, 392 void *holder, char **subvol_name, u64 *subvol_objectid,
277 struct btrfs_fs_devices **fs_devices) 393 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
278{ 394{
279 substring_t args[MAX_OPT_ARGS]; 395 substring_t args[MAX_OPT_ARGS];
280 char *opts, *p; 396 char *opts, *orig, *p;
281 int error = 0; 397 int error = 0;
282 int intarg; 398 int intarg;
283 399
@@ -291,6 +407,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
291 opts = kstrdup(options, GFP_KERNEL); 407 opts = kstrdup(options, GFP_KERNEL);
292 if (!opts) 408 if (!opts)
293 return -ENOMEM; 409 return -ENOMEM;
410 orig = opts;
294 411
295 while ((p = strsep(&opts, ",")) != NULL) { 412 while ((p = strsep(&opts, ",")) != NULL) {
296 int token; 413 int token;
@@ -314,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
314 *subvol_objectid = intarg; 431 *subvol_objectid = intarg;
315 } 432 }
316 break; 433 break;
434 case Opt_subvolrootid:
435 intarg = 0;
436 error = match_int(&args[0], &intarg);
437 if (!error) {
438 /* we want the original fs_tree */
439 if (!intarg)
440 *subvol_rootid =
441 BTRFS_FS_TREE_OBJECTID;
442 else
443 *subvol_rootid = intarg;
444 }
445 break;
317 case Opt_device: 446 case Opt_device:
318 error = btrfs_scan_one_device(match_strdup(&args[0]), 447 error = btrfs_scan_one_device(match_strdup(&args[0]),
319 flags, holder, fs_devices); 448 flags, holder, fs_devices);
@@ -326,7 +455,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
326 } 455 }
327 456
328 out_free_opts: 457 out_free_opts:
329 kfree(opts); 458 kfree(orig);
330 out: 459 out:
331 /* 460 /*
332 * If no subvolume name is specified we use the default one. Allocate 461 * If no subvolume name is specified we use the default one. Allocate
@@ -508,6 +637,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
508 struct btrfs_root *root = btrfs_sb(sb); 637 struct btrfs_root *root = btrfs_sb(sb);
509 int ret; 638 int ret;
510 639
640 trace_btrfs_sync_fs(wait);
641
511 if (!wait) { 642 if (!wait) {
512 filemap_flush(root->fs_info->btree_inode->i_mapping); 643 filemap_flush(root->fs_info->btree_inode->i_mapping);
513 return 0; 644 return 0;
@@ -517,6 +648,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
517 btrfs_wait_ordered_extents(root, 0, 0); 648 btrfs_wait_ordered_extents(root, 0, 0);
518 649
519 trans = btrfs_start_transaction(root, 0); 650 trans = btrfs_start_transaction(root, 0);
651 if (IS_ERR(trans))
652 return PTR_ERR(trans);
520 ret = btrfs_commit_transaction(trans, root); 653 ret = btrfs_commit_transaction(trans, root);
521 return ret; 654 return ret;
522} 655}
@@ -525,6 +658,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
525{ 658{
526 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 659 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
527 struct btrfs_fs_info *info = root->fs_info; 660 struct btrfs_fs_info *info = root->fs_info;
661 char *compress_type;
528 662
529 if (btrfs_test_opt(root, DEGRADED)) 663 if (btrfs_test_opt(root, DEGRADED))
530 seq_puts(seq, ",degraded"); 664 seq_puts(seq, ",degraded");
@@ -543,8 +677,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
543 if (info->thread_pool_size != min_t(unsigned long, 677 if (info->thread_pool_size != min_t(unsigned long,
544 num_online_cpus() + 2, 8)) 678 num_online_cpus() + 2, 8))
545 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 679 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
546 if (btrfs_test_opt(root, COMPRESS)) 680 if (btrfs_test_opt(root, COMPRESS)) {
547 seq_puts(seq, ",compress"); 681 if (info->compress_type == BTRFS_COMPRESS_ZLIB)
682 compress_type = "zlib";
683 else
684 compress_type = "lzo";
685 if (btrfs_test_opt(root, FORCE_COMPRESS))
686 seq_printf(seq, ",compress-force=%s", compress_type);
687 else
688 seq_printf(seq, ",compress=%s", compress_type);
689 }
548 if (btrfs_test_opt(root, NOSSD)) 690 if (btrfs_test_opt(root, NOSSD))
549 seq_puts(seq, ",nossd"); 691 seq_puts(seq, ",nossd");
550 if (btrfs_test_opt(root, SSD_SPREAD)) 692 if (btrfs_test_opt(root, SSD_SPREAD))
@@ -559,6 +701,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
559 seq_puts(seq, ",discard"); 701 seq_puts(seq, ",discard");
560 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 702 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
561 seq_puts(seq, ",noacl"); 703 seq_puts(seq, ",noacl");
704 if (btrfs_test_opt(root, SPACE_CACHE))
705 seq_puts(seq, ",space_cache");
706 if (btrfs_test_opt(root, CLEAR_CACHE))
707 seq_puts(seq, ",clear_cache");
708 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
709 seq_puts(seq, ",user_subvol_rm_allowed");
562 return 0; 710 return 0;
563} 711}
564 712
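
With the additions above, the chosen compression type and the cache options survive a round trip through /proc/mounts: a filesystem mounted with compress-force=lzo now reports ",compress-force=lzo" rather than the old, ambiguous ",compress".
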
@@ -602,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
602 fmode_t mode = FMODE_READ; 750 fmode_t mode = FMODE_READ;
603 char *subvol_name = NULL; 751 char *subvol_name = NULL;
604 u64 subvol_objectid = 0; 752 u64 subvol_objectid = 0;
753 u64 subvol_rootid = 0;
605 int error = 0; 754 int error = 0;
606 755
607 if (!(flags & MS_RDONLY)) 756 if (!(flags & MS_RDONLY))
@@ -609,7 +758,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
609 758
610 error = btrfs_parse_early_options(data, mode, fs_type, 759 error = btrfs_parse_early_options(data, mode, fs_type,
611 &subvol_name, &subvol_objectid, 760 &subvol_name, &subvol_objectid,
612 &fs_devices); 761 &subvol_rootid, &fs_devices);
613 if (error) 762 if (error)
614 return ERR_PTR(error); 763 return ERR_PTR(error);
615 764
@@ -655,6 +804,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
655 } 804 }
656 805
657 btrfs_close_devices(fs_devices); 806 btrfs_close_devices(fs_devices);
807 kfree(fs_info);
808 kfree(tree_root);
658 } else { 809 } else {
659 char b[BDEVNAME_SIZE]; 810 char b[BDEVNAME_SIZE];
660 811
@@ -671,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
671 s->s_flags |= MS_ACTIVE; 822 s->s_flags |= MS_ACTIVE;
672 } 823 }
673 824
674 root = get_default_root(s, subvol_objectid);
675 if (IS_ERR(root)) {
676 error = PTR_ERR(root);
677 deactivate_locked_super(s);
678 goto error_free_subvol_name;
679 }
680 /* if they gave us a subvolume name bind mount into that */ 825 /* if they gave us a subvolume name bind mount into that */
681 if (strcmp(subvol_name, ".")) { 826 if (strcmp(subvol_name, ".")) {
682 struct dentry *new_root; 827 struct dentry *new_root;
828
829 root = get_default_root(s, subvol_rootid);
830 if (IS_ERR(root)) {
831 error = PTR_ERR(root);
832 deactivate_locked_super(s);
833 goto error_free_subvol_name;
834 }
835
683 mutex_lock(&root->d_inode->i_mutex); 836 mutex_lock(&root->d_inode->i_mutex);
684 new_root = lookup_one_len(subvol_name, root, 837 new_root = lookup_one_len(subvol_name, root,
685 strlen(subvol_name)); 838 strlen(subvol_name));
@@ -700,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
700 } 853 }
701 dput(root); 854 dput(root);
702 root = new_root; 855 root = new_root;
856 } else {
857 root = get_default_root(s, subvol_objectid);
858 if (IS_ERR(root)) {
859 error = PTR_ERR(root);
860 deactivate_locked_super(s);
861 goto error_free_subvol_name;
862 }
703 } 863 }
704 864
705 kfree(subvol_name); 865 kfree(subvol_name);
@@ -753,6 +913,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
753 return 0; 913 return 0;
754} 914}
755 915
916/*
917 * Helper to calculate the free space on the devices that can be used to
918 * store file data.
919 */
920static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
921{
922 struct btrfs_fs_info *fs_info = root->fs_info;
923 struct btrfs_device_info *devices_info;
924 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
925 struct btrfs_device *device;
926 u64 skip_space;
927 u64 type;
928 u64 avail_space;
929 u64 used_space;
930 u64 min_stripe_size;
931 int min_stripes = 1;
932 int i = 0, nr_devices;
933 int ret;
934
935 nr_devices = fs_info->fs_devices->rw_devices;
936 BUG_ON(!nr_devices);
937
938 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
939 GFP_NOFS);
940 if (!devices_info)
941 return -ENOMEM;
942
943 /* calc min stripe number for data space allocation */
944 type = btrfs_get_alloc_profile(root, 1);
945 if (type & BTRFS_BLOCK_GROUP_RAID0)
946 min_stripes = 2;
947 else if (type & BTRFS_BLOCK_GROUP_RAID1)
948 min_stripes = 2;
949 else if (type & BTRFS_BLOCK_GROUP_RAID10)
950 min_stripes = 4;
951
952 if (type & BTRFS_BLOCK_GROUP_DUP)
953 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
954 else
955 min_stripe_size = BTRFS_STRIPE_LEN;
956
957 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
958 if (!device->in_fs_metadata)
959 continue;
960
961 avail_space = device->total_bytes - device->bytes_used;
962
963 /* align with stripe_len */
964 do_div(avail_space, BTRFS_STRIPE_LEN);
965 avail_space *= BTRFS_STRIPE_LEN;
966
967 /*
968 * In order to avoid overwriting the superblock on the drive,
969 * btrfs starts at an offset of at least 1MB when doing chunk
970 * allocation.
971 */
972 skip_space = 1024 * 1024;
973
974 /* user can set the offset in fs_info->alloc_start. */
975 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
976 device->total_bytes)
977 skip_space = max(fs_info->alloc_start, skip_space);
978
979 /*
980 * btrfs can not use the free space in [0, skip_space - 1], so
981 * we must subtract it from the total. To implement this, we
982 * account the used space in this range first.
983 */
984 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
985 &used_space);
986 if (ret) {
987 kfree(devices_info);
988 return ret;
989 }
990
991 /* calc the free space in [0, skip_space - 1] */
992 skip_space -= used_space;
993
994 /*
995 * we can not use the free space in [0, skip_space - 1], so
996 * subtract it from the total.
997 */
998 if (avail_space && avail_space >= skip_space)
999 avail_space -= skip_space;
1000 else
1001 avail_space = 0;
1002
1003 if (avail_space < min_stripe_size)
1004 continue;
1005
1006 devices_info[i].dev = device;
1007 devices_info[i].max_avail = avail_space;
1008
1009 i++;
1010 }
1011
1012 nr_devices = i;
1013
1014 btrfs_descending_sort_devices(devices_info, nr_devices);
1015
1016 i = nr_devices - 1;
1017 avail_space = 0;
1018 while (nr_devices >= min_stripes) {
1019 if (devices_info[i].max_avail >= min_stripe_size) {
1020 int j;
1021 u64 alloc_size;
1022
1023 avail_space += devices_info[i].max_avail * min_stripes;
1024 alloc_size = devices_info[i].max_avail;
1025 for (j = i + 1 - min_stripes; j <= i; j++)
1026 devices_info[j].max_avail -= alloc_size;
1027 }
1028 i--;
1029 nr_devices--;
1030 }
1031
1032 kfree(devices_info);
1033 *free_bytes = avail_space;
1034 return 0;
1035}
1036
756static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1037static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
757{ 1038{
758 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1039 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
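
A worked example of the estimator above, with hypothetical numbers: take RAID1 (min_stripes = 2) and three devices with 10 GiB, 8 GiB and 4 GiB of allocatable space, sorted descending. Walking up from the smallest device, the loop first pairs the 4 GiB device with the 8 GiB one, accounting 4 * 2 = 8 GiB of raw space and leaving them with 0 and 4 GiB; it then pairs that remaining 4 GiB with the largest device for another 8 GiB. The estimate is 16 GiB of raw space still available to new data chunks, deliberately on the conservative side.
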
@@ -760,17 +1041,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
760 struct list_head *head = &root->fs_info->space_info; 1041 struct list_head *head = &root->fs_info->space_info;
761 struct btrfs_space_info *found; 1042 struct btrfs_space_info *found;
762 u64 total_used = 0; 1043 u64 total_used = 0;
763 u64 total_used_data = 0; 1044 u64 total_free_data = 0;
764 int bits = dentry->d_sb->s_blocksize_bits; 1045 int bits = dentry->d_sb->s_blocksize_bits;
765 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1046 __be32 *fsid = (__be32 *)root->fs_info->fsid;
1047 int ret;
766 1048
1049 /* holding chunk_mutex to avoid allocating new chunks */
1050 mutex_lock(&root->fs_info->chunk_mutex);
767 rcu_read_lock(); 1051 rcu_read_lock();
768 list_for_each_entry_rcu(found, head, list) { 1052 list_for_each_entry_rcu(found, head, list) {
769 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | 1053 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
770 BTRFS_BLOCK_GROUP_SYSTEM)) 1054 total_free_data += found->disk_total - found->disk_used;
771 total_used_data += found->disk_total; 1055 total_free_data -=
772 else 1056 btrfs_account_ro_block_groups_free_space(found);
773 total_used_data += found->disk_used; 1057 }
1058
774 total_used += found->disk_used; 1059 total_used += found->disk_used;
775 } 1060 }
776 rcu_read_unlock(); 1061 rcu_read_unlock();
@@ -778,9 +1063,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
778 buf->f_namelen = BTRFS_NAME_LEN; 1063 buf->f_namelen = BTRFS_NAME_LEN;
779 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1064 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
780 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1065 buf->f_bfree = buf->f_blocks - (total_used >> bits);
781 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
782 buf->f_bsize = dentry->d_sb->s_blocksize; 1066 buf->f_bsize = dentry->d_sb->s_blocksize;
783 buf->f_type = BTRFS_SUPER_MAGIC; 1067 buf->f_type = BTRFS_SUPER_MAGIC;
1068 buf->f_bavail = total_free_data;
1069 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1070 if (ret) {
1071 mutex_unlock(&root->fs_info->chunk_mutex);
1072 return ret;
1073 }
1074 buf->f_bavail += total_free_data;
1075 buf->f_bavail = buf->f_bavail >> bits;
1076 mutex_unlock(&root->fs_info->chunk_mutex);
784 1077
785 /* We treat it as constant endianness (it doesn't matter _which_) 1078 /* We treat it as constant endianness (it doesn't matter _which_)
786 because we want the fsid to come out the same whether mounted 1079 because we want the fsid to come out the same whether mounted
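
The net effect on statfs(): f_bavail is no longer derived by subtracting used data space from the device total, which overshot badly on multi-device profiles. It is now the free space still inside existing data block groups (minus the free space pinned in read-only block groups) plus the raw-space estimate above for data chunks that could still be allocated, all computed under chunk_mutex so the numbers cannot shift mid-calculation.
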
@@ -897,10 +1190,14 @@ static int __init init_btrfs_fs(void)
897 if (err) 1190 if (err)
898 return err; 1191 return err;
899 1192
900 err = btrfs_init_cachep(); 1193 err = btrfs_init_compress();
901 if (err) 1194 if (err)
902 goto free_sysfs; 1195 goto free_sysfs;
903 1196
1197 err = btrfs_init_cachep();
1198 if (err)
1199 goto free_compress;
1200
904 err = extent_io_init(); 1201 err = extent_io_init();
905 if (err) 1202 if (err)
906 goto free_cachep; 1203 goto free_cachep;
@@ -928,6 +1225,8 @@ free_extent_io:
928 extent_io_exit(); 1225 extent_io_exit();
929free_cachep: 1226free_cachep:
930 btrfs_destroy_cachep(); 1227 btrfs_destroy_cachep();
1228free_compress:
1229 btrfs_exit_compress();
931free_sysfs: 1230free_sysfs:
932 btrfs_exit_sysfs(); 1231 btrfs_exit_sysfs();
933 return err; 1232 return err;
@@ -942,7 +1241,7 @@ static void __exit exit_btrfs_fs(void)
942 unregister_filesystem(&btrfs_fs_type); 1241 unregister_filesystem(&btrfs_fs_type);
943 btrfs_exit_sysfs(); 1242 btrfs_exit_sysfs();
944 btrfs_cleanup_fs_uuids(); 1243 btrfs_cleanup_fs_uuids();
945 btrfs_zlib_exit(); 1244 btrfs_exit_compress();
946} 1245}
947 1246
948module_init(init_btrfs_fs) 1247module_init(init_btrfs_fs)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f50e931fc217..c571734d5e5a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,10 +32,8 @@
32 32
33static noinline void put_transaction(struct btrfs_transaction *transaction) 33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{ 34{
35 WARN_ON(transaction->use_count == 0); 35 WARN_ON(atomic_read(&transaction->use_count) == 0);
36 transaction->use_count--; 36 if (atomic_dec_and_test(&transaction->use_count)) {
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction)); 37 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 38 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 39 }
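
This converts put_transaction() to the classic lock-free refcount drop. A user-space sketch of the same pattern using C11 atomics (the names are illustrative, not kernel code; atomic_fetch_sub() returning 1 plays the role of the kernel's atomic_dec_and_test()):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct transaction {
		atomic_int use_count;
		/* ... payload ... */
	};

	static struct transaction *get_transaction(struct transaction *t)
	{
		atomic_fetch_add(&t->use_count, 1);
		return t;
	}

	static void put_transaction(struct transaction *t)
	{
		/* whoever drops the count from 1 to 0 frees the object */
		if (atomic_fetch_sub(&t->use_count, 1) == 1)
			free(t);
	}

Note that list_del_init() moves out of here and into btrfs_commit_transaction() (see the hunk near the end of this file): once the count is atomic, the final put can happen without trans_mutex held, so the transaction must be unlinked from the shared list while the lock is still taken.
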
@@ -57,16 +55,17 @@ static noinline int join_transaction(struct btrfs_root *root)
57 if (!cur_trans) { 55 if (!cur_trans) {
58 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, 56 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
59 GFP_NOFS); 57 GFP_NOFS);
60 BUG_ON(!cur_trans); 58 if (!cur_trans)
59 return -ENOMEM;
61 root->fs_info->generation++; 60 root->fs_info->generation++;
62 cur_trans->num_writers = 1; 61 atomic_set(&cur_trans->num_writers, 1);
63 cur_trans->num_joined = 0; 62 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation; 63 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait); 64 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait); 65 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0; 66 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0; 67 cur_trans->blocked = 0;
69 cur_trans->use_count = 1; 68 atomic_set(&cur_trans->use_count, 1);
70 cur_trans->commit_done = 0; 69 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds(); 70 cur_trans->start_time = get_seconds();
72 71
@@ -87,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root)
87 root->fs_info->running_transaction = cur_trans; 86 root->fs_info->running_transaction = cur_trans;
88 spin_unlock(&root->fs_info->new_trans_lock); 87 spin_unlock(&root->fs_info->new_trans_lock);
89 } else { 88 } else {
90 cur_trans->num_writers++; 89 atomic_inc(&cur_trans->num_writers);
91 cur_trans->num_joined++; 90 cur_trans->num_joined++;
92 } 91 }
93 92
@@ -144,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root)
144 cur_trans = root->fs_info->running_transaction; 143 cur_trans = root->fs_info->running_transaction;
145 if (cur_trans && cur_trans->blocked) { 144 if (cur_trans && cur_trans->blocked) {
146 DEFINE_WAIT(wait); 145 DEFINE_WAIT(wait);
147 cur_trans->use_count++; 146 atomic_inc(&cur_trans->use_count);
148 while (1) { 147 while (1) {
149 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 148 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
150 TASK_UNINTERRUPTIBLE); 149 TASK_UNINTERRUPTIBLE);
@@ -180,7 +179,11 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
180{ 179{
181 struct btrfs_trans_handle *h; 180 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
183 int ret; 183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
184again: 187again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 189 if (!h)
@@ -192,10 +195,15 @@ again:
192 wait_current_trans(root); 195 wait_current_trans(root);
193 196
194 ret = join_transaction(root); 197 ret = join_transaction(root);
195 BUG_ON(ret); 198 if (ret < 0) {
199 kmem_cache_free(btrfs_trans_handle_cachep, h);
200 if (type != TRANS_JOIN_NOLOCK)
201 mutex_unlock(&root->fs_info->trans_mutex);
202 return ERR_PTR(ret);
203 }
196 204
197 cur_trans = root->fs_info->running_transaction; 205 cur_trans = root->fs_info->running_transaction;
198 cur_trans->use_count++; 206 atomic_inc(&cur_trans->use_count);
199 if (type != TRANS_JOIN_NOLOCK) 207 if (type != TRANS_JOIN_NOLOCK)
200 mutex_unlock(&root->fs_info->trans_mutex); 208 mutex_unlock(&root->fs_info->trans_mutex);
201 209
@@ -215,10 +223,18 @@ again:
215 223
216 if (num_items > 0) { 224 if (num_items > 0) {
217 ret = btrfs_trans_reserve_metadata(h, root, num_items); 225 ret = btrfs_trans_reserve_metadata(h, root, num_items);
218 if (ret == -EAGAIN) { 226 if (ret == -EAGAIN && !retries) {
227 retries++;
219 btrfs_commit_transaction(h, root); 228 btrfs_commit_transaction(h, root);
220 goto again; 229 goto again;
230 } else if (ret == -EAGAIN) {
231 /*
232 * We have already retried and got EAGAIN, so really we
233 * don't have space, so set ret to -ENOSPC.
234 */
235 ret = -ENOSPC;
221 } 236 }
237
222 if (ret < 0) { 238 if (ret < 0) {
223 btrfs_end_transaction(h, root); 239 btrfs_end_transaction(h, root);
224 return ERR_PTR(ret); 240 return ERR_PTR(ret);
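
So the reservation path now retries exactly once: the first -EAGAIN commits the running transaction in the hope that it frees reserved metadata space, and a second -EAGAIN is converted to a hard -ENOSPC instead of looping forever.
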
@@ -318,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
318 goto out_unlock; /* nothing committing|committed */ 334 goto out_unlock; /* nothing committing|committed */
319 } 335 }
320 336
321 cur_trans->use_count++; 337 atomic_inc(&cur_trans->use_count);
322 mutex_unlock(&root->fs_info->trans_mutex); 338 mutex_unlock(&root->fs_info->trans_mutex);
323 339
324 wait_for_commit(root, cur_trans); 340 wait_for_commit(root, cur_trans);
@@ -448,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
448 wake_up_process(info->transaction_kthread); 464 wake_up_process(info->transaction_kthread);
449 } 465 }
450 466
451 if (lock)
452 mutex_lock(&info->trans_mutex);
453 WARN_ON(cur_trans != info->running_transaction); 467 WARN_ON(cur_trans != info->running_transaction);
454 WARN_ON(cur_trans->num_writers < 1); 468 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
455 cur_trans->num_writers--; 469 atomic_dec(&cur_trans->num_writers);
456 470
457 smp_mb(); 471 smp_mb();
458 if (waitqueue_active(&cur_trans->writer_wait)) 472 if (waitqueue_active(&cur_trans->writer_wait))
459 wake_up(&cur_trans->writer_wait); 473 wake_up(&cur_trans->writer_wait);
460 put_transaction(cur_trans); 474 put_transaction(cur_trans);
461 if (lock)
462 mutex_unlock(&info->trans_mutex);
463 475
464 if (current->journal_info == trans) 476 if (current->journal_info == trans)
465 current->journal_info = NULL; 477 current->journal_info = NULL;
@@ -910,6 +922,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
910 u64 to_reserve = 0; 922 u64 to_reserve = 0;
911 u64 index = 0; 923 u64 index = 0;
912 u64 objectid; 924 u64 objectid;
925 u64 root_flags;
913 926
914 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 927 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
915 if (!new_root_item) { 928 if (!new_root_item) {
@@ -966,6 +979,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
966 record_root_in_trans(trans, root); 979 record_root_in_trans(trans, root);
967 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 980 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
968 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 981 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
982 btrfs_check_and_init_root_item(new_root_item);
983
984 root_flags = btrfs_root_flags(new_root_item);
985 if (pending->readonly)
986 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
987 else
988 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
989 btrfs_set_root_flags(new_root_item, root_flags);
969 990
970 old = btrfs_lock_root_node(root); 991 old = btrfs_lock_root_node(root);
971 btrfs_cow_block(trans, root, old, NULL, 0, &old); 992 btrfs_cow_block(trans, root, old, NULL, 0, &old);
@@ -1145,16 +1166,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1145 struct btrfs_transaction *cur_trans; 1166 struct btrfs_transaction *cur_trans;
1146 1167
1147 ac = kmalloc(sizeof(*ac), GFP_NOFS); 1168 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1148 BUG_ON(!ac); 1169 if (!ac)
1170 return -ENOMEM;
1149 1171
1150 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1172 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1151 ac->root = root; 1173 ac->root = root;
1152 ac->newtrans = btrfs_join_transaction(root, 0); 1174 ac->newtrans = btrfs_join_transaction(root, 0);
1175 if (IS_ERR(ac->newtrans)) {
1176 int err = PTR_ERR(ac->newtrans);
1177 kfree(ac);
1178 return err;
1179 }
1153 1180
1154 /* take transaction reference */ 1181 /* take transaction reference */
1155 mutex_lock(&root->fs_info->trans_mutex); 1182 mutex_lock(&root->fs_info->trans_mutex);
1156 cur_trans = trans->transaction; 1183 cur_trans = trans->transaction;
1157 cur_trans->use_count++; 1184 atomic_inc(&cur_trans->use_count);
1158 mutex_unlock(&root->fs_info->trans_mutex); 1185 mutex_unlock(&root->fs_info->trans_mutex);
1159 1186
1160 btrfs_end_transaction(trans, root); 1187 btrfs_end_transaction(trans, root);
@@ -1213,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1213 1240
1214 mutex_lock(&root->fs_info->trans_mutex); 1241 mutex_lock(&root->fs_info->trans_mutex);
1215 if (cur_trans->in_commit) { 1242 if (cur_trans->in_commit) {
1216 cur_trans->use_count++; 1243 atomic_inc(&cur_trans->use_count);
1217 mutex_unlock(&root->fs_info->trans_mutex); 1244 mutex_unlock(&root->fs_info->trans_mutex);
1218 btrfs_end_transaction(trans, root); 1245 btrfs_end_transaction(trans, root);
1219 1246
@@ -1235,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1235 prev_trans = list_entry(cur_trans->list.prev, 1262 prev_trans = list_entry(cur_trans->list.prev,
1236 struct btrfs_transaction, list); 1263 struct btrfs_transaction, list);
1237 if (!prev_trans->commit_done) { 1264 if (!prev_trans->commit_done) {
1238 prev_trans->use_count++; 1265 atomic_inc(&prev_trans->use_count);
1239 mutex_unlock(&root->fs_info->trans_mutex); 1266 mutex_unlock(&root->fs_info->trans_mutex);
1240 1267
1241 wait_for_commit(root, prev_trans); 1268 wait_for_commit(root, prev_trans);
@@ -1276,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1276 TASK_UNINTERRUPTIBLE); 1303 TASK_UNINTERRUPTIBLE);
1277 1304
1278 smp_mb(); 1305 smp_mb();
1279 if (cur_trans->num_writers > 1) 1306 if (atomic_read(&cur_trans->num_writers) > 1)
1280 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1307 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1281 else if (should_grow) 1308 else if (should_grow)
1282 schedule_timeout(1); 1309 schedule_timeout(1);
1283 1310
1284 mutex_lock(&root->fs_info->trans_mutex); 1311 mutex_lock(&root->fs_info->trans_mutex);
1285 finish_wait(&cur_trans->writer_wait, &wait); 1312 finish_wait(&cur_trans->writer_wait, &wait);
1286 } while (cur_trans->num_writers > 1 || 1313 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1287 (should_grow && cur_trans->num_joined != joined)); 1314 (should_grow && cur_trans->num_joined != joined));
1288 1315
1289 ret = create_pending_snapshots(trans, root->fs_info); 1316 ret = create_pending_snapshots(trans, root->fs_info);
@@ -1370,9 +1397,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1370 1397
1371 wake_up(&cur_trans->commit_wait); 1398 wake_up(&cur_trans->commit_wait);
1372 1399
1400 list_del_init(&cur_trans->list);
1373 put_transaction(cur_trans); 1401 put_transaction(cur_trans);
1374 put_transaction(cur_trans); 1402 put_transaction(cur_trans);
1375 1403
1404 trace_btrfs_transaction_commit(root);
1405
1376 mutex_unlock(&root->fs_info->trans_mutex); 1406 mutex_unlock(&root->fs_info->trans_mutex);
1377 1407
1378 if (current->journal_info == trans) 1408 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..e441acc6c584 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,11 @@ struct btrfs_transaction {
27 * total writers in this transaction, it must be zero before the 27 * total writers in this transaction, it must be zero before the
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 unsigned long num_writers; 30 atomic_t num_writers;
31 31
32 unsigned long num_joined; 32 unsigned long num_joined;
33 int in_commit; 33 int in_commit;
34 int use_count; 34 atomic_t use_count;
35 int commit_done; 35 int commit_done;
36 int blocked; 36 int blocked;
37 struct list_head list; 37 struct list_head list;
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */ 63 /* extra metadata reservation for relocation */
64 int error; 64 int error;
65 bool readonly;
65 struct list_head list; 66 struct list_head list;
66}; 67};
67 68
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac5719..c50271ad3157 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(root, path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(root, path);
670 679
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 753 int match = 0;
745 754
746 path = btrfs_alloc_path(); 755 path = btrfs_alloc_path();
756 if (!path)
757 return -ENOMEM;
758
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 760 if (ret != 0)
749 goto out; 761 goto out;
@@ -787,12 +799,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
787 struct inode *dir; 799 struct inode *dir;
788 int ret; 800 int ret;
789 struct btrfs_inode_ref *ref; 801 struct btrfs_inode_ref *ref;
790 struct btrfs_dir_item *di;
791 struct inode *inode; 802 struct inode *inode;
792 char *name; 803 char *name;
793 int namelen; 804 int namelen;
794 unsigned long ref_ptr; 805 unsigned long ref_ptr;
795 unsigned long ref_end; 806 unsigned long ref_end;
807 int search_done = 0;
796 808
797 /* 809 /*
798 * it is possible that we didn't log all the parent directories 810 * it is possible that we didn't log all the parent directories
@@ -833,7 +845,10 @@ again:
833 * existing back reference, and we don't want to create 845 * existing back reference, and we don't want to create
834 * dangling pointers in the directory. 846 * dangling pointers in the directory.
835 */ 847 */
836conflict_again: 848
849 if (search_done)
850 goto insert;
851
837 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 852 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
838 if (ret == 0) { 853 if (ret == 0) {
839 char *victim_name; 854 char *victim_name;
@@ -874,37 +889,21 @@ conflict_again:
874 ret = btrfs_unlink_inode(trans, root, dir, 889 ret = btrfs_unlink_inode(trans, root, dir,
875 inode, victim_name, 890 inode, victim_name,
876 victim_name_len); 891 victim_name_len);
877 kfree(victim_name);
878 btrfs_release_path(root, path);
879 goto conflict_again;
880 } 892 }
881 kfree(victim_name); 893 kfree(victim_name);
882 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 894 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
883 } 895 }
884 BUG_ON(ret); 896 BUG_ON(ret);
885 }
886 btrfs_release_path(root, path);
887 897
888 /* look for a conflicting sequence number */ 898 /*
889 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 899 * NOTE: we have searched the root tree and checked the
890 btrfs_inode_ref_index(eb, ref), 900 * corresponding ref; there is no need to check it again.
891 name, namelen, 0); 901 */
892 if (di && !IS_ERR(di)) { 902 search_done = 1;
893 ret = drop_one_dir_item(trans, root, path, dir, di);
894 BUG_ON(ret);
895 }
896 btrfs_release_path(root, path);
897
898
899 /* look for a conflicting name */
900 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
901 name, namelen, 0);
902 if (di && !IS_ERR(di)) {
903 ret = drop_one_dir_item(trans, root, path, dir, di);
904 BUG_ON(ret);
905 } 903 }
906 btrfs_release_path(root, path); 904 btrfs_release_path(root, path);
907 905
906insert:
908 /* insert our name */ 907 /* insert our name */
909 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 908 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
910 btrfs_inode_ref_index(eb, ref)); 909 btrfs_inode_ref_index(eb, ref));
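
The search_done flag makes conflict resolution a single pass: the first inode ref triggers the full scan for conflicting back references, and every later iteration of the outer loop jumps straight to the insert label instead of re-running the same search.
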
@@ -967,6 +966,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
967 key.offset = (u64)-1; 966 key.offset = (u64)-1;
968 967
969 path = btrfs_alloc_path(); 968 path = btrfs_alloc_path();
969 if (!path)
970 return -ENOMEM;
970 971
971 while (1) { 972 while (1) {
972 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 973 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1179,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1178 1179
1179 name_len = btrfs_dir_name_len(eb, di); 1180 name_len = btrfs_dir_name_len(eb, di);
1180 name = kmalloc(name_len, GFP_NOFS); 1181 name = kmalloc(name_len, GFP_NOFS);
1182 if (!name)
1183 return -ENOMEM;
1184
1181 log_type = btrfs_dir_type(eb, di); 1185 log_type = btrfs_dir_type(eb, di);
1182 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1186 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1183 name_len); 1187 name_len);
@@ -1269,6 +1273,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1269 ptr_end = ptr + item_size; 1273 ptr_end = ptr + item_size;
1270 while (ptr < ptr_end) { 1274 while (ptr < ptr_end) {
1271 di = (struct btrfs_dir_item *)ptr; 1275 di = (struct btrfs_dir_item *)ptr;
1276 if (verify_dir_item(root, eb, di))
1277 return -EIO;
1272 name_len = btrfs_dir_name_len(eb, di); 1278 name_len = btrfs_dir_name_len(eb, di);
1273 ret = replay_one_name(trans, root, path, eb, di, key); 1279 ret = replay_one_name(trans, root, path, eb, di, key);
1274 BUG_ON(ret); 1280 BUG_ON(ret);
@@ -1395,6 +1401,11 @@ again:
1395 ptr_end = ptr + item_size; 1401 ptr_end = ptr + item_size;
1396 while (ptr < ptr_end) { 1402 while (ptr < ptr_end) {
1397 di = (struct btrfs_dir_item *)ptr; 1403 di = (struct btrfs_dir_item *)ptr;
1404 if (verify_dir_item(root, eb, di)) {
1405 ret = -EIO;
1406 goto out;
1407 }
1408
1398 name_len = btrfs_dir_name_len(eb, di); 1409 name_len = btrfs_dir_name_len(eb, di);
1399 name = kmalloc(name_len, GFP_NOFS); 1410 name = kmalloc(name_len, GFP_NOFS);
1400 if (!name) { 1411 if (!name) {
@@ -1692,6 +1703,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1692 root_owner = btrfs_header_owner(parent); 1703 root_owner = btrfs_header_owner(parent);
1693 1704
1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1705 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1706 if (!next)
1707 return -ENOMEM;
1695 1708
1696 if (*level == 1) { 1709 if (*level == 1) {
1697 wc->process_func(root, next, wc, ptr_gen); 1710 wc->process_func(root, next, wc, ptr_gen);
@@ -1802,7 +1815,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1802 int orig_level; 1815 int orig_level;
1803 1816
1804 path = btrfs_alloc_path(); 1817 path = btrfs_alloc_path();
1805 BUG_ON(!path); 1818 if (!path)
1819 return -ENOMEM;
1806 1820
1807 level = btrfs_header_level(log->node); 1821 level = btrfs_header_level(log->node);
1808 orig_level = level; 1822 orig_level = level;
@@ -2032,6 +2046,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2032 wait_log_commit(trans, log_root_tree, 2046 wait_log_commit(trans, log_root_tree,
2033 log_root_tree->log_transid); 2047 log_root_tree->log_transid);
2034 mutex_unlock(&log_root_tree->log_mutex); 2048 mutex_unlock(&log_root_tree->log_mutex);
2049 ret = 0;
2035 goto out; 2050 goto out;
2036 } 2051 }
2037 atomic_set(&log_root_tree->log_commit[index2], 1); 2052 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2096,7 +2111,7 @@ out:
2096 smp_mb(); 2111 smp_mb();
2097 if (waitqueue_active(&root->log_commit_wait[index1])) 2112 if (waitqueue_active(&root->log_commit_wait[index1]))
2098 wake_up(&root->log_commit_wait[index1]); 2113 wake_up(&root->log_commit_wait[index1]);
2099 return 0; 2114 return ret;
2100} 2115}
2101 2116
2102static void free_log_tree(struct btrfs_trans_handle *trans, 2117static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2194,6 +2209,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2194 2209
2195 log = root->log_root; 2210 log = root->log_root;
2196 path = btrfs_alloc_path(); 2211 path = btrfs_alloc_path();
2212 if (!path)
2213 return -ENOMEM;
2214
2197 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2215 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2198 name, name_len, -1); 2216 name, name_len, -1);
2199 if (IS_ERR(di)) { 2217 if (IS_ERR(di)) {
@@ -2594,6 +2612,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2594 2612
2595 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2613 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2596 nr * sizeof(u32), GFP_NOFS); 2614 nr * sizeof(u32), GFP_NOFS);
2615 if (!ins_data)
2616 return -ENOMEM;
2617
2597 ins_sizes = (u32 *)ins_data; 2618 ins_sizes = (u32 *)ins_data;
2598 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2619 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2599 2620
@@ -2725,7 +2746,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2725 log = root->log_root; 2746 log = root->log_root;
2726 2747
2727 path = btrfs_alloc_path(); 2748 path = btrfs_alloc_path();
2749 if (!path)
2750 return -ENOMEM;
2728 dst_path = btrfs_alloc_path(); 2751 dst_path = btrfs_alloc_path();
2752 if (!dst_path) {
2753 btrfs_free_path(path);
2754 return -ENOMEM;
2755 }
2729 2756
2730 min_key.objectid = inode->i_ino; 2757 min_key.objectid = inode->i_ino;
2731 min_key.type = BTRFS_INODE_ITEM_KEY; 2758 min_key.type = BTRFS_INODE_ITEM_KEY;
@@ -3075,16 +3102,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3075 .stage = 0, 3102 .stage = 0,
3076 }; 3103 };
3077 3104
3078 fs_info->log_root_recovering = 1;
3079 path = btrfs_alloc_path(); 3105 path = btrfs_alloc_path();
3080 BUG_ON(!path); 3106 if (!path)
3107 return -ENOMEM;
3108
3109 fs_info->log_root_recovering = 1;
3081 3110
3082 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3111 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3112 BUG_ON(IS_ERR(trans));
3083 3113
3084 wc.trans = trans; 3114 wc.trans = trans;
3085 wc.pin = 1; 3115 wc.pin = 1;
3086 3116
3087 walk_log_tree(trans, log_root_tree, &wc); 3117 ret = walk_log_tree(trans, log_root_tree, &wc);
3118 BUG_ON(ret);
3088 3119
3089again: 3120again:
3090 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3121 key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3108,8 +3139,7 @@ again:
3108 3139
3109 log = btrfs_read_fs_root_no_radix(log_root_tree, 3140 log = btrfs_read_fs_root_no_radix(log_root_tree,
3110 &found_key); 3141 &found_key);
3111 BUG_ON(!log); 3142 BUG_ON(IS_ERR(log));
3112
3113 3143
3114 tmp_key.objectid = found_key.offset; 3144 tmp_key.objectid = found_key.offset;
3115 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3145 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1718e1a5c320..309a57b9fc85 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -32,17 +33,6 @@
32#include "volumes.h" 33#include "volumes.h"
33#include "async-thread.h" 34#include "async-thread.h"
34 35
35struct map_lookup {
36 u64 type;
37 int io_align;
38 int io_width;
39 int stripe_len;
40 int sector_size;
41 int num_stripes;
42 int sub_stripes;
43 struct btrfs_bio_stripe stripes[];
44};
45
46static int init_first_rw_device(struct btrfs_trans_handle *trans, 36static int init_first_rw_device(struct btrfs_trans_handle *trans,
47 struct btrfs_root *root, 37 struct btrfs_root *root,
48 struct btrfs_device *device); 38 struct btrfs_device *device);
@@ -161,7 +151,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
161 struct bio *cur; 151 struct bio *cur;
162 int again = 0; 152 int again = 0;
163 unsigned long num_run; 153 unsigned long num_run;
164 unsigned long num_sync_run;
165 unsigned long batch_run = 0; 154 unsigned long batch_run = 0;
166 unsigned long limit; 155 unsigned long limit;
167 unsigned long last_waited = 0; 156 unsigned long last_waited = 0;
@@ -172,11 +161,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
172 limit = btrfs_async_submit_limit(fs_info); 161 limit = btrfs_async_submit_limit(fs_info);
173 limit = limit * 2 / 3; 162 limit = limit * 2 / 3;
174 163
175 /* we want to make sure that every time we switch from the sync
176 * list to the normal list, we unplug
177 */
178 num_sync_run = 0;
179
180loop: 164loop:
181 spin_lock(&device->io_lock); 165 spin_lock(&device->io_lock);
182 166
@@ -222,15 +206,6 @@ loop_lock:
222 206
223 spin_unlock(&device->io_lock); 207 spin_unlock(&device->io_lock);
224 208
225 /*
226 * if we're doing the regular priority list, make sure we unplug
227 * for any high prio bios we've sent down
228 */
229 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
230 num_sync_run = 0;
231 blk_run_backing_dev(bdi, NULL);
232 }
233
234 while (pending) { 209 while (pending) {
235 210
236 rmb(); 211 rmb();
@@ -258,19 +233,11 @@ loop_lock:
258 233
259 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 234 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
260 235
261 if (cur->bi_rw & REQ_SYNC)
262 num_sync_run++;
263
264 submit_bio(cur->bi_rw, cur); 236 submit_bio(cur->bi_rw, cur);
265 num_run++; 237 num_run++;
266 batch_run++; 238 batch_run++;
267 if (need_resched()) { 239 if (need_resched())
268 if (num_sync_run) {
269 blk_run_backing_dev(bdi, NULL);
270 num_sync_run = 0;
271 }
272 cond_resched(); 240 cond_resched();
273 }
274 241
275 /* 242 /*
276 * we made progress, there is more work to do and the bdi 243 * we made progress, there is more work to do and the bdi
@@ -303,13 +270,8 @@ loop_lock:
303 * against it before looping 270 * against it before looping
304 */ 271 */
305 last_waited = ioc->last_waited; 272 last_waited = ioc->last_waited;
306 if (need_resched()) { 273 if (need_resched())
307 if (num_sync_run) {
308 blk_run_backing_dev(bdi, NULL);
309 num_sync_run = 0;
310 }
311 cond_resched(); 274 cond_resched();
312 }
313 continue; 275 continue;
314 } 276 }
315 spin_lock(&device->io_lock); 277 spin_lock(&device->io_lock);
@@ -322,22 +284,6 @@ loop_lock:
322 } 284 }
323 } 285 }
324 286
325 if (num_sync_run) {
326 num_sync_run = 0;
327 blk_run_backing_dev(bdi, NULL);
328 }
329 /*
330 * IO has already been through a long path to get here. Checksumming,
331 * async helper threads, perhaps compression. We've done a pretty
332 * good job of collecting a batch of IO and should just unplug
333 * the device right away.
334 *
335 * This will help anyone who is waiting on the IO, they might have
336 * already unplugged, but managed to do so before the bio they
337 * cared about found its way down here.
338 */
339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched(); 287 cond_resched();
342 if (again) 288 if (again)
343 goto loop; 289 goto loop;
@@ -600,8 +546,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
600 set_blocksize(bdev, 4096); 546 set_blocksize(bdev, 4096);
601 547
602 bh = btrfs_read_dev_super(bdev); 548 bh = btrfs_read_dev_super(bdev);
603 if (!bh) 549 if (!bh) {
550 ret = -EINVAL;
604 goto error_close; 551 goto error_close;
552 }
605 553
606 disk_super = (struct btrfs_super_block *)bh->b_data; 554 disk_super = (struct btrfs_super_block *)bh->b_data;
607 devid = btrfs_stack_device_id(&disk_super->dev_item); 555 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -703,7 +651,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
703 goto error_close; 651 goto error_close;
704 bh = btrfs_read_dev_super(bdev); 652 bh = btrfs_read_dev_super(bdev);
705 if (!bh) { 653 if (!bh) {
706 ret = -EIO; 654 ret = -EINVAL;
707 goto error_close; 655 goto error_close;
708 } 656 }
709 disk_super = (struct btrfs_super_block *)bh->b_data; 657 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -729,59 +677,167 @@ error:
729 return ret; 677 return ret;
730} 678}
731 679
680/* helper to account the used device space in the range */
681int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
682 u64 end, u64 *length)
683{
684 struct btrfs_key key;
685 struct btrfs_root *root = device->dev_root;
686 struct btrfs_dev_extent *dev_extent;
687 struct btrfs_path *path;
688 u64 extent_end;
689 int ret;
690 int slot;
691 struct extent_buffer *l;
692
693 *length = 0;
694
695 if (start >= device->total_bytes)
696 return 0;
697
698 path = btrfs_alloc_path();
699 if (!path)
700 return -ENOMEM;
701 path->reada = 2;
702
703 key.objectid = device->devid;
704 key.offset = start;
705 key.type = BTRFS_DEV_EXTENT_KEY;
706
707 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
708 if (ret < 0)
709 goto out;
710 if (ret > 0) {
711 ret = btrfs_previous_item(root, path, key.objectid, key.type);
712 if (ret < 0)
713 goto out;
714 }
715
716 while (1) {
717 l = path->nodes[0];
718 slot = path->slots[0];
719 if (slot >= btrfs_header_nritems(l)) {
720 ret = btrfs_next_leaf(root, path);
721 if (ret == 0)
722 continue;
723 if (ret < 0)
724 goto out;
725
726 break;
727 }
728 btrfs_item_key_to_cpu(l, &key, slot);
729
730 if (key.objectid < device->devid)
731 goto next;
732
733 if (key.objectid > device->devid)
734 break;
735
736 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
737 goto next;
738
739 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
740 extent_end = key.offset + btrfs_dev_extent_length(l,
741 dev_extent);
742 if (key.offset <= start && extent_end > end) {
743 *length = end - start + 1;
744 break;
745 } else if (key.offset <= start && extent_end > start)
746 *length += extent_end - start;
747 else if (key.offset > start && extent_end <= end)
748 *length += extent_end - key.offset;
749 else if (key.offset > start && key.offset <= end) {
750 *length += end - key.offset + 1;
751 break;
752 } else if (key.offset > end)
753 break;
754
755next:
756 path->slots[0]++;
757 }
758 ret = 0;
759out:
760 btrfs_free_path(path);
761 return ret;
762}
763
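The four-way if/else chain above is just an interval-overlap sum over an inclusive [start, end] range. A minimal userspace sketch of the same arithmetic; overlap_len() is a hypothetical stand-in, not a kernel API:

    #include <stdint.h>
    #include <stdio.h>

    /* Clamp one extent to the inclusive range [start, end] and return the
     * overlapping length; this collapses the four cases handled in
     * btrfs_account_dev_extents_size() into one expression. */
    static uint64_t overlap_len(uint64_t ext_start, uint64_t ext_end,
                                uint64_t start, uint64_t end)
    {
        uint64_t lo = ext_start > start ? ext_start : start;
        uint64_t hi = ext_end < end + 1 ? ext_end : end + 1;
        return hi > lo ? hi - lo : 0;
    }

    int main(void)
    {
        /* two allocated extents, [0, 100) and [150, 300), queried over [50, 200] */
        uint64_t length = 0;
        length += overlap_len(0, 100, 50, 200);   /* 50 bytes */
        length += overlap_len(150, 300, 50, 200); /* 51 bytes, end is inclusive */
        printf("used in range: %llu\n", (unsigned long long)length);
        return 0;
    }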
732/* 764/*
765 * find_free_dev_extent - find free space in the specified device
 766 * @trans: transaction handle
 767 * @device: the device in which we search for the free space
768 * @num_bytes: the size of the free space that we need
769 * @start: store the start of the free space.
 770 * @len: the size of the free space that we find, or the size of the max
 771 * free space if we don't find suitable free space
772 *
733 * this uses a pretty simple search, the expectation is that it is 773 * this uses a pretty simple search, the expectation is that it is
734 * called very infrequently and that a given device has a small number 774 * called very infrequently and that a given device has a small number
735 * of extents 775 * of extents
776 *
 777 * @start is used to store the start of the free space if we find it. But if
 778 * we don't find suitable free space, it will be used to store the start
 779 * position of the max free space.
780 *
781 * @len is used to store the size of the free space that we find.
782 * But if we don't find suitable free space, it is used to store the size of
783 * the max free space.
736 */ 784 */
737int find_free_dev_extent(struct btrfs_trans_handle *trans, 785int find_free_dev_extent(struct btrfs_trans_handle *trans,
738 struct btrfs_device *device, u64 num_bytes, 786 struct btrfs_device *device, u64 num_bytes,
739 u64 *start, u64 *max_avail) 787 u64 *start, u64 *len)
740{ 788{
741 struct btrfs_key key; 789 struct btrfs_key key;
742 struct btrfs_root *root = device->dev_root; 790 struct btrfs_root *root = device->dev_root;
743 struct btrfs_dev_extent *dev_extent = NULL; 791 struct btrfs_dev_extent *dev_extent;
744 struct btrfs_path *path; 792 struct btrfs_path *path;
745 u64 hole_size = 0; 793 u64 hole_size;
746 u64 last_byte = 0; 794 u64 max_hole_start;
747 u64 search_start = 0; 795 u64 max_hole_size;
796 u64 extent_end;
797 u64 search_start;
748 u64 search_end = device->total_bytes; 798 u64 search_end = device->total_bytes;
749 int ret; 799 int ret;
750 int slot = 0; 800 int slot;
751 int start_found;
752 struct extent_buffer *l; 801 struct extent_buffer *l;
753 802
754 path = btrfs_alloc_path();
755 if (!path)
756 return -ENOMEM;
757 path->reada = 2;
758 start_found = 0;
759
760 /* FIXME use last free of some kind */ 803 /* FIXME use last free of some kind */
761 804
762 /* we don't want to overwrite the superblock on the drive, 805 /* we don't want to overwrite the superblock on the drive,
763 * so we make sure to start at an offset of at least 1MB 806 * so we make sure to start at an offset of at least 1MB
764 */ 807 */
765 search_start = max((u64)1024 * 1024, search_start); 808 search_start = 1024 * 1024;
766 809
767 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 810 if (root->fs_info->alloc_start + num_bytes <= search_end)
768 search_start = max(root->fs_info->alloc_start, search_start); 811 search_start = max(root->fs_info->alloc_start, search_start);
769 812
813 max_hole_start = search_start;
814 max_hole_size = 0;
815
816 if (search_start >= search_end) {
817 ret = -ENOSPC;
818 goto error;
819 }
820
821 path = btrfs_alloc_path();
822 if (!path) {
823 ret = -ENOMEM;
824 goto error;
825 }
826 path->reada = 2;
827
770 key.objectid = device->devid; 828 key.objectid = device->devid;
771 key.offset = search_start; 829 key.offset = search_start;
772 key.type = BTRFS_DEV_EXTENT_KEY; 830 key.type = BTRFS_DEV_EXTENT_KEY;
831
773 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 832 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
774 if (ret < 0) 833 if (ret < 0)
775 goto error; 834 goto out;
776 if (ret > 0) { 835 if (ret > 0) {
777 ret = btrfs_previous_item(root, path, key.objectid, key.type); 836 ret = btrfs_previous_item(root, path, key.objectid, key.type);
778 if (ret < 0) 837 if (ret < 0)
779 goto error; 838 goto out;
780 if (ret > 0)
781 start_found = 1;
782 } 839 }
783 l = path->nodes[0]; 840
784 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
785 while (1) { 841 while (1) {
786 l = path->nodes[0]; 842 l = path->nodes[0];
787 slot = path->slots[0]; 843 slot = path->slots[0];
@@ -790,24 +846,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
790 if (ret == 0) 846 if (ret == 0)
791 continue; 847 continue;
792 if (ret < 0) 848 if (ret < 0)
793 goto error; 849 goto out;
794no_more_items: 850
795 if (!start_found) { 851 break;
796 if (search_start >= search_end) {
797 ret = -ENOSPC;
798 goto error;
799 }
800 *start = search_start;
801 start_found = 1;
802 goto check_pending;
803 }
804 *start = last_byte > search_start ?
805 last_byte : search_start;
806 if (search_end <= *start) {
807 ret = -ENOSPC;
808 goto error;
809 }
810 goto check_pending;
811 } 852 }
812 btrfs_item_key_to_cpu(l, &key, slot); 853 btrfs_item_key_to_cpu(l, &key, slot);
813 854
@@ -815,48 +856,62 @@ no_more_items:
815 goto next; 856 goto next;
816 857
817 if (key.objectid > device->devid) 858 if (key.objectid > device->devid)
818 goto no_more_items; 859 break;
819 860
820 if (key.offset >= search_start && key.offset > last_byte && 861 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
821 start_found) { 862 goto next;
822 if (last_byte < search_start)
823 last_byte = search_start;
824 hole_size = key.offset - last_byte;
825 863
826 if (hole_size > *max_avail) 864 if (key.offset > search_start) {
827 *max_avail = hole_size; 865 hole_size = key.offset - search_start;
828 866
829 if (key.offset > last_byte && 867 if (hole_size > max_hole_size) {
830 hole_size >= num_bytes) { 868 max_hole_start = search_start;
831 *start = last_byte; 869 max_hole_size = hole_size;
832 goto check_pending; 870 }
871
872 /*
 873 * If this free space is greater than what we need,
874 * it must be the max free space that we have found
875 * until now, so max_hole_start must point to the start
876 * of this free space and the length of this free space
877 * is stored in max_hole_size. Thus, we return
878 * max_hole_start and max_hole_size and go back to the
879 * caller.
880 */
881 if (hole_size >= num_bytes) {
882 ret = 0;
883 goto out;
833 } 884 }
834 } 885 }
835 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
836 goto next;
837 886
838 start_found = 1;
839 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 887 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
840 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 888 extent_end = key.offset + btrfs_dev_extent_length(l,
889 dev_extent);
890 if (extent_end > search_start)
891 search_start = extent_end;
841next: 892next:
842 path->slots[0]++; 893 path->slots[0]++;
843 cond_resched(); 894 cond_resched();
844 } 895 }
845check_pending:
846 /* we have to make sure we didn't find an extent that has already
847 * been allocated by the map tree or the original allocation
848 */
849 BUG_ON(*start < search_start);
850 896
 851 if (*start + num_bytes > search_end) { 897 hole_size = search_end - search_start;
852 ret = -ENOSPC; 898 if (hole_size > max_hole_size) {
853 goto error; 899 max_hole_start = search_start;
900 max_hole_size = hole_size;
854 } 901 }
855 /* check for pending inserts here */
856 ret = 0;
857 902
858error: 903 /* See above. */
904 if (hole_size < num_bytes)
905 ret = -ENOSPC;
906 else
907 ret = 0;
908
909out:
859 btrfs_free_path(path); 910 btrfs_free_path(path);
911error:
912 *start = max_hole_start;
913 if (len)
914 *len = max_hole_size;
860 return ret; 915 return ret;
861} 916}
862 917
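The rewritten search is a single pass over the device's extents in offset order: each gap between the running search_start and the next extent is a candidate hole, the largest hole seen is remembered, and the first hole big enough ends the walk. A userspace sketch of that strategy, with find_hole() as a hypothetical stand-in for the kernel function:

    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint64_t start, len; };

    /* Scan extents sorted by start; report the first hole >= num_bytes, or
     * leave the max hole in *start/*len when nothing fits (the caller can
     * then retry with a smaller request, as __btrfs_alloc_chunk() does). */
    static int find_hole(const struct extent *ext, int n,
                         uint64_t search_start, uint64_t search_end,
                         uint64_t num_bytes, uint64_t *start, uint64_t *len)
    {
        uint64_t max_start = search_start, max_len = 0, hole;
        int i;

        for (i = 0; i < n; i++) {
            if (ext[i].start > search_start) {
                hole = ext[i].start - search_start;
                if (hole > max_len) {
                    max_start = search_start;
                    max_len = hole;
                }
                if (hole >= num_bytes)
                    goto out; /* a fitting hole is also the max so far */
            }
            if (ext[i].start + ext[i].len > search_start)
                search_start = ext[i].start + ext[i].len;
        }
        hole = search_end > search_start ? search_end - search_start : 0;
        if (hole > max_len) {
            max_start = search_start;
            max_len = hole;
        }
    out:
        *start = max_start;
        *len = max_len;
        return max_len >= num_bytes ? 0 : -1; /* -ENOSPC in the kernel */
    }

    int main(void)
    {
        struct extent e[] = { { 1 << 20, 4 << 20 }, { 8 << 20, 2 << 20 } };
        uint64_t start, len;
        int ret = find_hole(e, 2, 1 << 20, 64 << 20, 3 << 20, &start, &len);
        printf("ret=%d start=%llu len=%llu\n", ret,
               (unsigned long long)start, (unsigned long long)len);
        return 0;
    }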
@@ -1103,6 +1158,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1103 return -ENOMEM; 1158 return -ENOMEM;
1104 1159
1105 trans = btrfs_start_transaction(root, 0); 1160 trans = btrfs_start_transaction(root, 0);
1161 if (IS_ERR(trans)) {
1162 btrfs_free_path(path);
1163 return PTR_ERR(trans);
1164 }
1106 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1165 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1107 key.type = BTRFS_DEV_ITEM_KEY; 1166 key.type = BTRFS_DEV_ITEM_KEY;
1108 key.offset = device->devid; 1167 key.offset = device->devid;
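btrfs_start_transaction() reports failure through an error pointer rather than NULL, so every caller this patch touches adopts the IS_ERR()/PTR_ERR() idiom: test the pointer, release anything allocated earlier (here, the path), and propagate the encoded errno. A userspace sketch of the convention; the macros below are simplified stand-ins for the kernel's:

    #include <stdio.h>

    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long err) { return (void *)err; }
    static inline int IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }
    static inline long PTR_ERR(const void *p) { return (long)p; }

    static void *start_transaction(int fail)
    {
        static int dummy;
        return fail ? ERR_PTR(-12 /* ENOMEM */) : &dummy;
    }

    int main(void)
    {
        void *trans = start_transaction(1);

        if (IS_ERR(trans)) {
            /* free whatever was allocated before this point, then bail */
            printf("start_transaction failed: %ld\n", PTR_ERR(trans));
            return 1;
        }
        return 0;
    }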
@@ -1196,7 +1255,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1196 set_blocksize(bdev, 4096); 1255 set_blocksize(bdev, 4096);
1197 bh = btrfs_read_dev_super(bdev); 1256 bh = btrfs_read_dev_super(bdev);
1198 if (!bh) { 1257 if (!bh) {
1199 ret = -EIO; 1258 ret = -EINVAL;
1200 goto error_close; 1259 goto error_close;
1201 } 1260 }
1202 disk_super = (struct btrfs_super_block *)bh->b_data; 1261 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1224,11 +1283,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1224 1283
1225 ret = btrfs_shrink_device(device, 0); 1284 ret = btrfs_shrink_device(device, 0);
1226 if (ret) 1285 if (ret)
1227 goto error_brelse; 1286 goto error_undo;
1228 1287
1229 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1288 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1230 if (ret) 1289 if (ret)
1231 goto error_brelse; 1290 goto error_undo;
1232 1291
1233 device->in_fs_metadata = 0; 1292 device->in_fs_metadata = 0;
1234 1293
@@ -1302,6 +1361,13 @@ out:
1302 mutex_unlock(&root->fs_info->volume_mutex); 1361 mutex_unlock(&root->fs_info->volume_mutex);
1303 mutex_unlock(&uuid_mutex); 1362 mutex_unlock(&uuid_mutex);
1304 return ret; 1363 return ret;
1364error_undo:
1365 if (device->writeable) {
1366 list_add(&device->dev_alloc_list,
1367 &root->fs_info->fs_devices->alloc_list);
1368 root->fs_info->fs_devices->rw_devices++;
1369 }
1370 goto error_brelse;
1305} 1371}
1306 1372
1307/* 1373/*
@@ -1491,11 +1557,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1491 1557
1492 ret = find_next_devid(root, &device->devid); 1558 ret = find_next_devid(root, &device->devid);
1493 if (ret) { 1559 if (ret) {
1560 kfree(device->name);
1494 kfree(device); 1561 kfree(device);
1495 goto error; 1562 goto error;
1496 } 1563 }
1497 1564
1498 trans = btrfs_start_transaction(root, 0); 1565 trans = btrfs_start_transaction(root, 0);
1566 if (IS_ERR(trans)) {
1567 kfree(device->name);
1568 kfree(device);
1569 ret = PTR_ERR(trans);
1570 goto error;
1571 }
1572
1499 lock_chunks(root); 1573 lock_chunks(root);
1500 1574
1501 device->writeable = 1; 1575 device->writeable = 1;
@@ -1511,7 +1585,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1511 device->dev_root = root->fs_info->dev_root; 1585 device->dev_root = root->fs_info->dev_root;
1512 device->bdev = bdev; 1586 device->bdev = bdev;
1513 device->in_fs_metadata = 1; 1587 device->in_fs_metadata = 1;
1514 device->mode = 0; 1588 device->mode = FMODE_EXCL;
1515 set_blocksize(device->bdev, 4096); 1589 set_blocksize(device->bdev, 4096);
1516 1590
1517 if (seeding_dev) { 1591 if (seeding_dev) {
@@ -1763,7 +1837,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1763 return ret; 1837 return ret;
1764 1838
1765 trans = btrfs_start_transaction(root, 0); 1839 trans = btrfs_start_transaction(root, 0);
1766 BUG_ON(!trans); 1840 BUG_ON(IS_ERR(trans));
1767 1841
1768 lock_chunks(root); 1842 lock_chunks(root);
1769 1843
@@ -1794,6 +1868,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1794 1868
1795 BUG_ON(ret); 1869 BUG_ON(ret);
1796 1870
1871 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1872
1797 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1873 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1798 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1874 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1799 BUG_ON(ret); 1875 BUG_ON(ret);
@@ -1916,6 +1992,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1916 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 1992 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1917 return -EROFS; 1993 return -EROFS;
1918 1994
1995 if (!capable(CAP_SYS_ADMIN))
1996 return -EPERM;
1997
1919 mutex_lock(&dev_root->fs_info->volume_mutex); 1998 mutex_lock(&dev_root->fs_info->volume_mutex);
1920 dev_root = dev_root->fs_info->dev_root; 1999 dev_root = dev_root->fs_info->dev_root;
1921 2000
@@ -1934,7 +2013,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1934 BUG_ON(ret); 2013 BUG_ON(ret);
1935 2014
1936 trans = btrfs_start_transaction(dev_root, 0); 2015 trans = btrfs_start_transaction(dev_root, 0);
1937 BUG_ON(!trans); 2016 BUG_ON(IS_ERR(trans));
1938 2017
1939 ret = btrfs_grow_device(trans, device, old_size); 2018 ret = btrfs_grow_device(trans, device, old_size);
1940 BUG_ON(ret); 2019 BUG_ON(ret);
@@ -2100,6 +2179,11 @@ again:
2100 2179
2101 /* Shrinking succeeded, else we would be at "done". */ 2180 /* Shrinking succeeded, else we would be at "done". */
2102 trans = btrfs_start_transaction(root, 0); 2181 trans = btrfs_start_transaction(root, 0);
2182 if (IS_ERR(trans)) {
2183 ret = PTR_ERR(trans);
2184 goto done;
2185 }
2186
2103 lock_chunks(root); 2187 lock_chunks(root);
2104 2188
2105 device->disk_total_bytes = new_size; 2189 device->disk_total_bytes = new_size;
@@ -2154,66 +2238,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2154 return calc_size * num_stripes; 2238 return calc_size * num_stripes;
2155} 2239}
2156 2240
2157static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2241/* Used to sort the devices by max_avail (descending sort) */
2158 struct btrfs_root *extent_root, 2242int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2159 struct map_lookup **map_ret,
2160 u64 *num_bytes, u64 *stripe_size,
2161 u64 start, u64 type)
2162{ 2243{
2163 struct btrfs_fs_info *info = extent_root->fs_info; 2244 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2164 struct btrfs_device *device = NULL; 2245 ((struct btrfs_device_info *)dev_info2)->max_avail)
2165 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2246 return -1;
2166 struct list_head *cur; 2247 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2167 struct map_lookup *map = NULL; 2248 ((struct btrfs_device_info *)dev_info2)->max_avail)
2168 struct extent_map_tree *em_tree; 2249 return 1;
2169 struct extent_map *em; 2250 else
2170 struct list_head private_devs; 2251 return 0;
2171 int min_stripe_size = 1 * 1024 * 1024; 2252}
2172 u64 calc_size = 1024 * 1024 * 1024;
2173 u64 max_chunk_size = calc_size;
2174 u64 min_free;
2175 u64 avail;
2176 u64 max_avail = 0;
2177 u64 dev_offset;
2178 int num_stripes = 1;
2179 int min_stripes = 1;
2180 int sub_stripes = 0;
2181 int looped = 0;
2182 int ret;
2183 int index;
2184 int stripe_len = 64 * 1024;
2185 2253
2186 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2254static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2187 (type & BTRFS_BLOCK_GROUP_DUP)) { 2255 int *num_stripes, int *min_stripes,
2188 WARN_ON(1); 2256 int *sub_stripes)
2189 type &= ~BTRFS_BLOCK_GROUP_DUP; 2257{
2190 } 2258 *num_stripes = 1;
2191 if (list_empty(&fs_devices->alloc_list)) 2259 *min_stripes = 1;
2192 return -ENOSPC; 2260 *sub_stripes = 0;
2193 2261
2194 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2262 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2195 num_stripes = fs_devices->rw_devices; 2263 *num_stripes = fs_devices->rw_devices;
2196 min_stripes = 2; 2264 *min_stripes = 2;
2197 } 2265 }
2198 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2266 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2199 num_stripes = 2; 2267 *num_stripes = 2;
2200 min_stripes = 2; 2268 *min_stripes = 2;
2201 } 2269 }
2202 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2270 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2203 if (fs_devices->rw_devices < 2) 2271 if (fs_devices->rw_devices < 2)
2204 return -ENOSPC; 2272 return -ENOSPC;
2205 num_stripes = 2; 2273 *num_stripes = 2;
2206 min_stripes = 2; 2274 *min_stripes = 2;
2207 } 2275 }
2208 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2276 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2209 num_stripes = fs_devices->rw_devices; 2277 *num_stripes = fs_devices->rw_devices;
2210 if (num_stripes < 4) 2278 if (*num_stripes < 4)
2211 return -ENOSPC; 2279 return -ENOSPC;
2212 num_stripes &= ~(u32)1; 2280 *num_stripes &= ~(u32)1;
2213 sub_stripes = 2; 2281 *sub_stripes = 2;
2214 min_stripes = 4; 2282 *min_stripes = 4;
2215 } 2283 }
2216 2284
2285 return 0;
2286}
2287
2288static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2289 u64 proposed_size, u64 type,
2290 int num_stripes, int small_stripe)
2291{
2292 int min_stripe_size = 1 * 1024 * 1024;
2293 u64 calc_size = proposed_size;
2294 u64 max_chunk_size = calc_size;
2295 int ncopies = 1;
2296
2297 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2298 BTRFS_BLOCK_GROUP_DUP |
2299 BTRFS_BLOCK_GROUP_RAID10))
2300 ncopies = 2;
2301
2217 if (type & BTRFS_BLOCK_GROUP_DATA) { 2302 if (type & BTRFS_BLOCK_GROUP_DATA) {
2218 max_chunk_size = 10 * calc_size; 2303 max_chunk_size = 10 * calc_size;
2219 min_stripe_size = 64 * 1024 * 1024; 2304 min_stripe_size = 64 * 1024 * 1024;
@@ -2230,51 +2315,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2230 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2315 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2231 max_chunk_size); 2316 max_chunk_size);
2232 2317
2233again: 2318 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2234 max_avail = 0; 2319 calc_size = max_chunk_size * ncopies;
2235 if (!map || map->num_stripes != num_stripes) {
2236 kfree(map);
2237 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2238 if (!map)
2239 return -ENOMEM;
2240 map->num_stripes = num_stripes;
2241 }
2242
2243 if (calc_size * num_stripes > max_chunk_size) {
2244 calc_size = max_chunk_size;
2245 do_div(calc_size, num_stripes); 2320 do_div(calc_size, num_stripes);
2246 do_div(calc_size, stripe_len); 2321 do_div(calc_size, BTRFS_STRIPE_LEN);
2247 calc_size *= stripe_len; 2322 calc_size *= BTRFS_STRIPE_LEN;
2248 } 2323 }
2249 2324
2250 /* we don't want tiny stripes */ 2325 /* we don't want tiny stripes */
2251 if (!looped) 2326 if (!small_stripe)
2252 calc_size = max_t(u64, min_stripe_size, calc_size); 2327 calc_size = max_t(u64, min_stripe_size, calc_size);
2253 2328
2254 /* 2329 /*
2255 * we're about to do_div by the stripe_len so lets make sure 2330 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2256 * we end up with something bigger than a stripe 2331 * we end up with something bigger than a stripe
2257 */ 2332 */
2258 calc_size = max_t(u64, calc_size, stripe_len * 4); 2333 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2334
2335 do_div(calc_size, BTRFS_STRIPE_LEN);
2336 calc_size *= BTRFS_STRIPE_LEN;
2337
2338 return calc_size;
2339}
2340
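The paired do_div()-then-multiply steps simply truncate calc_size down to a multiple of BTRFS_STRIPE_LEN. A minimal sketch of that rounding:

    #include <stdint.h>
    #include <stdio.h>

    #define BTRFS_STRIPE_LEN (64 * 1024)

    /* Truncate-then-multiply, the same effect as the two do_div() calls. */
    static uint64_t round_to_stripe(uint64_t calc_size)
    {
        calc_size /= BTRFS_STRIPE_LEN;
        calc_size *= BTRFS_STRIPE_LEN;
        return calc_size;
    }

    int main(void)
    {
        /* 100000 rounds down to 65536, one full 64 KiB stripe */
        printf("%llu\n", (unsigned long long)round_to_stripe(100000));
        return 0;
    }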
2341static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2342 int num_stripes)
2343{
2344 struct map_lookup *new;
2345 size_t len = map_lookup_size(num_stripes);
2346
2347 BUG_ON(map->num_stripes < num_stripes);
2348
2349 if (map->num_stripes == num_stripes)
2350 return map;
2351
2352 new = kmalloc(len, GFP_NOFS);
2353 if (!new) {
2354 /* just change map->num_stripes */
2355 map->num_stripes = num_stripes;
2356 return map;
2357 }
2358
2359 memcpy(new, map, len);
2360 new->num_stripes = num_stripes;
2361 kfree(map);
2362 return new;
2363}
2364
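__shrink_map_lookup_stripes() resizes a structure ending in a flexible array member, and its fallback is the interesting part: when the smaller allocation fails it just lowers num_stripes in place, trading a little wasted memory for a path that cannot fail. A userspace sketch of the same pattern; map_lookup_sketch and its layout are illustrative only:

    #include <stdlib.h>
    #include <string.h>

    struct map_lookup_sketch {
        int num_stripes;
        int stripes[];                  /* flexible array member */
    };

    #define map_size(n) (sizeof(struct map_lookup_sketch) + (n) * sizeof(int))

    /* Shrink to num_stripes; on allocation failure shrink in place, which
     * wastes the tail of the old buffer but never returns NULL. */
    static struct map_lookup_sketch *shrink_map(struct map_lookup_sketch *map,
                                                int num_stripes)
    {
        struct map_lookup_sketch *new;

        if (map->num_stripes == num_stripes)
            return map;

        new = malloc(map_size(num_stripes));
        if (!new) {
            map->num_stripes = num_stripes;
            return map;
        }
        memcpy(new, map, map_size(num_stripes));
        new->num_stripes = num_stripes;
        free(map);
        return new;
    }

    int main(void)
    {
        struct map_lookup_sketch *map = malloc(map_size(4));
        if (!map)
            return 1;
        map->num_stripes = 4;
        map = shrink_map(map, 2);
        free(map);
        return 0;
    }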
2365/*
 2366 * helper to allocate device space from btrfs_device_info, in which the max
 2367 * free space information of every device is stored. It is used when we
 2368 * cannot allocate chunks of the default size.
 2369 *
 2370 * With this helper, we can allocate a new chunk as large as possible.
2371 */
2372static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2373 struct btrfs_fs_devices *fs_devices,
2374 struct btrfs_device_info *devices,
2375 int nr_device, u64 type,
2376 struct map_lookup **map_lookup,
2377 int min_stripes, u64 *stripe_size)
2378{
2379 int i, index, sort_again = 0;
2380 int min_devices = min_stripes;
2381 u64 max_avail, min_free;
2382 struct map_lookup *map = *map_lookup;
2383 int ret;
2384
2385 if (nr_device < min_stripes)
2386 return -ENOSPC;
2387
2388 btrfs_descending_sort_devices(devices, nr_device);
2389
2390 max_avail = devices[0].max_avail;
2391 if (!max_avail)
2392 return -ENOSPC;
2393
2394 for (i = 0; i < nr_device; i++) {
2395 /*
2396 * if dev_offset = 0, it means the free space of this device
2397 * is less than what we need, and we didn't search max avail
2398 * extent on this device, so do it now.
2399 */
2400 if (!devices[i].dev_offset) {
2401 ret = find_free_dev_extent(trans, devices[i].dev,
2402 max_avail,
2403 &devices[i].dev_offset,
2404 &devices[i].max_avail);
2405 if (ret != 0 && ret != -ENOSPC)
2406 return ret;
2407 sort_again = 1;
2408 }
2409 }
2410
 2411 /* we updated the max avail free extent of each device, so sort again */
2412 if (sort_again)
2413 btrfs_descending_sort_devices(devices, nr_device);
2414
2415 if (type & BTRFS_BLOCK_GROUP_DUP)
2416 min_devices = 1;
2417
2418 if (!devices[min_devices - 1].max_avail)
2419 return -ENOSPC;
2420
2421 max_avail = devices[min_devices - 1].max_avail;
2422 if (type & BTRFS_BLOCK_GROUP_DUP)
2423 do_div(max_avail, 2);
2424
2425 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2426 min_stripes, 1);
2427 if (type & BTRFS_BLOCK_GROUP_DUP)
2428 min_free = max_avail * 2;
2429 else
2430 min_free = max_avail;
2431
2432 if (min_free > devices[min_devices - 1].max_avail)
2433 return -ENOSPC;
2434
2435 map = __shrink_map_lookup_stripes(map, min_stripes);
2436 *stripe_size = max_avail;
2437
2438 index = 0;
2439 for (i = 0; i < min_stripes; i++) {
2440 map->stripes[i].dev = devices[index].dev;
2441 map->stripes[i].physical = devices[index].dev_offset;
2442 if (type & BTRFS_BLOCK_GROUP_DUP) {
2443 i++;
2444 map->stripes[i].dev = devices[index].dev;
2445 map->stripes[i].physical = devices[index].dev_offset +
2446 max_avail;
2447 }
2448 index++;
2449 }
2450 *map_lookup = map;
2451
2452 return 0;
2453}
2454
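For DUP the helper places both copies on one device, the first stripe at dev_offset and the second at dev_offset plus the chosen stripe size, which is also why min_devices drops to 1 and max_avail is halved. A small sketch of that placement; all names and sizes are illustrative:

    #include <stdio.h>

    struct stripe { int devid; unsigned long long physical; };

    int main(void)
    {
        unsigned long long dev_offset = 1 << 20, stripe_size = 8 << 20;
        struct stripe s[2];
        int i = 0, devid = 0;

        /* first copy at the found offset... */
        s[i].devid = devid;
        s[i].physical = dev_offset;
        /* ...second copy right behind it on the same device (DUP) */
        i++;
        s[i].devid = devid;
        s[i].physical = dev_offset + stripe_size;

        for (i = 0; i < 2; i++)
            printf("stripe %d -> dev %d @ %llu\n",
                   i, s[i].devid, s[i].physical);
        return 0;
    }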
2455static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2456 struct btrfs_root *extent_root,
2457 struct map_lookup **map_ret,
2458 u64 *num_bytes, u64 *stripe_size,
2459 u64 start, u64 type)
2460{
2461 struct btrfs_fs_info *info = extent_root->fs_info;
2462 struct btrfs_device *device = NULL;
2463 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2464 struct list_head *cur;
2465 struct map_lookup *map;
2466 struct extent_map_tree *em_tree;
2467 struct extent_map *em;
2468 struct btrfs_device_info *devices_info;
2469 struct list_head private_devs;
2470 u64 calc_size = 1024 * 1024 * 1024;
2471 u64 min_free;
2472 u64 avail;
2473 u64 dev_offset;
2474 int num_stripes;
2475 int min_stripes;
2476 int sub_stripes;
2477 int min_devices; /* the min number of devices we need */
2478 int i;
2479 int ret;
2480 int index;
2259 2481
2260 do_div(calc_size, stripe_len); 2482 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2261 calc_size *= stripe_len; 2483 (type & BTRFS_BLOCK_GROUP_DUP)) {
2484 WARN_ON(1);
2485 type &= ~BTRFS_BLOCK_GROUP_DUP;
2486 }
2487 if (list_empty(&fs_devices->alloc_list))
2488 return -ENOSPC;
2489
2490 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2491 &min_stripes, &sub_stripes);
2492 if (ret)
2493 return ret;
2494
2495 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2496 GFP_NOFS);
2497 if (!devices_info)
2498 return -ENOMEM;
2499
2500 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2501 if (!map) {
2502 ret = -ENOMEM;
2503 goto error;
2504 }
2505 map->num_stripes = num_stripes;
2262 2506
2263 cur = fs_devices->alloc_list.next; 2507 cur = fs_devices->alloc_list.next;
2264 index = 0; 2508 index = 0;
2509 i = 0;
2265 2510
2266 if (type & BTRFS_BLOCK_GROUP_DUP) 2511 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2512 num_stripes, 0);
2513
2514 if (type & BTRFS_BLOCK_GROUP_DUP) {
2267 min_free = calc_size * 2; 2515 min_free = calc_size * 2;
2268 else 2516 min_devices = 1;
2517 } else {
2269 min_free = calc_size; 2518 min_free = calc_size;
2270 2519 min_devices = min_stripes;
2271 /* 2520 }
2272 * we add 1MB because we never use the first 1MB of the device, unless
2273 * we've looped, then we are likely allocating the maximum amount of
2274 * space left already
2275 */
2276 if (!looped)
2277 min_free += 1024 * 1024;
2278 2521
2279 INIT_LIST_HEAD(&private_devs); 2522 INIT_LIST_HEAD(&private_devs);
2280 while (index < num_stripes) { 2523 while (index < num_stripes) {
@@ -2287,27 +2530,39 @@ again:
2287 cur = cur->next; 2530 cur = cur->next;
2288 2531
2289 if (device->in_fs_metadata && avail >= min_free) { 2532 if (device->in_fs_metadata && avail >= min_free) {
2290 ret = find_free_dev_extent(trans, device, 2533 ret = find_free_dev_extent(trans, device, min_free,
2291 min_free, &dev_offset, 2534 &devices_info[i].dev_offset,
2292 &max_avail); 2535 &devices_info[i].max_avail);
2293 if (ret == 0) { 2536 if (ret == 0) {
2294 list_move_tail(&device->dev_alloc_list, 2537 list_move_tail(&device->dev_alloc_list,
2295 &private_devs); 2538 &private_devs);
2296 map->stripes[index].dev = device; 2539 map->stripes[index].dev = device;
2297 map->stripes[index].physical = dev_offset; 2540 map->stripes[index].physical =
2541 devices_info[i].dev_offset;
2298 index++; 2542 index++;
2299 if (type & BTRFS_BLOCK_GROUP_DUP) { 2543 if (type & BTRFS_BLOCK_GROUP_DUP) {
2300 map->stripes[index].dev = device; 2544 map->stripes[index].dev = device;
2301 map->stripes[index].physical = 2545 map->stripes[index].physical =
2302 dev_offset + calc_size; 2546 devices_info[i].dev_offset +
2547 calc_size;
2303 index++; 2548 index++;
2304 } 2549 }
2305 } 2550 } else if (ret != -ENOSPC)
2306 } else if (device->in_fs_metadata && avail > max_avail) 2551 goto error;
2307 max_avail = avail; 2552
2553 devices_info[i].dev = device;
2554 i++;
2555 } else if (device->in_fs_metadata &&
2556 avail >= BTRFS_STRIPE_LEN) {
2557 devices_info[i].dev = device;
2558 devices_info[i].max_avail = avail;
2559 i++;
2560 }
2561
2308 if (cur == &fs_devices->alloc_list) 2562 if (cur == &fs_devices->alloc_list)
2309 break; 2563 break;
2310 } 2564 }
2565
2311 list_splice(&private_devs, &fs_devices->alloc_list); 2566 list_splice(&private_devs, &fs_devices->alloc_list);
2312 if (index < num_stripes) { 2567 if (index < num_stripes) {
2313 if (index >= min_stripes) { 2568 if (index >= min_stripes) {
@@ -2316,34 +2571,38 @@ again:
2316 num_stripes /= sub_stripes; 2571 num_stripes /= sub_stripes;
2317 num_stripes *= sub_stripes; 2572 num_stripes *= sub_stripes;
2318 } 2573 }
2319 looped = 1; 2574
2320 goto again; 2575 map = __shrink_map_lookup_stripes(map, num_stripes);
2321 } 2576 } else if (i >= min_devices) {
2322 if (!looped && max_avail > 0) { 2577 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2323 looped = 1; 2578 devices_info, i, type,
2324 calc_size = max_avail; 2579 &map, min_stripes,
2325 goto again; 2580 &calc_size);
2581 if (ret)
2582 goto error;
2583 } else {
2584 ret = -ENOSPC;
2585 goto error;
2326 } 2586 }
2327 kfree(map);
2328 return -ENOSPC;
2329 } 2587 }
2330 map->sector_size = extent_root->sectorsize; 2588 map->sector_size = extent_root->sectorsize;
2331 map->stripe_len = stripe_len; 2589 map->stripe_len = BTRFS_STRIPE_LEN;
2332 map->io_align = stripe_len; 2590 map->io_align = BTRFS_STRIPE_LEN;
2333 map->io_width = stripe_len; 2591 map->io_width = BTRFS_STRIPE_LEN;
2334 map->type = type; 2592 map->type = type;
2335 map->num_stripes = num_stripes;
2336 map->sub_stripes = sub_stripes; 2593 map->sub_stripes = sub_stripes;
2337 2594
2338 *map_ret = map; 2595 *map_ret = map;
2339 *stripe_size = calc_size; 2596 *stripe_size = calc_size;
2340 *num_bytes = chunk_bytes_by_type(type, calc_size, 2597 *num_bytes = chunk_bytes_by_type(type, calc_size,
2341 num_stripes, sub_stripes); 2598 map->num_stripes, sub_stripes);
2599
2600 trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
2342 2601
2343 em = alloc_extent_map(GFP_NOFS); 2602 em = alloc_extent_map(GFP_NOFS);
2344 if (!em) { 2603 if (!em) {
2345 kfree(map); 2604 ret = -ENOMEM;
2346 return -ENOMEM; 2605 goto error;
2347 } 2606 }
2348 em->bdev = (struct block_device *)map; 2607 em->bdev = (struct block_device *)map;
2349 em->start = start; 2608 em->start = start;
@@ -2376,7 +2635,13 @@ again:
2376 index++; 2635 index++;
2377 } 2636 }
2378 2637
2638 kfree(devices_info);
2379 return 0; 2639 return 0;
2640
2641error:
2642 kfree(map);
2643 kfree(devices_info);
2644 return ret;
2380} 2645}
2381 2646
2382static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2647static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -2442,6 +2707,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2442 item_size); 2707 item_size);
2443 BUG_ON(ret); 2708 BUG_ON(ret);
2444 } 2709 }
2710
2445 kfree(chunk); 2711 kfree(chunk);
2446 return 0; 2712 return 0;
2447} 2713}
@@ -2639,14 +2905,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2639static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2905static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2640 u64 logical, u64 *length, 2906 u64 logical, u64 *length,
2641 struct btrfs_multi_bio **multi_ret, 2907 struct btrfs_multi_bio **multi_ret,
2642 int mirror_num, struct page *unplug_page) 2908 int mirror_num)
2643{ 2909{
2644 struct extent_map *em; 2910 struct extent_map *em;
2645 struct map_lookup *map; 2911 struct map_lookup *map;
2646 struct extent_map_tree *em_tree = &map_tree->map_tree; 2912 struct extent_map_tree *em_tree = &map_tree->map_tree;
2647 u64 offset; 2913 u64 offset;
2648 u64 stripe_offset; 2914 u64 stripe_offset;
2915 u64 stripe_end_offset;
2649 u64 stripe_nr; 2916 u64 stripe_nr;
2917 u64 stripe_nr_orig;
2918 u64 stripe_nr_end;
2650 int stripes_allocated = 8; 2919 int stripes_allocated = 8;
2651 int stripes_required = 1; 2920 int stripes_required = 1;
2652 int stripe_index; 2921 int stripe_index;
@@ -2655,7 +2924,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2655 int max_errors = 0; 2924 int max_errors = 0;
2656 struct btrfs_multi_bio *multi = NULL; 2925 struct btrfs_multi_bio *multi = NULL;
2657 2926
2658 if (multi_ret && !(rw & REQ_WRITE)) 2927 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2659 stripes_allocated = 1; 2928 stripes_allocated = 1;
2660again: 2929again:
2661 if (multi_ret) { 2930 if (multi_ret) {
@@ -2671,11 +2940,6 @@ again:
2671 em = lookup_extent_mapping(em_tree, logical, *length); 2940 em = lookup_extent_mapping(em_tree, logical, *length);
2672 read_unlock(&em_tree->lock); 2941 read_unlock(&em_tree->lock);
2673 2942
2674 if (!em && unplug_page) {
2675 kfree(multi);
2676 return 0;
2677 }
2678
2679 if (!em) { 2943 if (!em) {
2680 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2944 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2681 (unsigned long long)logical, 2945 (unsigned long long)logical,
@@ -2701,7 +2965,15 @@ again:
2701 max_errors = 1; 2965 max_errors = 1;
2702 } 2966 }
2703 } 2967 }
2704 if (multi_ret && (rw & REQ_WRITE) && 2968 if (rw & REQ_DISCARD) {
2969 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2970 BTRFS_BLOCK_GROUP_RAID1 |
2971 BTRFS_BLOCK_GROUP_DUP |
2972 BTRFS_BLOCK_GROUP_RAID10)) {
2973 stripes_required = map->num_stripes;
2974 }
2975 }
2976 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2705 stripes_allocated < stripes_required) { 2977 stripes_allocated < stripes_required) {
2706 stripes_allocated = map->num_stripes; 2978 stripes_allocated = map->num_stripes;
2707 free_extent_map(em); 2979 free_extent_map(em);
@@ -2721,23 +2993,37 @@ again:
 2721 /* stripe_offset is the offset of this block in its stripe */ 2993
2722 stripe_offset = offset - stripe_offset; 2994 stripe_offset = offset - stripe_offset;
2723 2995
2724 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2996 if (rw & REQ_DISCARD)
2725 BTRFS_BLOCK_GROUP_RAID10 | 2997 *length = min_t(u64, em->len - offset, *length);
2726 BTRFS_BLOCK_GROUP_DUP)) { 2998 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2999 BTRFS_BLOCK_GROUP_RAID1 |
3000 BTRFS_BLOCK_GROUP_RAID10 |
3001 BTRFS_BLOCK_GROUP_DUP)) {
2727 /* we limit the length of each bio to what fits in a stripe */ 3002 /* we limit the length of each bio to what fits in a stripe */
2728 *length = min_t(u64, em->len - offset, 3003 *length = min_t(u64, em->len - offset,
2729 map->stripe_len - stripe_offset); 3004 map->stripe_len - stripe_offset);
2730 } else { 3005 } else {
2731 *length = em->len - offset; 3006 *length = em->len - offset;
2732 } 3007 }
2733 3008
2734 if (!multi_ret && !unplug_page) 3009 if (!multi_ret)
2735 goto out; 3010 goto out;
2736 3011
2737 num_stripes = 1; 3012 num_stripes = 1;
2738 stripe_index = 0; 3013 stripe_index = 0;
2739 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3014 stripe_nr_orig = stripe_nr;
2740 if (unplug_page || (rw & REQ_WRITE)) 3015 stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3016 (~(map->stripe_len - 1));
3017 do_div(stripe_nr_end, map->stripe_len);
3018 stripe_end_offset = stripe_nr_end * map->stripe_len -
3019 (offset + *length);
3020 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3021 if (rw & REQ_DISCARD)
3022 num_stripes = min_t(u64, map->num_stripes,
3023 stripe_nr_end - stripe_nr_orig);
3024 stripe_index = do_div(stripe_nr, map->num_stripes);
3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3026 if (rw & (REQ_WRITE | REQ_DISCARD))
2741 num_stripes = map->num_stripes; 3027 num_stripes = map->num_stripes;
2742 else if (mirror_num) 3028 else if (mirror_num)
2743 stripe_index = mirror_num - 1; 3029 stripe_index = mirror_num - 1;
@@ -2748,7 +3034,7 @@ again:
2748 } 3034 }
2749 3035
2750 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3036 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2751 if (rw & REQ_WRITE) 3037 if (rw & (REQ_WRITE | REQ_DISCARD))
2752 num_stripes = map->num_stripes; 3038 num_stripes = map->num_stripes;
2753 else if (mirror_num) 3039 else if (mirror_num)
2754 stripe_index = mirror_num - 1; 3040 stripe_index = mirror_num - 1;
@@ -2759,8 +3045,12 @@ again:
2759 stripe_index = do_div(stripe_nr, factor); 3045 stripe_index = do_div(stripe_nr, factor);
2760 stripe_index *= map->sub_stripes; 3046 stripe_index *= map->sub_stripes;
2761 3047
2762 if (unplug_page || (rw & REQ_WRITE)) 3048 if (rw & REQ_WRITE)
2763 num_stripes = map->sub_stripes; 3049 num_stripes = map->sub_stripes;
3050 else if (rw & REQ_DISCARD)
3051 num_stripes = min_t(u64, map->sub_stripes *
3052 (stripe_nr_end - stripe_nr_orig),
3053 map->num_stripes);
2764 else if (mirror_num) 3054 else if (mirror_num)
2765 stripe_index += mirror_num - 1; 3055 stripe_index += mirror_num - 1;
2766 else { 3056 else {
@@ -2778,24 +3068,101 @@ again:
2778 } 3068 }
2779 BUG_ON(stripe_index >= map->num_stripes); 3069 BUG_ON(stripe_index >= map->num_stripes);
2780 3070
2781 for (i = 0; i < num_stripes; i++) { 3071 if (rw & REQ_DISCARD) {
2782 if (unplug_page) { 3072 for (i = 0; i < num_stripes; i++) {
2783 struct btrfs_device *device;
2784 struct backing_dev_info *bdi;
2785
2786 device = map->stripes[stripe_index].dev;
2787 if (device->bdev) {
2788 bdi = blk_get_backing_dev_info(device->bdev);
2789 if (bdi->unplug_io_fn)
2790 bdi->unplug_io_fn(bdi, unplug_page);
2791 }
2792 } else {
2793 multi->stripes[i].physical = 3073 multi->stripes[i].physical =
2794 map->stripes[stripe_index].physical + 3074 map->stripes[stripe_index].physical +
2795 stripe_offset + stripe_nr * map->stripe_len; 3075 stripe_offset + stripe_nr * map->stripe_len;
2796 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3076 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3077
3078 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3079 u64 stripes;
3080 u32 last_stripe = 0;
3081 int j;
3082
3083 div_u64_rem(stripe_nr_end - 1,
3084 map->num_stripes,
3085 &last_stripe);
3086
3087 for (j = 0; j < map->num_stripes; j++) {
3088 u32 test;
3089
3090 div_u64_rem(stripe_nr_end - 1 - j,
3091 map->num_stripes, &test);
3092 if (test == stripe_index)
3093 break;
3094 }
3095 stripes = stripe_nr_end - 1 - j;
3096 do_div(stripes, map->num_stripes);
3097 multi->stripes[i].length = map->stripe_len *
3098 (stripes - stripe_nr + 1);
3099
3100 if (i == 0) {
3101 multi->stripes[i].length -=
3102 stripe_offset;
3103 stripe_offset = 0;
3104 }
3105 if (stripe_index == last_stripe)
3106 multi->stripes[i].length -=
3107 stripe_end_offset;
3108 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3109 u64 stripes;
3110 int j;
3111 int factor = map->num_stripes /
3112 map->sub_stripes;
3113 u32 last_stripe = 0;
3114
3115 div_u64_rem(stripe_nr_end - 1,
3116 factor, &last_stripe);
3117 last_stripe *= map->sub_stripes;
3118
3119 for (j = 0; j < factor; j++) {
3120 u32 test;
3121
3122 div_u64_rem(stripe_nr_end - 1 - j,
3123 factor, &test);
3124
3125 if (test ==
3126 stripe_index / map->sub_stripes)
3127 break;
3128 }
3129 stripes = stripe_nr_end - 1 - j;
3130 do_div(stripes, factor);
3131 multi->stripes[i].length = map->stripe_len *
3132 (stripes - stripe_nr + 1);
3133
3134 if (i < map->sub_stripes) {
3135 multi->stripes[i].length -=
3136 stripe_offset;
3137 if (i == map->sub_stripes - 1)
3138 stripe_offset = 0;
3139 }
3140 if (stripe_index >= last_stripe &&
3141 stripe_index <= (last_stripe +
3142 map->sub_stripes - 1)) {
3143 multi->stripes[i].length -=
3144 stripe_end_offset;
3145 }
3146 } else
3147 multi->stripes[i].length = *length;
3148
3149 stripe_index++;
3150 if (stripe_index == map->num_stripes) {
3151 /* This could only happen for RAID0/10 */
3152 stripe_index = 0;
3153 stripe_nr++;
3154 }
3155 }
3156 } else {
3157 for (i = 0; i < num_stripes; i++) {
3158 multi->stripes[i].physical =
3159 map->stripes[stripe_index].physical +
3160 stripe_offset +
3161 stripe_nr * map->stripe_len;
3162 multi->stripes[i].dev =
3163 map->stripes[stripe_index].dev;
3164 stripe_index++;
2797 } 3165 }
2798 stripe_index++;
2799 } 3166 }
2800 if (multi_ret) { 3167 if (multi_ret) {
2801 *multi_ret = multi; 3168 *multi_ret = multi;
@@ -2812,7 +3179,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2812 struct btrfs_multi_bio **multi_ret, int mirror_num) 3179 struct btrfs_multi_bio **multi_ret, int mirror_num)
2813{ 3180{
2814 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3181 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2815 mirror_num, NULL); 3182 mirror_num);
2816} 3183}
2817 3184
2818int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3185int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -2880,14 +3247,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2880 return 0; 3247 return 0;
2881} 3248}
2882 3249
2883int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2884 u64 logical, struct page *page)
2885{
2886 u64 length = PAGE_CACHE_SIZE;
2887 return __btrfs_map_block(map_tree, READ, logical, &length,
2888 NULL, 0, page);
2889}
2890
2891static void end_bio_multi_stripe(struct bio *bio, int err) 3250static void end_bio_multi_stripe(struct bio *bio, int err)
2892{ 3251{
2893 struct btrfs_multi_bio *multi = bio->bi_private; 3252 struct btrfs_multi_bio *multi = bio->bi_private;
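The discard support in __btrfs_map_block() hinges on the stripe_nr_orig/stripe_nr_end arithmetic: the discard range is rounded up to whole stripes, then the unused head of the first stripe and tail of the last are trimmed back off through stripe_offset and stripe_end_offset. A userspace model of that arithmetic, with plain division standing in for do_div():

    #include <stdint.h>
    #include <stdio.h>

    #define STRIPE_LEN 65536ULL

    int main(void)
    {
        uint64_t offset = 100000, length = 300000;

        uint64_t stripe_nr = offset / STRIPE_LEN;
        uint64_t stripe_nr_end = (offset + length + STRIPE_LEN - 1) /
                                 STRIPE_LEN;                /* round up */
        uint64_t stripe_offset = offset - stripe_nr * STRIPE_LEN;
        uint64_t stripe_end_offset = stripe_nr_end * STRIPE_LEN -
                                     (offset + length);

        printf("stripes %llu..%llu, head skip %llu, tail trim %llu\n",
               (unsigned long long)stripe_nr,
               (unsigned long long)(stripe_nr_end - 1),
               (unsigned long long)stripe_offset,
               (unsigned long long)stripe_end_offset);
        return 0;
    }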
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1be781079450..cc2eadaf7a27 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -123,6 +126,7 @@ struct btrfs_fs_devices {
123struct btrfs_bio_stripe { 126struct btrfs_bio_stripe {
124 struct btrfs_device *dev; 127 struct btrfs_device *dev;
125 u64 physical; 128 u64 physical;
129 u64 length; /* only used for discard mappings */
126}; 130};
127 131
128struct btrfs_multi_bio { 132struct btrfs_multi_bio {
@@ -136,6 +140,41 @@ struct btrfs_multi_bio {
136 struct btrfs_bio_stripe stripes[]; 140 struct btrfs_bio_stripe stripes[];
137}; 141};
138 142
143struct btrfs_device_info {
144 struct btrfs_device *dev;
145 u64 dev_offset;
146 u64 max_avail;
147};
148
149struct map_lookup {
150 u64 type;
151 int io_align;
152 int io_width;
153 int stripe_len;
154 int sector_size;
155 int num_stripes;
156 int sub_stripes;
157 struct btrfs_bio_stripe stripes[];
158};
159
160/* Used to sort the devices by max_avail (descending sort) */
161int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
162
163/*
164 * sort the devices by max_avail, in which the max free extent size of each
165 * device is stored (descending sort)
166 */
167static inline void btrfs_descending_sort_devices(
168 struct btrfs_device_info *devices,
169 size_t nr_devices)
170{
171 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
172 btrfs_cmp_device_free_bytes, NULL);
173}
174
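btrfs_descending_sort_devices() is deliberately thin: the comparator supplies the descending order and the kernel's sort() does the rest. The same contract in userspace, with qsort() standing in for sort():

    #include <stdio.h>
    #include <stdlib.h>

    struct dev_info { int id; unsigned long long max_avail; };

    /* Same ordering contract as btrfs_cmp_device_free_bytes(): bigger
     * max_avail sorts first. */
    static int cmp_desc(const void *a, const void *b)
    {
        const struct dev_info *d1 = a, *d2 = b;

        if (d1->max_avail > d2->max_avail)
            return -1;
        if (d1->max_avail < d2->max_avail)
            return 1;
        return 0;
    }

    int main(void)
    {
        struct dev_info devs[] = { {1, 10}, {2, 30}, {3, 20} };
        int i;

        qsort(devs, 3, sizeof(devs[0]), cmp_desc);
        for (i = 0; i < 3; i++)
            printf("dev %d: %llu\n", devs[i].id, devs[i].max_avail);
        return 0;
    }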
175int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
176 u64 end, u64 *length);
177
139#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 178#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
140 (sizeof(struct btrfs_bio_stripe) * (n))) 179 (sizeof(struct btrfs_bio_stripe) * (n)))
141 180
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..cfd660550ded 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
180 struct btrfs_path *path; 180 struct btrfs_path *path;
181 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
182 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
183 int ret = 0, slot, advance; 183 int ret = 0, slot;
184 size_t total_size = 0, size_left = size; 184 size_t total_size = 0, size_left = size;
185 unsigned long name_ptr; 185 unsigned long name_ptr;
186 size_t name_len; 186 size_t name_len;
187 u32 nritems;
188 187
189 /* 188 /*
190 * ok we want all objects associated with this id. 189 * ok we want all objects associated with this id.
@@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
204 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 203 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
205 if (ret < 0) 204 if (ret < 0)
206 goto err; 205 goto err;
207 advance = 0; 206
208 while (1) { 207 while (1) {
209 leaf = path->nodes[0]; 208 leaf = path->nodes[0];
210 nritems = btrfs_header_nritems(leaf);
211 slot = path->slots[0]; 209 slot = path->slots[0];
212 210
213 /* this is where we start walking through the path */ 211 /* this is where we start walking through the path */
214 if (advance || slot >= nritems) { 212 if (slot >= btrfs_header_nritems(leaf)) {
215 /* 213 /*
216 * if we've reached the last slot in this leaf we need 214 * if we've reached the last slot in this leaf we need
217 * to go to the next leaf and reset everything 215 * to go to the next leaf and reset everything
218 */ 216 */
219 if (slot >= nritems-1) { 217 ret = btrfs_next_leaf(root, path);
220 ret = btrfs_next_leaf(root, path); 218 if (ret < 0)
221 if (ret) 219 goto err;
222 break; 220 else if (ret > 0)
223 leaf = path->nodes[0]; 221 break;
224 nritems = btrfs_header_nritems(leaf); 222 continue;
225 slot = path->slots[0];
226 } else {
227 /*
228 * just walking through the slots on this leaf
229 */
230 slot++;
231 path->slots[0]++;
232 }
233 } 223 }
234 advance = 1;
235 224
236 btrfs_item_key_to_cpu(leaf, &found_key, slot); 225 btrfs_item_key_to_cpu(leaf, &found_key, slot);
237 226
@@ -242,13 +231,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
242 break; 231 break;
243 232
244 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 233 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
234 if (verify_dir_item(root, leaf, di))
235 continue;
245 236
246 name_len = btrfs_dir_name_len(leaf, di); 237 name_len = btrfs_dir_name_len(leaf, di);
247 total_size += name_len + 1; 238 total_size += name_len + 1;
248 239
249 /* we are just looking for how big our buffer needs to be */ 240 /* we are just looking for how big our buffer needs to be */
250 if (!size) 241 if (!size)
251 continue; 242 goto next;
252 243
253 if (!buffer || (name_len + 1) > size_left) { 244 if (!buffer || (name_len + 1) > size_left) {
254 ret = -ERANGE; 245 ret = -ERANGE;
@@ -261,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
261 252
262 size_left -= name_len + 1; 253 size_left -= name_len + 1;
263 buffer += name_len + 1; 254 buffer += name_len + 1;
255next:
256 path->slots[0]++;
264 } 257 }
265 ret = total_size; 258 ret = total_size;
266 259
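The rewritten loop is the canonical btrfs leaf walk: handle the current slot, bump path->slots[0], and only when the slot runs past the leaf call btrfs_next_leaf(), where a positive return means no leaves remain. A userspace model of that control flow, with arrays standing in for leaves:

    #include <stdio.h>

    #define NR_LEAVES 2
    #define PER_LEAF 3

    static int leaves[NR_LEAVES][PER_LEAF] = { {1, 2, 3}, {4, 5, 6} };

    int main(void)
    {
        int leaf = 0, slot = 0;

        while (1) {
            if (slot >= PER_LEAF) {      /* ran off the current leaf */
                if (++leaf >= NR_LEAVES) /* btrfs_next_leaf() returned > 0 */
                    break;
                slot = 0;
                continue;
            }
            printf("item %d\n", leaves[leaf][slot]);
            slot++;                      /* path->slots[0]++ */
        }
        return 0;
    }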
@@ -316,6 +309,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 309int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
317 size_t size, int flags) 310 size_t size, int flags)
318{ 311{
312 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
313
314 /*
 315 * Permission on security.* and system.* attributes is not
 316 * checked in permission().
317 */
318 if (btrfs_root_readonly(root))
319 return -EROFS;
320
319 /* 321 /*
320 * If this is a request for a synthetic attribute in the system.* 322 * If this is a request for a synthetic attribute in the system.*
321 * namespace use the generic infrastructure to resolve a handler 323 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +338,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
336 338
337int btrfs_removexattr(struct dentry *dentry, const char *name) 339int btrfs_removexattr(struct dentry *dentry, const char *name)
338{ 340{
341 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
342
343 /*
 344 * Permission on security.* and system.* attributes is not
 345 * checked in permission().
346 */
347 if (btrfs_root_readonly(root))
348 return -EROFS;
349
339 /* 350 /*
340 * If this is a request for a synthetic attribute in the system.* 351 * If this is a request for a synthetic attribute in the system.*
341 * namespace use the generic infrastructure to resolve a handler 352 * namespace use the generic infrastructure to resolve a handler
@@ -352,7 +363,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
352} 363}
353 364
354int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 365int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
355 struct inode *inode, struct inode *dir) 366 struct inode *inode, struct inode *dir,
367 const struct qstr *qstr)
356{ 368{
357 int err; 369 int err;
358 size_t len; 370 size_t len;
@@ -360,7 +372,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
360 char *suffix; 372 char *suffix;
361 char *name; 373 char *name;
362 374
363 err = security_inode_init_security(inode, dir, &suffix, &value, &len); 375 err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
376 &len);
364 if (err) { 377 if (err) {
365 if (err == -EOPNOTSUPP) 378 if (err == -EOPNOTSUPP)
366 return 0; 379 return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir); 40 struct inode *inode, struct inode *dir,
41 const struct qstr *qstr);
41 42
42#endif /* __XATTR__ */ 43#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,152 +39,52 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock);
53static unsigned long num_workspace;
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one
59 * NULL or an ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{ 43{
63 struct workspace *workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110 45
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace); 46 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace); 47 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf); 48 kfree(workspace->buf);
141 kfree(workspace); 49 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147} 50}
148 51
149/* 52static struct list_head *zlib_alloc_workspace(void)
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{ 53{
154 struct workspace *workspace; 54 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) { 55
156 workspace = list_entry(idle_workspace.next, struct workspace, 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
157 list); 57 if (!workspace)
158 list_del(&workspace->list); 58 return ERR_PTR(-ENOMEM);
159 vfree(workspace->def_strm.workspace); 59
160 vfree(workspace->inf_strm.workspace); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
161 kfree(workspace->buf); 61 MAX_WBITS, MAX_MEM_LEVEL));
162 kfree(workspace); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
163 atomic_dec(&alloc_workspace); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
164 } 64 if (!workspace->def_strm.workspace ||
65 !workspace->inf_strm.workspace || !workspace->buf)
66 goto fail;
67
68 INIT_LIST_HEAD(&workspace->list);
69
70 return &workspace->list;
71fail:
72 zlib_free_workspace(&workspace->list);
73 return ERR_PTR(-ENOMEM);
165} 74}
166 75
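The allocation style in zlib_alloc_workspace() is deliberate: grab every buffer first and test them all at once, so a single failure path can release whatever did succeed, relying on the kernel's vfree()/kfree() accepting NULL. A userspace sketch of the pattern, with malloc()/free() standing in for the kernel allocators and made-up buffer sizes:

    #include <stdlib.h>

    struct ws { void *def, *inf, *buf; };

    static struct ws *ws_alloc(void)
    {
        struct ws *w = calloc(1, sizeof(*w));

        if (!w)
            return NULL;
        w->def = malloc(256 * 1024);   /* stand-in: deflate workspace */
        w->inf = malloc(64 * 1024);    /* stand-in: inflate workspace */
        w->buf = malloc(4096);         /* stand-in: one bounce page */
        if (!w->def || !w->inf || !w->buf)
            goto fail;
        return w;
    fail:
        /* free(NULL) is a no-op, so no need to track what succeeded */
        free(w->def);
        free(w->inf);
        free(w->buf);
        free(w);
        return NULL;
    }

    int main(void)
    {
        struct ws *w = ws_alloc();

        if (w) {
            free(w->def);
            free(w->inf);
            free(w->buf);
            free(w);
        }
        return 0;
    }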
167/* 76static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 77 struct address_space *mapping,
169 * 78 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 79 struct page **pages,
171 * in 'pages' 80 unsigned long nr_dest_pages,
172 * 81 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 82 unsigned long *total_in,
174 * may be pages allocated even if we return an error 83 unsigned long *total_out,
175 * 84 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 85{
86 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 87 int ret;
196 struct workspace *workspace;
197 char *data_in; 88 char *data_in;
198 char *cpage_out; 89 char *cpage_out;
199 int nr_pages = 0; 90 int nr_pages = 0;
@@ -205,10 +96,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
205 *total_out = 0; 96 *total_out = 0;
206 *total_in = 0; 97 *total_in = 0;
207 98
208 workspace = find_zlib_workspace();
209 if (IS_ERR(workspace))
210 return -1;
211
212 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
213 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "deflateInit failed\n");
214 ret = -1; 101 ret = -1;
@@ -222,6 +109,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
222 data_in = kmap(in_page); 109 data_in = kmap(in_page);
223 110
224 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 111 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
112 if (out_page == NULL) {
113 ret = -1;
114 goto out;
115 }
225 cpage_out = kmap(out_page); 116 cpage_out = kmap(out_page);
226 pages[0] = out_page; 117 pages[0] = out_page;
227 nr_pages = 1; 118 nr_pages = 1;
@@ -260,6 +151,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
260 goto out; 151 goto out;
261 } 152 }
262 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 153 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
154 if (out_page == NULL) {
155 ret = -1;
156 goto out;
157 }
263 cpage_out = kmap(out_page); 158 cpage_out = kmap(out_page);
264 pages[nr_pages] = out_page; 159 pages[nr_pages] = out_page;
265 nr_pages++; 160 nr_pages++;
@@ -314,55 +209,26 @@ out:
314 kunmap(in_page); 209 kunmap(in_page);
315 page_cache_release(in_page); 210 page_cache_release(in_page);
316 } 211 }
317 free_workspace(workspace);
318 return ret; 212 return ret;
319} 213}
320 214
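
With the find/free calls stripped out of zlib_compress_pages(), callers are expected to go through a generic entry point that picks up a workspace, dispatches through the ops table, and puts the workspace back when done. A hedged sketch of that wrapper; the btrfs_compress_pages name and the 1-based type indexing are assumptions, though the ws parameter matches the new signature above:

int btrfs_compress_pages(int type, struct address_space *mapping,
                         u64 start, unsigned long len,
                         struct page **pages,
                         unsigned long nr_dest_pages,
                         unsigned long *out_pages,
                         unsigned long *total_in,
                         unsigned long *total_out,
                         unsigned long max_out)
{
        struct list_head *workspace;
        int ret;

        workspace = find_workspace(type);
        if (IS_ERR(workspace))
                return -1;

        /* hand the cached workspace to the algorithm's hook */
        ret = btrfs_compress_op[type - 1]->compress_pages(workspace, mapping,
                                                start, len, pages,
                                                nr_dest_pages, out_pages,
                                                total_in, total_out,
                                                max_out);
        free_workspace(type, workspace);
        return ret;
}
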
321/* 215static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
322 * pages_in is an array of pages with compressed data. 216 u64 disk_start,
323 * 217 struct bio_vec *bvec,
324 * disk_start is the starting logical offset of this array in the file 218 int vcnt,
325 * 219 size_t srclen)
326 * bvec is a bio_vec of pages from the file that we want to decompress into
327 *
328 * vcnt is the count of pages in the biovec
329 *
330 * srclen is the number of bytes in pages_in
331 *
332 * The basic idea is that we have a bio that was created by readpages.
333 * The pages in the bio are for the uncompressed data, and they may not
334 * be contiguous. They all correspond to the range of bytes covered by
335 * the compressed extent.
336 */
337int btrfs_zlib_decompress_biovec(struct page **pages_in,
338 u64 disk_start,
339 struct bio_vec *bvec,
340 int vcnt,
341 size_t srclen)
342{ 220{
343 int ret = 0; 221 struct workspace *workspace = list_entry(ws, struct workspace, list);
222 int ret = 0, ret2;
344 int wbits = MAX_WBITS; 223 int wbits = MAX_WBITS;
345 struct workspace *workspace;
346 char *data_in; 224 char *data_in;
347 size_t total_out = 0; 225 size_t total_out = 0;
348 unsigned long page_bytes_left;
349 unsigned long page_in_index = 0; 226 unsigned long page_in_index = 0;
350 unsigned long page_out_index = 0; 227 unsigned long page_out_index = 0;
351 struct page *page_out;
352 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 228 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
353 PAGE_CACHE_SIZE; 229 PAGE_CACHE_SIZE;
354 unsigned long buf_start; 230 unsigned long buf_start;
355 unsigned long buf_offset;
356 unsigned long bytes;
357 unsigned long working_bytes;
358 unsigned long pg_offset; 231 unsigned long pg_offset;
359 unsigned long start_byte;
360 unsigned long current_buf_start;
361 char *kaddr;
362
363 workspace = find_zlib_workspace();
364 if (IS_ERR(workspace))
365 return -ENOMEM;
366 232
367 data_in = kmap(pages_in[page_in_index]); 233 data_in = kmap(pages_in[page_in_index]);
368 workspace->inf_strm.next_in = data_in; 234 workspace->inf_strm.next_in = data_in;
@@ -372,8 +238,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
372 workspace->inf_strm.total_out = 0; 238 workspace->inf_strm.total_out = 0;
373 workspace->inf_strm.next_out = workspace->buf; 239 workspace->inf_strm.next_out = workspace->buf;
374 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 240 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
375 page_out = bvec[page_out_index].bv_page;
376 page_bytes_left = PAGE_CACHE_SIZE;
377 pg_offset = 0; 241 pg_offset = 0;
378 242
379 /* If it's deflate, and it's got no preset dictionary, then 243 /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +253,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
389 253
390 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
391 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "inflateInit failed\n");
392 ret = -1; 256 return -1;
393 goto out;
394 } 257 }
395 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
396 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
397 if (ret != Z_OK && ret != Z_STREAM_END) 260 if (ret != Z_OK && ret != Z_STREAM_END)
398 break; 261 break;
399 /*
 400 * buf_start is the byte offset we're at relative to the
 401 * start of our workspace buffer
402 */
403 buf_start = total_out;
404 262
405 /* total_out is the last byte of the workspace buffer */ 263 buf_start = total_out;
406 total_out = workspace->inf_strm.total_out; 264 total_out = workspace->inf_strm.total_out;
407 265
408 working_bytes = total_out - buf_start; 266 /* we didn't make progress in this inflate call, we're done */
409 267 if (buf_start == total_out)
410 /*
411 * start byte is the first byte of the page we're currently
412 * copying into relative to the start of the compressed data.
413 */
414 start_byte = page_offset(page_out) - disk_start;
415
416 if (working_bytes == 0) {
417 /* we didn't make progress in this inflate
418 * call, we're done
419 */
420 if (ret != Z_STREAM_END)
421 ret = -1;
422 break; 268 break;
423 }
424 269
425 /* we haven't yet hit data corresponding to this page */ 270 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
426 if (total_out <= start_byte) 271 total_out, disk_start,
427 goto next; 272 bvec, vcnt,
428 273 &page_out_index, &pg_offset);
429 /* 274 if (ret2 == 0) {
430 * the start of the data we care about is offset into 275 ret = 0;
431 * the middle of our working buffer 276 goto done;
432 */
433 if (total_out > start_byte && buf_start < start_byte) {
434 buf_offset = start_byte - buf_start;
435 working_bytes -= buf_offset;
436 } else {
437 buf_offset = 0;
438 }
439 current_buf_start = buf_start;
440
441 /* copy bytes from the working buffer into the pages */
442 while (working_bytes > 0) {
443 bytes = min(PAGE_CACHE_SIZE - pg_offset,
444 PAGE_CACHE_SIZE - buf_offset);
445 bytes = min(bytes, working_bytes);
446 kaddr = kmap_atomic(page_out, KM_USER0);
447 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
448 bytes);
449 kunmap_atomic(kaddr, KM_USER0);
450 flush_dcache_page(page_out);
451
452 pg_offset += bytes;
453 page_bytes_left -= bytes;
454 buf_offset += bytes;
455 working_bytes -= bytes;
456 current_buf_start += bytes;
457
458 /* check if we need to pick another page */
459 if (page_bytes_left == 0) {
460 page_out_index++;
461 if (page_out_index >= vcnt) {
462 ret = 0;
463 goto done;
464 }
465
466 page_out = bvec[page_out_index].bv_page;
467 pg_offset = 0;
468 page_bytes_left = PAGE_CACHE_SIZE;
469 start_byte = page_offset(page_out) - disk_start;
470
471 /*
472 * make sure our new page is covered by this
473 * working buffer
474 */
475 if (total_out <= start_byte)
476 goto next;
477
478 /* the next page in the biovec might not
479 * be adjacent to the last page, but it
480 * might still be found inside this working
481 * buffer. bump our offset pointer
482 */
483 if (total_out > start_byte &&
484 current_buf_start < start_byte) {
485 buf_offset = start_byte - buf_start;
486 working_bytes = total_out - start_byte;
487 current_buf_start = buf_start +
488 buf_offset;
489 }
490 }
491 } 277 }
492next: 278
493 workspace->inf_strm.next_out = workspace->buf; 279 workspace->inf_strm.next_out = workspace->buf;
494 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 280 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
495 281
@@ -516,35 +302,21 @@ done:
516 zlib_inflateEnd(&workspace->inf_strm); 302 zlib_inflateEnd(&workspace->inf_strm);
517 if (data_in) 303 if (data_in)
518 kunmap(pages_in[page_in_index]); 304 kunmap(pages_in[page_in_index]);
519out:
520 free_workspace(workspace);
521 return ret; 305 return ret;
522} 306}
523 307
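
The large open-coded copy loop removed above collapses into a single call to btrfs_decompress_buf2page(), a helper shared with the new lzo decompressor. Its contract, as far as this hunk shows, is sketched below; the parameter names are taken from the call site, and the comment text is an interpretation rather than the kernel's own documentation:

/*
 * sketch of the shared helper's contract (body lives in compression.c):
 * copy the decompressed window [buf_start, total_out) from buf into the
 * biovec pages whose file offsets (page_offset(page) - disk_start) fall
 * inside that window, advancing *pg_index and *pg_offset as pages fill.
 * Returns 0 once every page in the biovec has been filled, and nonzero
 * while there is still room for more output.
 */
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
                              unsigned long total_out, u64 disk_start,
                              struct bio_vec *bvec, int vcnt,
                              unsigned long *pg_index,
                              unsigned long *pg_offset);
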
524/* 308static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
525 * a less complex decompression routine. Our compressed data fits in a 309 struct page *dest_page,
526 * single page, and we want to read a single page out of it. 310 unsigned long start_byte,
 527 * start_byte tells us the offset into the uncompressed data we're interested in 311 size_t srclen, size_t destlen)
528 */
529int btrfs_zlib_decompress(unsigned char *data_in,
530 struct page *dest_page,
531 unsigned long start_byte,
532 size_t srclen, size_t destlen)
533{ 312{
313 struct workspace *workspace = list_entry(ws, struct workspace, list);
534 int ret = 0; 314 int ret = 0;
535 int wbits = MAX_WBITS; 315 int wbits = MAX_WBITS;
536 struct workspace *workspace;
537 unsigned long bytes_left = destlen; 316 unsigned long bytes_left = destlen;
538 unsigned long total_out = 0; 317 unsigned long total_out = 0;
539 char *kaddr; 318 char *kaddr;
540 319
541 if (destlen > PAGE_CACHE_SIZE)
542 return -ENOMEM;
543
544 workspace = find_zlib_workspace();
545 if (IS_ERR(workspace))
546 return -ENOMEM;
547
548 workspace->inf_strm.next_in = data_in; 320 workspace->inf_strm.next_in = data_in;
549 workspace->inf_strm.avail_in = srclen; 321 workspace->inf_strm.avail_in = srclen;
550 workspace->inf_strm.total_in = 0; 322 workspace->inf_strm.total_in = 0;
@@ -565,8 +337,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
565 337
566 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
567 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "inflateInit failed\n");
568 ret = -1; 340 return -1;
569 goto out;
570 } 341 }
571 342
572 while (bytes_left > 0) { 343 while (bytes_left > 0) {
@@ -616,12 +387,13 @@ next:
616 ret = 0; 387 ret = 0;
617 388
618 zlib_inflateEnd(&workspace->inf_strm); 389 zlib_inflateEnd(&workspace->inf_strm);
619out:
620 free_workspace(workspace);
621 return ret; 390 return ret;
622} 391}
623 392
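
Note that the destlen > PAGE_CACHE_SIZE guard was deleted from zlib_decompress(); it presumably moves up into a generic single-page entry point alongside the workspace handling. A sketch of what such a caller might look like, with btrfs_decompress as the assumed name:

int btrfs_decompress(int type, unsigned char *data_in,
                     struct page *dest_page, unsigned long start_byte,
                     size_t srclen, size_t destlen)
{
        struct list_head *workspace;
        int ret;

        /* the per-algorithm hooks only ever fill a single page */
        if (destlen > PAGE_CACHE_SIZE)
                return -ENOMEM;

        workspace = find_workspace(type);
        if (IS_ERR(workspace))
                return -ENOMEM;

        ret = btrfs_compress_op[type - 1]->decompress(workspace, data_in,
                                                      dest_page, start_byte,
                                                      srclen, destlen);
        free_workspace(type, workspace);
        return ret;
}
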
624void btrfs_zlib_exit(void) 393struct btrfs_compress_op btrfs_zlib_compress = {
625{ 394 .alloc_workspace = zlib_alloc_workspace,
626 free_workspaces(); 395 .free_workspace = zlib_free_workspace,
627} 396 .compress_pages = zlib_compress_pages,
397 .decompress_biovec = zlib_decompress_biovec,
398 .decompress = zlib_decompress,
399};
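
This ops table is what makes the type - 1 indexing in the sketches above work: compression.c can keep a single array of per-algorithm vtables, with the zlib entry defined here and the lzo entry exported by the new fs/btrfs/lzo.c. A sketch of that table; the exact declaration is assumed, not quoted from this diff:

struct btrfs_compress_op *btrfs_compress_op[] = {
        &btrfs_zlib_compress,
        &btrfs_lzo_compress,
};

With BTRFS_COMPRESS_ZLIB as type 1 and BTRFS_COMPRESS_LZO as type 2, type - 1 indexes straight into this array.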