author    Dmitry Torokhov <dmitry.torokhov@gmail.com>    2011-03-19 02:38:50 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>    2011-03-19 02:38:50 -0400
commit    97eb3f24352ec6632c2127b35d8087d2a809a9b9 (patch)
tree      722948059bbd325bbca232269490124231df80d4 /fs/btrfs
parent    439581ec07fa9cf3f519dd461a2cf41cfd3adcb4 (diff)
parent    def179c271ac9b5020deca798470521f14d11edd (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'fs/btrfs')
-rw-r--r--    fs/btrfs/Kconfig               2
-rw-r--r--    fs/btrfs/Makefile              2
-rw-r--r--    fs/btrfs/acl.c                25
-rw-r--r--    fs/btrfs/btrfs_inode.h         2
-rw-r--r--    fs/btrfs/compression.c       344
-rw-r--r--    fs/btrfs/compression.h        72
-rw-r--r--    fs/btrfs/ctree.c               8
-rw-r--r--    fs/btrfs/ctree.h              57
-rw-r--r--    fs/btrfs/disk-io.c           453
-rw-r--r--    fs/btrfs/disk-io.h             1
-rw-r--r--    fs/btrfs/export.c             88
-rw-r--r--    fs/btrfs/extent-tree.c       167
-rw-r--r--    fs/btrfs/extent_io.c          84
-rw-r--r--    fs/btrfs/extent_io.h          20
-rw-r--r--    fs/btrfs/extent_map.c          2
-rw-r--r--    fs/btrfs/extent_map.h          3
-rw-r--r--    fs/btrfs/file.c              225
-rw-r--r--    fs/btrfs/free-space-cache.c   12
-rw-r--r--    fs/btrfs/inode.c             505
-rw-r--r--    fs/btrfs/ioctl.c             245
-rw-r--r--    fs/btrfs/ioctl.h              26
-rw-r--r--    fs/btrfs/lzo.c               420
-rw-r--r--    fs/btrfs/ordered-data.c       85
-rw-r--r--    fs/btrfs/ordered-data.h       11
-rw-r--r--    fs/btrfs/orphan.c              6
-rw-r--r--    fs/btrfs/super.c             325
-rw-r--r--    fs/btrfs/transaction.c        16
-rw-r--r--    fs/btrfs/transaction.h         1
-rw-r--r--    fs/btrfs/tree-log.c           21
-rw-r--r--    fs/btrfs/volumes.c           674
-rw-r--r--    fs/btrfs/volumes.h            31
-rw-r--r--    fs/btrfs/xattr.c              18
-rw-r--r--    fs/btrfs/zlib.c              369
33 files changed, 3386 insertions(+), 934 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b6..15b5ca2a2606 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	size = __btrfs_getxattr(inode, name, value, size);
 	if (size > 0) {
 		acl = posix_acl_from_xattr(value, size);
-		if (IS_ERR(acl))
+		if (IS_ERR(acl)) {
+			kfree(value);
 			return acl;
+		}
 		set_cached_acl(inode, type, acl);
 	}
 	kfree(value);
@@ -185,18 +187,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	return ret;
 }
 
-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl;
 	int error = -EAGAIN;
 
-	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			error = -ECHILD;
 
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl) {
-		error = posix_acl_permission(inode, acl, mask);
-		posix_acl_release(acl);
+	} else {
+		struct posix_acl *acl;
+		acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (acl) {
+			error = posix_acl_permission(inode, acl, mask);
+			posix_acl_release(acl);
+		}
 	}
 
 	return error;
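
Note on the hunk above: the new flags argument distinguishes RCU-walk lookups from ordinary ones. Under IPERM_FLAG_RCU the function must not block, so it either answers from cached ACL state or returns -ECHILD to make the VFS retry in ref-walk mode. A rough stand-alone model of that contract (user-space C, all names invented for illustration, not kernel code):

	#include <stdio.h>

	#define FLAG_RCU	(1u << 0)
	#define MY_EAGAIN	11	/* "no ACL, fall back to mode bits" */
	#define MY_ECHILD	10	/* "can't answer without sleeping" */

	static int have_cached_no_acl;	/* 1 if the cache says "no ACL here" */

	/* models the shape of btrfs_check_acl(): never block under RCU */
	static int check_acl(unsigned flags)
	{
		if (flags & FLAG_RCU) {
			if (!have_cached_no_acl)
				return -MY_ECHILD;	/* retry in ref-walk mode */
			return -MY_EAGAIN;		/* known-absent ACL: cheap answer */
		}
		/* ref-walk: free to fetch the ACL from disk and check it */
		return -MY_EAGAIN;
	}

	int main(void)
	{
		printf("rcu, cold cache:  %d\n", check_acl(FLAG_RCU));
		have_cached_no_acl = 1;
		printf("rcu, cached miss: %d\n", check_acl(FLAG_RCU));
		return 0;
	}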
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
 	/*
 	 * always compress this one file
 	 */
-	unsigned force_compress:1;
+	unsigned force_compress:4;
 
 	struct inode vfs_inode;
 };
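
Widening force_compress from one bit to four lets the field carry a btrfs_compression_type value (zlib = 1, lzo = 2, with room for future algorithms) instead of a plain on/off boolean. A minimal stand-alone sketch of the idea (the field name mirrors the patch, everything else is invented for illustration):

	#include <stdio.h>

	enum compression_type { NONE = 0, ZLIB = 1, LZO = 2 };

	struct inode_flags {
		unsigned force_compress:4;	/* a 4-bit field holds values 0..15 */
	};

	int main(void)
	{
		struct inode_flags f = { .force_compress = LZO };
		/* the field now records *which* algorithm, not just on/off */
		printf("force_compress = %u\n", (unsigned)f.force_compress);
		return 0;
	}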
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7845d1f7d1d9..f745287fbf2e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
 	/* number of bytes on disk */
 	unsigned long compressed_len;
 
+	/* the compression algorithm for this bio */
+	int compress_type;
+
 	/* number of compressed pages in the array */
 	unsigned long nr_pages;
 
@@ -91,23 +94,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
 					u64 first_byte, gfp_t gfp_flags)
 {
-	struct bio *bio;
 	int nr_vecs;
 
 	nr_vecs = bio_get_nr_vecs(bdev);
-	bio = bio_alloc(gfp_flags, nr_vecs);
-
-	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-		while (!bio && (nr_vecs /= 2))
-			bio = bio_alloc(gfp_flags, nr_vecs);
-	}
-
-	if (bio) {
-		bio->bi_size = 0;
-		bio->bi_bdev = bdev;
-		bio->bi_sector = first_byte >> 9;
-	}
-	return bio;
+	return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
 }
 
 static int check_compressed_csum(struct inode *inode,
@@ -186,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
-					cb->start,
-					cb->orig_bio->bi_io_vec,
-					cb->orig_bio->bi_vcnt,
-					cb->compressed_len);
+	ret = btrfs_decompress_biovec(cb->compress_type,
+				      cb->compressed_pages,
+				      cb->start,
+				      cb->orig_bio->bi_io_vec,
+				      cb->orig_bio->bi_vcnt,
+				      cb->compressed_len);
 csum_failed:
 	if (ret)
 		cb->errors = 1;
@@ -601,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	cb->len = uncompressed_len;
 	cb->compressed_len = compressed_len;
+	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
 	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -690,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	bio_put(comp_bio);
 	return 0;
 }
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+	&btrfs_zlib_compress,
+	&btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		INIT_LIST_HEAD(&comp_idle_workspace[i]);
+		spin_lock_init(&comp_workspace_lock[i]);
+		atomic_set(&comp_alloc_workspace[i], 0);
+		init_waitqueue_head(&comp_workspace_wait[i]);
+	}
+	return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+	struct list_head *workspace;
+	int cpus = num_online_cpus();
+	int idx = type - 1;
+
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+again:
+	spin_lock(workspace_lock);
+	if (!list_empty(idle_workspace)) {
+		workspace = idle_workspace->next;
+		list_del(workspace);
+		(*num_workspace)--;
+		spin_unlock(workspace_lock);
+		return workspace;
+	}
+	if (atomic_read(alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+
+		spin_unlock(workspace_lock);
+		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+			schedule();
+		finish_wait(workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(alloc_workspace);
+	spin_unlock(workspace_lock);
+
+	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	if (IS_ERR(workspace)) {
+		atomic_dec(alloc_workspace);
+		wake_up(workspace_wait);
+	}
+	return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+	int idx = type - 1;
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+
+	spin_lock(workspace_lock);
+	if (*num_workspace < num_online_cpus()) {
+		list_add_tail(workspace, idle_workspace);
+		(*num_workspace)++;
+		spin_unlock(workspace_lock);
+		goto wake;
+	}
+	spin_unlock(workspace_lock);
+
+	btrfs_compress_op[idx]->free_workspace(workspace);
+	atomic_dec(alloc_workspace);
+wake:
+	if (waitqueue_active(workspace_wait))
+		wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+	struct list_head *workspace;
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		while (!list_empty(&comp_idle_workspace[i])) {
+			workspace = comp_idle_workspace[i].next;
+			list_del(workspace);
+			btrfs_compress_op[i]->free_workspace(workspace);
+			atomic_dec(&comp_alloc_workspace[i]);
+		}
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -1;
+
+	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+							start, len, pages,
+							nr_dest_pages, out_pages,
+							total_in, total_out,
+							max_out);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+							   disk_start,
+							   bvec, vcnt, srclen);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+						    dest_page, start_byte,
+						    srclen, destlen);
+
+	free_workspace(type, workspace);
+	return ret;
+}
+
+void __exit btrfs_exit_compress(void)
+{
+	free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset relative to the start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset)
+{
+	unsigned long buf_offset;
+	unsigned long current_buf_start;
+	unsigned long start_byte;
+	unsigned long working_bytes = total_out - buf_start;
+	unsigned long bytes;
+	char *kaddr;
+	struct page *page_out = bvec[*page_index].bv_page;
+
+	/*
+	 * start byte is the first byte of the page we're currently
+	 * copying into relative to the start of the compressed data.
+	 */
+	start_byte = page_offset(page_out) - disk_start;
+
+	/* we haven't yet hit data corresponding to this page */
+	if (total_out <= start_byte)
+		return 1;
+
+	/*
+	 * the start of the data we care about is offset into
+	 * the middle of our working buffer
+	 */
+	if (total_out > start_byte && buf_start < start_byte) {
+		buf_offset = start_byte - buf_start;
+		working_bytes -= buf_offset;
+	} else {
+		buf_offset = 0;
+	}
+	current_buf_start = buf_start;
+
+	/* copy bytes from the working buffer into the pages */
+	while (working_bytes > 0) {
+		bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, working_bytes);
+		kaddr = kmap_atomic(page_out, KM_USER0);
+		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(page_out);
+
+		*pg_offset += bytes;
+		buf_offset += bytes;
+		working_bytes -= bytes;
+		current_buf_start += bytes;
+
+		/* check if we need to pick another page */
+		if (*pg_offset == PAGE_CACHE_SIZE) {
+			(*page_index)++;
+			if (*page_index >= vcnt)
+				return 0;
+
+			page_out = bvec[*page_index].bv_page;
+			*pg_offset = 0;
+			start_byte = page_offset(page_out) - disk_start;
+
+			/*
+			 * make sure our new page is covered by this
+			 * working buffer
+			 */
+			if (total_out <= start_byte)
+				return 1;
+
+			/*
+			 * the next page in the biovec might not be adjacent
+			 * to the last page, but it might still be found
+			 * inside this working buffer.  bump our offset pointer
+			 */
+			if (total_out > start_byte &&
+			    current_buf_start < start_byte) {
+				buf_offset = start_byte - buf_start;
+				working_bytes = total_out - start_byte;
+				current_buf_start = buf_start + buf_offset;
+			}
+		}
+	}
+
+	return 1;
+}
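
The find_workspace()/free_workspace() pair above is a bounded per-type pool: reuse an idle workspace if one exists, allocate a new one up to roughly num_online_cpus(), and otherwise sleep until a workspace is returned. A minimal user-space sketch of the same pattern, with a mutex and condition variable standing in for the kernel's spinlock and waitqueue (all names invented, builds with cc sketch.c -lpthread):

	#include <pthread.h>
	#include <stdlib.h>

	#define MAX_WS 4	/* stand-in for num_online_cpus() */

	struct ws { struct ws *next; char buf[4096]; };

	static struct ws *idle_list;
	static int num_idle, num_alloc;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t freed = PTHREAD_COND_INITIALIZER;

	static struct ws *get_ws(void)
	{
		pthread_mutex_lock(&lock);
		for (;;) {
			if (idle_list) {		/* reuse an idle workspace */
				struct ws *w = idle_list;
				idle_list = w->next;
				num_idle--;
				pthread_mutex_unlock(&lock);
				return w;
			}
			if (num_alloc < MAX_WS) {	/* under the cap: allocate */
				num_alloc++;
				pthread_mutex_unlock(&lock);
				return calloc(1, sizeof(struct ws));
			}
			/* over the cap: sleep until someone frees one */
			pthread_cond_wait(&freed, &lock);
		}
	}

	static void put_ws(struct ws *w)
	{
		pthread_mutex_lock(&lock);
		if (num_idle < MAX_WS) {		/* keep it cached for reuse */
			w->next = idle_list;
			idle_list = w;
			num_idle++;
		} else {				/* enough cached: really free */
			free(w);
			num_alloc--;
		}
		pthread_cond_signal(&freed);
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		struct ws *w = get_ws();
		put_ws(w);
		return 0;
	}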
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
 
-int btrfs_zlib_decompress(unsigned char *data_in,
-			  struct page *dest_page,
-			  unsigned long start_byte,
-			  size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-			      u64 start, unsigned long len,
-			      struct page **pages,
-			      unsigned long nr_dest_pages,
-			      unsigned long *out_pages,
-			      unsigned long *total_in,
-			      unsigned long *total_out,
-			      unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-			      u64 disk_start,
-			      struct bio_vec *bvec,
-			      int vcnt,
-			      size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long len, u64 disk_start,
 				  unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
44 unsigned long nr_pages); 47 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 48int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags); 49 int mirror_num, unsigned long bio_flags);
50
51struct btrfs_compress_op {
52 struct list_head *(*alloc_workspace)(void);
53
54 void (*free_workspace)(struct list_head *workspace);
55
56 int (*compress_pages)(struct list_head *workspace,
57 struct address_space *mapping,
58 u64 start, unsigned long len,
59 struct page **pages,
60 unsigned long nr_dest_pages,
61 unsigned long *out_pages,
62 unsigned long *total_in,
63 unsigned long *total_out,
64 unsigned long max_out);
65
66 int (*decompress_biovec)(struct list_head *workspace,
67 struct page **pages_in,
68 u64 disk_start,
69 struct bio_vec *bvec,
70 int vcnt,
71 size_t srclen);
72
73 int (*decompress)(struct list_head *workspace,
74 unsigned char *data_in,
75 struct page *dest_page,
76 unsigned long start_byte,
77 size_t srclen, size_t destlen);
78};
79
80extern struct btrfs_compress_op btrfs_zlib_compress;
81extern struct btrfs_compress_op btrfs_lzo_compress;
82
47#endif 83#endif
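
struct btrfs_compress_op is the whole backend contract: a new algorithm only provides workspace alloc/free plus three (de)compression entry points, and gets picked at run time by indexing the ops array with (type - 1). A miniature user-space version of that dispatch (the two backends here are fakes that just copy bytes; everything is invented for illustration):

	#include <stdio.h>
	#include <string.h>

	struct compress_op {
		const char *name;
		int (*compress)(const char *in, char *out, int len);
	};

	static int fake_zlib(const char *in, char *out, int len)
	{ memcpy(out, in, len); return len; }
	static int fake_lzo(const char *in, char *out, int len)
	{ memcpy(out, in, len); return len; }

	static struct compress_op zlib_op = { "zlib", fake_zlib };
	static struct compress_op lzo_op  = { "lzo",  fake_lzo };

	enum { COMPRESS_ZLIB = 1, COMPRESS_LZO = 2 };

	/* index with (type - 1), exactly like btrfs_compress_op[type-1] */
	static struct compress_op *ops[] = { &zlib_op, &lzo_op };

	int main(void)
	{
		char out[16];
		int type = COMPRESS_LZO;
		int n = ops[type - 1]->compress("abcd", out, 4);
		printf("%s: %d bytes\n", ops[type - 1]->name, n);
		return 0;
	}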
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+	if (!p)
+		return;
 	btrfs_release_path(NULL, p);
 	kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);
 
 	right = read_node_slot(root, upper, slot + 1);
+	if (right == NULL)
+		return 1;
+
 	btrfs_tree_lock(right);
 	btrfs_set_lock_blocking(right);
 
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
+	if (left == NULL)
+		return 1;
+
 	btrfs_tree_lock(left);
 	btrfs_set_lock_blocking(left);
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8db9234f6b41..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/kobject.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
+
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 
@@ -398,13 +407,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
 
 #define BTRFS_FEATURE_COMPAT_SUPP	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP	0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
-	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -551,9 +562,11 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));
 
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_LAST  = 2,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
 };
 
 struct btrfs_inode_item {
@@ -597,6 +610,8 @@ struct btrfs_dir_item {
 	u8 type;
 } __attribute__ ((__packed__));
 
+#define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -808,9 +823,9 @@ struct btrfs_block_group_cache {
 	int extents_thresh;
 	int free_extents;
 	int total_bitmaps;
-	int ro:1;
-	int dirty:1;
-	int iref:1;
+	unsigned int ro:1;
+	unsigned int dirty:1;
+	unsigned int iref:1;
 
 	int disk_cache_state;
 
@@ -895,7 +910,8 @@ struct btrfs_fs_info {
 	 */
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
-	unsigned long mount_opt;
+	unsigned long mount_opt:20;
+	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1050,6 +1066,9 @@ struct btrfs_fs_info {
 	unsigned metadata_ratio;
 
 	void *bdev_holder;
+
+	/* filesystem state */
+	u64 fs_state;
 };
 
 /*
@@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
 
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+				   u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes);
+
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2541,10 +2572,18 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno)				\
+do {								\
+	if ((errno))						\
+		__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 #else
 #define btrfs_check_acl NULL
 #endif
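
The btrfs_std_error wrapper above captures __func__ and __LINE__ at the call site and forwards to __btrfs_std_error only for a non-zero errno; the do { } while (0) shell keeps the macro safe inside unbraced if/else bodies. A user-space mock of the same pattern (names invented, not the kernel implementation):

	#include <stdio.h>

	static void __std_error(const char *function, unsigned int line, int err)
	{
		fprintf(stderr, "error %d in %s:%u, forcing read-only\n",
			err, function, line);
	}

	#define std_error(err)					\
	do {							\
		if ((err))					\
			__std_error(__func__, __LINE__, (err));	\
	} while (0)

	int main(void)
	{
		int ret = -5;	/* pretend -EIO came back from an I/O path */
		if (ret)
			std_error(ret);	/* safe without braces, thanks to do/while(0) */
		return 0;
	}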
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb827d0d7181..b531c36455d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
 #include <linux/slab.h>
+#include <linux/migrate.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -43,6 +44,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -352,9 +367,15 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		WARN_ON(1);
+		goto out;
+	}
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
+	WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
+
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
 		WARN_ON(1);
@@ -424,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		ret = -EIO;
+		goto out;
+	}
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
@@ -693,6 +718,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 __btree_submit_bio_done);
 }
 
+#ifdef CONFIG_MIGRATION
+static int btree_migratepage(struct address_space *mapping,
+			     struct page *newpage, struct page *page)
+{
+	/*
+	 * we can't safely write a btree page from here,
+	 * we haven't done the locking hook
+	 */
+	if (PageDirty(page))
+		return -EAGAIN;
+	/*
+	 * Buffers may be managed in a filesystem specific way.
+	 * We must have no buffers or drop them.
+	 */
+	if (page_has_private(page) &&
+	    !try_to_release_page(page, GFP_KERNEL))
+		return -EAGAIN;
+	return migrate_page(mapping, newpage, page);
+}
+#endif
+
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
@@ -707,8 +753,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	redirty_page_for_writepage(wbc, page);
-	eb = btrfs_find_tree_block(root, page_offset(page),
-				   PAGE_CACHE_SIZE);
+	eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
 	WARN_ON(!eb);
 
 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -799,6 +844,9 @@ static const struct address_space_operations btree_aops = {
 	.releasepage	= btree_releasepage,
 	.invalidatepage = btree_invalidatepage,
 	.sync_page	= block_sync_page,
+#ifdef CONFIG_MIGRATION
+	.migratepage	= btree_migratepage,
+#endif
 };
 
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -981,7 +1029,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
-	BUG_ON(!root->node);
+	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
+		free_extent_buffer(root->node);
+		return -EIO;
+	}
 	root->commit_root = btrfs_root_node(root);
 	return 0;
 }
@@ -1116,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	}
 	btrfs_free_path(path);
 	if (ret) {
+		kfree(root);
 		if (ret > 0)
 			ret = -ENOENT;
 		return ERR_PTR(ret);
@@ -1538,10 +1590,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 						 GFP_NOFS);
 	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
-	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
-					       GFP_NOFS);
-	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
-						GFP_NOFS);
+	struct btrfs_root *tree_root = btrfs_sb(sb);
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1686,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-	if (!bh)
+	if (!bh) {
+		err = -EINVAL;
 		goto fail_iput;
+	}
 
 	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
 	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1700,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_iput;
 
+	/* check FS state, whether FS is broken. */
+	fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
 	ret = btrfs_parse_options(tree_root, options);
 	if (ret) {
 		err = ret;
@@ -1717,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
+	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+	if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+	btrfs_set_super_incompat_flags(disk_super, features);
 
 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1930,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
-	if (btrfs_super_log_root(disk_super) != 0) {
+	/* do not make disk changes in broken FS */
+	if (btrfs_super_log_root(disk_super) != 0 &&
+	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
@@ -2415,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
 	smp_mb();
 
 	btrfs_put_block_group_cache(fs_info);
+
+	/*
+	 * Here come 2 situations when btrfs is broken to flip readonly:
+	 *
+	 * 1. when btrfs flips readonly somewhere else before
+	 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+	 * and btrfs will skip to write sb directly to keep
+	 * ERROR state on disk.
+	 *
+	 * 2. when btrfs flips readonly just in btrfs_commit_super,
+	 * and in such case, btrfs cannot write sb via btrfs_commit_super,
+	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+	 * btrfs will cleanup all FS resources first and write sb then.
+	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+	}
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		ret = btrfs_error_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
@@ -2592,6 +2671,352 @@ out:
 	return 0;
 }
 
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only)
+{
+	if (read_only)
+		return;
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		printk(KERN_WARNING "warning: mount fs with errors, "
+		       "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+	int ret;
+
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(root);
+
+	ret = write_ctree_super(NULL, root, 0);
+
+	return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 ordered_operations);
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while (!list_empty(&splice)) {
+		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+				     root_extent_list);
+
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+
+		/* the inode may be getting freed (in sys_unlink path). */
+		inode = igrab(ordered->inode);
+
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+		if (inode)
+			iput(inode);
+
+		atomic_set(&ordered->refs, 1);
+		btrfs_put_ordered_extent(ordered);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	int ret = 0;
+
+	delayed_refs = &trans->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	if (delayed_refs->num_entries == 0) {
+		printk(KERN_INFO "delayed_refs has NO entry\n");
+		return ret;
+	}
+
+	node = rb_first(&delayed_refs->root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
+
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+
+		atomic_set(&ref->refs, 1);
+		if (btrfs_delayed_ref_is_head(ref)) {
+			struct btrfs_delayed_ref_head *head;
+
+			head = btrfs_delayed_node_to_head(ref);
+			mutex_lock(&head->mutex);
+			kfree(head->extent_op);
+			delayed_refs->num_heads--;
+			if (list_empty(&head->cluster))
+				delayed_refs->num_heads_ready--;
+			list_del_init(&head->cluster);
+			mutex_unlock(&head->mutex);
+		}
+
+		spin_unlock(&delayed_refs->lock);
+		btrfs_put_delayed_ref(ref);
+
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+
+	spin_unlock(&delayed_refs->lock);
+
+	return ret;
+}
+
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+	struct btrfs_pending_snapshot *snapshot;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&t->pending_snapshots, &splice);
+
+	while (!list_empty(&splice)) {
+		snapshot = list_entry(splice.next,
+				      struct btrfs_pending_snapshot,
+				      list);
+
+		list_del_init(&snapshot->list);
+
+		kfree(snapshot);
+	}
+
+	return 0;
+}
+
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 delalloc_inodes);
+
+		list_del_init(&btrfs_inode->delalloc_inodes);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark)
+{
+	int ret;
+	struct page *page;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_buffer *eb;
+	u64 start = 0;
+	u64 end;
+	u64 offset;
+	unsigned long index;
+
+	while (1) {
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+					    mark);
+		if (ret)
+			break;
+
+		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+		while (start <= end) {
+			index = start >> PAGE_CACHE_SHIFT;
+			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+			page = find_get_page(btree_inode->i_mapping, index);
+			if (!page)
+				continue;
+			offset = page_offset(page);
+
+			spin_lock(&dirty_pages->buffer_lock);
+			eb = radix_tree_lookup(
+			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+					       offset >> PAGE_CACHE_SHIFT);
+			spin_unlock(&dirty_pages->buffer_lock);
+			if (eb) {
+				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+							 &eb->bflags);
+				atomic_set(&eb->refs, 1);
+			}
+			if (PageWriteback(page))
+				end_page_writeback(page);
+
+			lock_page(page);
+			if (PageDirty(page)) {
+				clear_page_dirty_for_io(page);
+				spin_lock_irq(&page->mapping->tree_lock);
+				radix_tree_tag_clear(&page->mapping->page_tree,
+							page_index(page),
+							PAGECACHE_TAG_DIRTY);
+				spin_unlock_irq(&page->mapping->tree_lock);
+			}
+
+			page->mapping->a_ops->invalidatepage(page, 0);
+			unlock_page(page);
+		}
+	}
+
+	return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents)
+{
+	struct extent_io_tree *unpin;
+	u64 start;
+	u64 end;
+	int ret;
+
+	unpin = pinned_extents;
+	while (1) {
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		/* opt_discard */
+		ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		btrfs_error_unpin_extent_range(root, start, end);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+	struct btrfs_transaction *t;
+	LIST_HEAD(list);
+
+	WARN_ON(1);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+	list_splice_init(&root->fs_info->trans_list, &list);
+	while (!list_empty(&list)) {
+		t = list_entry(list.next, struct btrfs_transaction, list);
+		if (!t)
+			break;
+
+		btrfs_destroy_ordered_operations(root);
+
+		btrfs_destroy_ordered_extents(root);
+
+		btrfs_destroy_delayed_refs(t, root);
+
+		btrfs_block_rsv_release(root,
+					&root->fs_info->trans_block_rsv,
+					t->dirty_pages.dirty_bytes);
+
+		/* FIXME: cleanup wait for commit */
+		t->in_commit = 1;
+		t->blocked = 1;
+		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+			wake_up(&root->fs_info->transaction_blocked_wait);
+
+		t->blocked = 0;
+		if (waitqueue_active(&root->fs_info->transaction_wait))
+			wake_up(&root->fs_info->transaction_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		t->commit_done = 1;
+		if (waitqueue_active(&t->commit_wait))
+			wake_up(&t->commit_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+
+		btrfs_destroy_pending_snapshots(t);
+
+		btrfs_destroy_delalloc_inodes(root);
+
+		spin_lock(&root->fs_info->new_trans_lock);
+		root->fs_info->running_transaction = NULL;
+		spin_unlock(&root->fs_info->new_trans_lock);
+
+		btrfs_destroy_marked_extents(root, &t->dirty_pages,
+					     EXTENT_DIRTY);
+
+		btrfs_destroy_pinned_extent(root,
+					    root->fs_info->pinned_extents);
+
+		t->use_count = 0;
+		list_del_init(&t->list);
+		memset(t, 0, sizeof(*t));
+		kmem_cache_free(btrfs_transaction_cachep, t);
+	}
+
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
 	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
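
The shutdown path added above encodes a simple rule: a normal commit runs only for writable mounts, and an error commit runs whenever BTRFS_SUPER_FLAG_ERROR is set, so a broken filesystem still gets its ERROR state and cleaned-up resources flushed to disk. Schematically (a hedged stand-alone model in user-space C, not kernel code):

	#include <stdio.h>

	#define MY_MS_RDONLY	(1u << 0)	/* stand-ins for the real flags */
	#define MY_FLAG_ERROR	(1u << 2)

	static void commit_super(void)       { puts("normal commit"); }
	static void error_commit_super(void) { puts("error commit (cleanup first)"); }

	/* mirrors the two independent checks at the end of close_ctree() */
	static void close_fs(unsigned sb_flags, unsigned fs_state)
	{
		if (!(sb_flags & MY_MS_RDONLY))
			commit_super();
		if (fs_state & MY_FLAG_ERROR)
			error_commit_super();
	}

	int main(void)
	{
		close_fs(0, 0);				/* healthy rw mount */
		close_fs(MY_MS_RDONLY, MY_FLAG_ERROR);	/* flipped read-only on error */
		return 0;
	}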
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
+int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f4..9786963b07e5 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
 	struct btrfs_root *root;
-	struct dentry *dentry;
 	struct inode *inode;
 	struct btrfs_key key;
 	int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 		return ERR_PTR(-ESTALE);
 	}
 
-	dentry = d_obtain_alias(inode);
-	if (!IS_ERR(dentry))
-		dentry->d_op = &btrfs_dentry_operations;
-	return dentry;
+	return d_obtain_alias(inode);
 fail:
 	srcu_read_unlock(&fs_info->subvol_srcu, index);
 	return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 162static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 163{
168 struct inode *dir = child->d_inode; 164 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 165 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 166 struct btrfs_path *path;
172 struct extent_buffer *leaf; 167 struct extent_buffer *leaf;
@@ -223,18 +218,91 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
-	if (!IS_ERR(dentry))
-		dentry->d_op = &btrfs_dentry_operations;
-	return dentry;
+	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 fail:
 	btrfs_free_path(path);
 	return ERR_PTR(ret);
 }
 
+static int btrfs_get_name(struct dentry *parent, char *name,
+			  struct dentry *child)
+{
+	struct inode *inode = child->d_inode;
+	struct inode *dir = parent->d_inode;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_inode_ref *iref;
+	struct btrfs_root_ref *rref;
+	struct extent_buffer *leaf;
+	unsigned long name_ptr;
+	struct btrfs_key key;
+	int name_len;
+	int ret;
+
+	if (!dir || !inode)
+		return -EINVAL;
+
+	if (!S_ISDIR(dir->i_mode))
+		return -EINVAL;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = (u64)-1;
+		root = root->fs_info->tree_root;
+	} else {
+		key.objectid = inode->i_ino;
+		key.offset = dir->i_ino;
+		key.type = BTRFS_INODE_REF_KEY;
+	}
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		btrfs_free_path(path);
+		return ret;
+	} else if (ret > 0) {
+		if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+			path->slots[0]--;
+		} else {
+			btrfs_free_path(path);
+			return -ENOENT;
+		}
+	}
+	leaf = path->nodes[0];
+
+	if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+		rref = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_root_ref);
+		name_ptr = (unsigned long)(rref + 1);
+		name_len = btrfs_root_ref_name_len(leaf, rref);
+	} else {
+		iref = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_inode_ref);
+		name_ptr = (unsigned long)(iref + 1);
+		name_len = btrfs_inode_ref_name_len(leaf, iref);
+	}
+
+	read_extent_buffer(leaf, name, name_ptr, name_len);
+	btrfs_free_path(path);
+
+	/*
+	 * have to add the null termination to make sure that reconnect_path
+	 * gets the right len for strlen
+	 */
+	name[name_len] = '\0';
+
+	return 0;
+}
+
 const struct export_operations btrfs_export_ops = {
 	.encode_fh	= btrfs_encode_fh,
 	.fh_to_dentry	= btrfs_fh_to_dentry,
 	.fh_to_parent	= btrfs_fh_to_parent,
 	.get_parent	= btrfs_get_parent,
+	.get_name	= btrfs_get_name,
 };
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c097f3aec41..b55269340cec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
 
 static int cache_block_group(struct btrfs_block_group_cache *cache,
 			     struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
 			     int load_cache_only)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
442 443
443 /* 444 /*
444 * We can't do the read from on-disk cache during a commit since we need 445 * We can't do the read from on-disk cache during a commit since we need
445 * to have the normal tree locking. 446 * to have the normal tree locking. Also if we are currently trying to
447 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks.
446 */ 449 */
447 if (!trans->transaction->in_commit) { 450 if (!trans->transaction->in_commit &&
451 (root && root != root->fs_info->tree_root)) {
448 spin_lock(&cache->lock); 452 spin_lock(&cache->lock);
449 if (cache->cached != BTRFS_CACHE_NO) { 453 if (cache->cached != BTRFS_CACHE_NO) {
450 spin_unlock(&cache->lock); 454 spin_unlock(&cache->lock);
@@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2741 struct btrfs_root *root = block_group->fs_info->tree_root; 2745 struct btrfs_root *root = block_group->fs_info->tree_root;
2742 struct inode *inode = NULL; 2746 struct inode *inode = NULL;
2743 u64 alloc_hint = 0; 2747 u64 alloc_hint = 0;
2748 int dcs = BTRFS_DC_ERROR;
2744 int num_pages = 0; 2749 int num_pages = 0;
2745 int retries = 0; 2750 int retries = 0;
2746 int ret = 0; 2751 int ret = 0;
@@ -2795,6 +2800,8 @@ again:
2795 2800
2796 spin_lock(&block_group->lock); 2801 spin_lock(&block_group->lock);
2797 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2802 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2803 /* We're not cached, don't bother trying to write stuff out */
2804 dcs = BTRFS_DC_WRITTEN;
2798 spin_unlock(&block_group->lock); 2805 spin_unlock(&block_group->lock);
2799 goto out_put; 2806 goto out_put;
2800 } 2807 }
@@ -2821,6 +2828,8 @@ again:
2821 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 2828 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2822 num_pages, num_pages, 2829 num_pages, num_pages,
2823 &alloc_hint); 2830 &alloc_hint);
2831 if (!ret)
2832 dcs = BTRFS_DC_SETUP;
2824 btrfs_free_reserved_data_space(inode, num_pages); 2833 btrfs_free_reserved_data_space(inode, num_pages);
2825out_put: 2834out_put:
2826 iput(inode); 2835 iput(inode);
@@ -2828,10 +2837,7 @@ out_free:
2828 btrfs_release_path(root, path); 2837 btrfs_release_path(root, path);
2829out: 2838out:
2830 spin_lock(&block_group->lock); 2839 spin_lock(&block_group->lock);
2831 if (ret) 2840 block_group->disk_cache_state = dcs;
2832 block_group->disk_cache_state = BTRFS_DC_ERROR;
2833 else
2834 block_group->disk_cache_state = BTRFS_DC_SETUP;
2835 spin_unlock(&block_group->lock); 2841 spin_unlock(&block_group->lock);
2836 2842
2837 return ret; 2843 return ret;
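
The dcs change above replaces the ret-based final assignment with a tri-state recorded along the way: start at BTRFS_DC_ERROR, switch to BTRFS_DC_WRITTEN when there is nothing to write out, and to BTRFS_DC_SETUP only once the preallocation succeeds, then publish the result a single time under the block group lock. A stand-alone sketch of the pattern, with hypothetical names in place of the kernel's:

#include <stdio.h>

enum dc_state { DC_ERROR, DC_WRITTEN, DC_SETUP };

/* stand-in for cache_save_setup(): the state starts pessimistic and is
 * only upgraded on the paths that are actually fine */
static enum dc_state save_setup(int cached, int prealloc_ok)
{
        enum dc_state dcs = DC_ERROR;

        if (!cached)
                return DC_WRITTEN;      /* nothing to write, not an error */
        if (prealloc_ok)
                dcs = DC_SETUP;         /* the one success path upgrades it */
        return dcs;                     /* published once, e.g. under a lock */
}

int main(void)
{
        printf("%d %d %d\n", save_setup(0, 0),  /* 1: DC_WRITTEN */
               save_setup(1, 1),                /* 2: DC_SETUP   */
               save_setup(1, 0));               /* 0: DC_ERROR   */
        return 0;
}
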
@@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3037 3043
3038u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3044u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3039{ 3045{
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3046 /*
3047 * we add in the count of missing devices because we want
3048 * to make sure that any RAID levels on a degraded FS
3049 * continue to be honored.
3050 */
3051 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3052 root->fs_info->fs_devices->missing_devices;
3041 3053
3042 if (num_devices == 1) 3054 if (num_devices == 1)
3043 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3055 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
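
The comment above is the heart of this fix: num_devices now counts missing devices too, so a degraded mount keeps honoring its RAID profile instead of collapsing new allocations down to DUP/single. A compilable toy model of the rule, with stand-in flag values rather than the real BTRFS_BLOCK_GROUP_* constants:

#include <stdio.h>

#define BG_RAID0 0x1ULL         /* stand-ins for BTRFS_BLOCK_GROUP_* */
#define BG_RAID1 0x2ULL

static unsigned long long reduce_profile(unsigned long long flags,
                                         unsigned rw_devices,
                                         unsigned missing_devices)
{
        /* count missing devices so a degraded FS keeps its RAID levels */
        unsigned long long num_devices = rw_devices + missing_devices;

        if (num_devices == 1)
                flags &= ~(BG_RAID1 | BG_RAID0);
        return flags;
}

int main(void)
{
        /* two-disk RAID1 running degraded: 1 rw device + 1 missing */
        printf("missing counted:  %#llx\n", reduce_profile(BG_RAID1, 1, 1));
        printf("old behaviour:    %#llx\n", reduce_profile(BG_RAID1, 1, 0));
        return 0;
}
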
@@ -3077,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3077 return btrfs_reduce_alloc_profile(root, flags); 3089 return btrfs_reduce_alloc_profile(root, flags);
3078} 3090}
3079 3091
3080static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3092u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3081{ 3093{
3082 u64 flags; 3094 u64 flags;
3083 3095
@@ -3149,8 +3161,12 @@ alloc:
3149 bytes + 2 * 1024 * 1024, 3161 bytes + 2 * 1024 * 1024,
3150 alloc_target, 0); 3162 alloc_target, 0);
3151 btrfs_end_transaction(trans, root); 3163 btrfs_end_transaction(trans, root);
3152 if (ret < 0) 3164 if (ret < 0) {
3153 return ret; 3165 if (ret != -ENOSPC)
3166 return ret;
3167 else
3168 goto commit_trans;
3169 }
3154 3170
3155 if (!data_sinfo) { 3171 if (!data_sinfo) {
3156 btrfs_set_inode_space_info(root, inode); 3172 btrfs_set_inode_space_info(root, inode);
@@ -3161,6 +3177,7 @@ alloc:
3161 spin_unlock(&data_sinfo->lock); 3177 spin_unlock(&data_sinfo->lock);
3162 3178
3163 /* commit the current transaction and try again */ 3179 /* commit the current transaction and try again */
3180commit_trans:
3164 if (!committed && !root->fs_info->open_ioctl_trans) { 3181 if (!committed && !root->fs_info->open_ioctl_trans) {
3165 committed = 1; 3182 committed = 1;
3166 trans = btrfs_join_transaction(root, 1); 3183 trans = btrfs_join_transaction(root, 1);
@@ -3412,7 +3429,7 @@ again:
3412 * our reservation. 3429 * our reservation.
3413 */ 3430 */
3414 if (unused <= space_info->total_bytes) { 3431 if (unused <= space_info->total_bytes) {
3415 unused -= space_info->total_bytes; 3432 unused = space_info->total_bytes - unused;
3416 if (unused >= num_bytes) { 3433 if (unused >= num_bytes) {
3417 if (!reserved) 3434 if (!reserved)
3418 space_info->bytes_reserved += orig_bytes; 3435 space_info->bytes_reserved += orig_bytes;
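
The one-line change above fixes reversed operands in unsigned arithmetic: under the guard unused <= total_bytes, the old `unused -= space_info->total_bytes` wrapped around to an enormous u64, so the following `>= num_bytes` test could grant reservations for space that did not exist. A small demonstration of the wraparound:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
        uint64_t total = 100, unused = 40, num_bytes = 70;

        /* the kernel guard guarantees unused <= total here */
        uint64_t buggy = unused - total;        /* wraps to 2^64 - 60 */
        uint64_t fixed = total - unused;        /* 60 bytes actually free */

        printf("buggy %" PRIu64 " >= %" PRIu64 " -> %d (wrongly granted)\n",
               buggy, num_bytes, buggy >= num_bytes);
        printf("fixed %" PRIu64 " >= %" PRIu64 " -> %d (correctly refused)\n",
               fixed, num_bytes, fixed >= num_bytes);
        return 0;
}
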
@@ -3709,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3709 return 0; 3726 return 0;
3710 } 3727 }
3711 3728
3712 WARN_ON(1);
3713 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3714 block_rsv->size, block_rsv->reserved,
3715 block_rsv->freed[0], block_rsv->freed[1]);
3716
3717 return -ENOSPC; 3729 return -ENOSPC;
3718} 3730}
3719 3731
@@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4080 * space back to the block group, otherwise we will leak space. 4092 * space back to the block group, otherwise we will leak space.
4081 */ 4093 */
4082 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4094 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4083 cache_block_group(cache, trans, 1); 4095 cache_block_group(cache, trans, NULL, 1);
4084 4096
4085 byte_in_group = bytenr - cache->key.objectid; 4097 byte_in_group = bytenr - cache->key.objectid;
4086 WARN_ON(byte_in_group > cache->key.offset); 4098 WARN_ON(byte_in_group > cache->key.offset);
@@ -4930,11 +4942,31 @@ search:
4930 btrfs_get_block_group(block_group); 4942 btrfs_get_block_group(block_group);
4931 search_start = block_group->key.objectid; 4943 search_start = block_group->key.objectid;
4932 4944
4945 /*
4946 * this can happen if we end up cycling through all the
4947 * raid types, but we want to make sure we only allocate
4948 * for the proper type.
4949 */
4950 if (!block_group_bits(block_group, data)) {
4951 u64 extra = BTRFS_BLOCK_GROUP_DUP |
4952 BTRFS_BLOCK_GROUP_RAID1 |
4953 BTRFS_BLOCK_GROUP_RAID10;
4954
4955 /*
4956 * if they asked for extra copies and this block group
4957 * doesn't provide them, bail. This does allow us to
4958 * fill raid0 from raid1.
4959 */
4960 if ((data & extra) && !(block_group->flags & extra))
4961 goto loop;
4962 }
4963
4933have_block_group: 4964have_block_group:
4934 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4965 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4935 u64 free_percent; 4966 u64 free_percent;
4936 4967
4937 ret = cache_block_group(block_group, trans, 1); 4968 ret = cache_block_group(block_group, trans,
4969 orig_root, 1);
4938 if (block_group->cached == BTRFS_CACHE_FINISHED) 4970 if (block_group->cached == BTRFS_CACHE_FINISHED)
4939 goto have_block_group; 4971 goto have_block_group;
4940 4972
@@ -4958,7 +4990,8 @@ have_block_group:
4958 if (loop > LOOP_CACHING_NOWAIT || 4990 if (loop > LOOP_CACHING_NOWAIT ||
4959 (loop > LOOP_FIND_IDEAL && 4991 (loop > LOOP_FIND_IDEAL &&
4960 atomic_read(&space_info->caching_threads) < 2)) { 4992 atomic_read(&space_info->caching_threads) < 2)) {
4961 ret = cache_block_group(block_group, trans, 0); 4993 ret = cache_block_group(block_group, trans,
4994 orig_root, 0);
4962 BUG_ON(ret); 4995 BUG_ON(ret);
4963 } 4996 }
4964 found_uncached_bg = true; 4997 found_uncached_bg = true;
@@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5515 u64 num_bytes = ins->offset; 5548 u64 num_bytes = ins->offset;
5516 5549
5517 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5550 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5518 cache_block_group(block_group, trans, 0); 5551 cache_block_group(block_group, trans, NULL, 0);
5519 caching_ctl = get_caching_control(block_group); 5552 caching_ctl = get_caching_control(block_group);
5520 5553
5521 if (!caching_ctl) { 5554 if (!caching_ctl) {
@@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6300 NULL, NULL); 6333 NULL, NULL);
6301 BUG_ON(ret < 0); 6334 BUG_ON(ret < 0);
6302 if (ret > 0) { 6335 if (ret > 0) {
6303 ret = btrfs_del_orphan_item(trans, tree_root, 6336 /* if we fail to delete the orphan item this time
6304 root->root_key.objectid); 6337 * around, it'll get picked up the next time.
6305 BUG_ON(ret); 6338 *
6339 * The most common failure here is just -ENOENT.
6340 */
6341 btrfs_del_orphan_item(trans, tree_root,
6342 root->root_key.objectid);
6306 } 6343 }
6307 } 6344 }
6308 6345
@@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7878 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7915 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7879 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7916 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7880 7917
7881 num_devices = root->fs_info->fs_devices->rw_devices; 7918 /*
7919 * we add in the count of missing devices because we want
7920 * to make sure that any RAID levels on a degraded FS
7921 * continue to be honored.
7922 */
7923 num_devices = root->fs_info->fs_devices->rw_devices +
7924 root->fs_info->fs_devices->missing_devices;
7925
7882 if (num_devices == 1) { 7926 if (num_devices == 1) {
7883 stripped |= BTRFS_BLOCK_GROUP_DUP; 7927 stripped |= BTRFS_BLOCK_GROUP_DUP;
7884 stripped = flags & ~stripped; 7928 stripped = flags & ~stripped;
@@ -7926,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7926 7970
7927 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 7971 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7928 sinfo->bytes_may_use + sinfo->bytes_readonly + 7972 sinfo->bytes_may_use + sinfo->bytes_readonly +
7929 cache->reserved_pinned + num_bytes < sinfo->total_bytes) { 7973 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7930 sinfo->bytes_readonly += num_bytes; 7974 sinfo->bytes_readonly += num_bytes;
7931 sinfo->bytes_reserved += cache->reserved_pinned; 7975 sinfo->bytes_reserved += cache->reserved_pinned;
7932 cache->reserved_pinned = 0; 7976 cache->reserved_pinned = 0;
7933 cache->ro = 1; 7977 cache->ro = 1;
7934 ret = 0; 7978 ret = 0;
7935 } 7979 }
7980
7936 spin_unlock(&cache->lock); 7981 spin_unlock(&cache->lock);
7937 spin_unlock(&sinfo->lock); 7982 spin_unlock(&sinfo->lock);
7938 return ret; 7983 return ret;
@@ -7968,6 +8013,62 @@ out:
7968 return ret; 8013 return ret;
7969} 8014}
7970 8015
8016/*
8017 * helper to account the unused space of all the readonly block group in the
8018 * list. takes mirrors into account.
8019 */
8020static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8021{
8022 struct btrfs_block_group_cache *block_group;
8023 u64 free_bytes = 0;
8024 int factor;
8025
8026 list_for_each_entry(block_group, groups_list, list) {
8027 spin_lock(&block_group->lock);
8028
8029 if (!block_group->ro) {
8030 spin_unlock(&block_group->lock);
8031 continue;
8032 }
8033
8034 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8035 BTRFS_BLOCK_GROUP_RAID10 |
8036 BTRFS_BLOCK_GROUP_DUP))
8037 factor = 2;
8038 else
8039 factor = 1;
8040
8041 free_bytes += (block_group->key.offset -
8042 btrfs_block_group_used(&block_group->item)) *
8043 factor;
8044
8045 spin_unlock(&block_group->lock);
8046 }
8047
8048 return free_bytes;
8049}
8050
8051/*
8052 * helper to account the unused space of all the readonly block group in the
8053 * space_info. takes mirrors into account.
8054 */
8055u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8056{
8057 int i;
8058 u64 free_bytes = 0;
8059
8060 spin_lock(&sinfo->lock);
8061
8062 for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8063 if (!list_empty(&sinfo->block_groups[i]))
8064 free_bytes += __btrfs_get_ro_block_group_free_space(
8065 &sinfo->block_groups[i]);
8066
8067 spin_unlock(&sinfo->lock);
8068
8069 return free_bytes;
8070}
8071
7971int btrfs_set_block_group_rw(struct btrfs_root *root, 8072int btrfs_set_block_group_rw(struct btrfs_root *root,
7972 struct btrfs_block_group_cache *cache) 8073 struct btrfs_block_group_cache *cache)
7973{ 8074{
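
The two helpers added above fold mirroring into the free-space figure: RAID1, RAID10 and DUP store every byte twice, so the raw space a read-only block group gives back is (size - used) * 2. A toy version of the arithmetic:

#include <stdio.h>
#include <stdint.h>

/* raw free space a read-only block group accounts for, mirrors included */
static uint64_t ro_free_bytes(uint64_t size, uint64_t used, int mirrored)
{
        int factor = mirrored ? 2 : 1;  /* RAID1/RAID10/DUP write data twice */

        return (size - used) * factor;
}

int main(void)
{
        /* 1024 MiB RAID1 block group with 256 MiB used: 1536 MiB raw */
        printf("%llu MiB\n", (unsigned long long)ro_free_bytes(1024, 256, 1));
        return 0;
}
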
@@ -8048,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8048 mutex_lock(&root->fs_info->chunk_mutex); 8149 mutex_lock(&root->fs_info->chunk_mutex);
8049 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8150 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8050 u64 min_free = btrfs_block_group_used(&block_group->item); 8151 u64 min_free = btrfs_block_group_used(&block_group->item);
8051 u64 dev_offset, max_avail; 8152 u64 dev_offset;
8052 8153
8053 /* 8154 /*
8054 * check to make sure we can actually find a chunk with enough 8155 * check to make sure we can actually find a chunk with enough
@@ -8056,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8056 */ 8157 */
8057 if (device->total_bytes > device->bytes_used + min_free) { 8158 if (device->total_bytes > device->bytes_used + min_free) {
8058 ret = find_free_dev_extent(NULL, device, min_free, 8159 ret = find_free_dev_extent(NULL, device, min_free,
8059 &dev_offset, &max_avail); 8160 &dev_offset, NULL);
8060 if (!ret) 8161 if (!ret)
8061 break; 8162 break;
8062 ret = -1; 8163 ret = -1;
@@ -8247,7 +8348,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8247 break; 8348 break;
8248 if (ret != 0) 8349 if (ret != 0)
8249 goto error; 8350 goto error;
8250
8251 leaf = path->nodes[0]; 8351 leaf = path->nodes[0];
8252 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8352 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8253 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8353 cache = kzalloc(sizeof(*cache), GFP_NOFS);
@@ -8541,3 +8641,14 @@ out:
8541 btrfs_free_path(path); 8641 btrfs_free_path(path);
8542 return ret; 8642 return ret;
8543} 8643}
8644
8645int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8646{
8647 return unpin_extent_range(root, start, end);
8648}
8649
8650int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8651 u64 num_bytes)
8652{
8653 return btrfs_discard_extent(root, bytenr, num_bytes);
8654}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eac10e3260a9..2e993cf1766e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1828 bio_put(bio); 1828 bio_put(bio);
1829} 1829}
1830 1830
1831static struct bio * 1831struct bio *
1832extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1832btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1833 gfp_t gfp_flags) 1833 gfp_t gfp_flags)
1834{ 1834{
1835 struct bio *bio; 1835 struct bio *bio;
1836 1836
@@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1919 else 1919 else
1920 nr = bio_get_nr_vecs(bdev); 1920 nr = bio_get_nr_vecs(bdev);
1921 1921
1922 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1923 1923
1924 bio_add_page(bio, page, page_size, offset); 1924 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1925 bio->bi_end_io = end_io_func;
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2028 BUG_ON(extent_map_end(em) <= cur); 2028 BUG_ON(extent_map_end(em) <= cur);
2029 BUG_ON(end < cur); 2029 BUG_ON(end < cur);
2030 2030
2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2032 this_bio_flag = EXTENT_BIO_COMPRESSED; 2032 this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type);
2035 }
2033 2036
2034 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2037 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2035 cur_end = min(extent_map_end(em) - 1, end); 2038 cur_end = min(extent_map_end(em) - 1, end);
@@ -2901,21 +2904,53 @@ out:
2901int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2904int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2902 __u64 start, __u64 len, get_extent_t *get_extent) 2905 __u64 start, __u64 len, get_extent_t *get_extent)
2903{ 2906{
2904 int ret; 2907 int ret = 0;
2905 u64 off = start; 2908 u64 off = start;
2906 u64 max = start + len; 2909 u64 max = start + len;
2907 u32 flags = 0; 2910 u32 flags = 0;
2911 u32 found_type;
2912 u64 last;
2908 u64 disko = 0; 2913 u64 disko = 0;
2914 struct btrfs_key found_key;
2909 struct extent_map *em = NULL; 2915 struct extent_map *em = NULL;
2910 struct extent_state *cached_state = NULL; 2916 struct extent_state *cached_state = NULL;
2917 struct btrfs_path *path;
2918 struct btrfs_file_extent_item *item;
2911 int end = 0; 2919 int end = 0;
2912 u64 em_start = 0, em_len = 0; 2920 u64 em_start = 0, em_len = 0;
2913 unsigned long emflags; 2921 unsigned long emflags;
2914 ret = 0; 2922 int hole = 0;
2915 2923
2916 if (len == 0) 2924 if (len == 0)
2917 return -EINVAL; 2925 return -EINVAL;
2918 2926
2927 path = btrfs_alloc_path();
2928 if (!path)
2929 return -ENOMEM;
2930 path->leave_spinning = 1;
2931
2932 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2933 path, inode->i_ino, -1, 0);
2934 if (ret < 0) {
2935 btrfs_free_path(path);
2936 return ret;
2937 }
2938 WARN_ON(!ret);
2939 path->slots[0]--;
2940 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2941 struct btrfs_file_extent_item);
2942 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2943 found_type = btrfs_key_type(&found_key);
2944
2945 /* No extents, just return */
2946 if (found_key.objectid != inode->i_ino ||
2947 found_type != BTRFS_EXTENT_DATA_KEY) {
2948 btrfs_free_path(path);
2949 return 0;
2950 }
2951 last = found_key.offset;
2952 btrfs_free_path(path);
2953
2919 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2954 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2920 &cached_state, GFP_NOFS); 2955 &cached_state, GFP_NOFS);
2921 em = get_extent(inode, NULL, 0, off, max - off, 0); 2956 em = get_extent(inode, NULL, 0, off, max - off, 0);
@@ -2925,11 +2960,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2925 ret = PTR_ERR(em); 2960 ret = PTR_ERR(em);
2926 goto out; 2961 goto out;
2927 } 2962 }
2963
2928 while (!end) { 2964 while (!end) {
2965 hole = 0;
2929 off = em->start + em->len; 2966 off = em->start + em->len;
2930 if (off >= max) 2967 if (off >= max)
2931 end = 1; 2968 end = 1;
2932 2969
2970 if (em->block_start == EXTENT_MAP_HOLE) {
2971 hole = 1;
2972 goto next;
2973 }
2974
2933 em_start = em->start; 2975 em_start = em->start;
2934 em_len = em->len; 2976 em_len = em->len;
2935 2977
@@ -2939,8 +2981,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2939 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2981 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2940 end = 1; 2982 end = 1;
2941 flags |= FIEMAP_EXTENT_LAST; 2983 flags |= FIEMAP_EXTENT_LAST;
2942 } else if (em->block_start == EXTENT_MAP_HOLE) {
2943 flags |= FIEMAP_EXTENT_UNWRITTEN;
2944 } else if (em->block_start == EXTENT_MAP_INLINE) { 2984 } else if (em->block_start == EXTENT_MAP_INLINE) {
2945 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2985 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2946 FIEMAP_EXTENT_NOT_ALIGNED); 2986 FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2953,10 +2993,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2953 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2993 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2954 flags |= FIEMAP_EXTENT_ENCODED; 2994 flags |= FIEMAP_EXTENT_ENCODED;
2955 2995
2996next:
2956 emflags = em->flags; 2997 emflags = em->flags;
2957 free_extent_map(em); 2998 free_extent_map(em);
2958 em = NULL; 2999 em = NULL;
2959
2960 if (!end) { 3000 if (!end) {
2961 em = get_extent(inode, NULL, 0, off, max - off, 0); 3001 em = get_extent(inode, NULL, 0, off, max - off, 0);
2962 if (!em) 3002 if (!em)
@@ -2967,15 +3007,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2967 } 3007 }
2968 emflags = em->flags; 3008 emflags = em->flags;
2969 } 3009 }
3010
2970 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3011 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2971 flags |= FIEMAP_EXTENT_LAST; 3012 flags |= FIEMAP_EXTENT_LAST;
2972 end = 1; 3013 end = 1;
2973 } 3014 }
2974 3015
2975 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3016 if (em_start == last) {
2976 em_len, flags); 3017 flags |= FIEMAP_EXTENT_LAST;
2977 if (ret) 3018 end = 1;
2978 goto out_free; 3019 }
3020
3021 if (!hole) {
3022 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3023 em_len, flags);
3024 if (ret)
3025 goto out_free;
3026 }
2979 } 3027 }
2980out_free: 3028out_free:
2981 free_extent_map(em); 3029 free_extent_map(em);
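
The fiemap rework above does two things: it looks up the file's last extent first so the final mapping can be flagged FIEMAP_EXTENT_LAST reliably, and it skips holes entirely rather than reporting them as FIEMAP_EXTENT_UNWRITTEN, which was a misuse of that flag (it means preallocated-but-unwritten, not absent). A minimal userspace walker for checking the result; FS_IOC_FIEMAP and the structures come from the standard uapi headers:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
        struct fiemap *fm;
        unsigned int i, count = 32;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
        if (!fm)
                return 1;
        fm->fm_start = 0;
        fm->fm_length = ~0ULL;          /* map the whole file */
        fm->fm_extent_count = count;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                perror("FS_IOC_FIEMAP");
                return 1;
        }

        /* holes now produce no record at all; the last record carries
         * FIEMAP_EXTENT_LAST */
        for (i = 0; i < fm->fm_mapped_extents; i++)
                printf("logical %llu len %llu flags %#x\n",
                       (unsigned long long)fm->fm_extents[i].fe_logical,
                       (unsigned long long)fm->fm_extents[i].fe_length,
                       fm->fm_extents[i].fe_flags);

        free(fm);
        close(fd);
        return 0;
}
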
@@ -3027,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3027#endif 3075#endif
3028 3076
3029 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3077 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3078 if (eb == NULL)
3079 return NULL;
3030 eb->start = start; 3080 eb->start = start;
3031 eb->len = len; 3081 eb->len = len;
3032 spin_lock_init(&eb->lock); 3082 spin_lock_init(&eb->lock);
@@ -3836,8 +3886,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3836 3886
3837 spin_lock(&tree->buffer_lock); 3887 spin_lock(&tree->buffer_lock);
3838 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3888 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3839 if (!eb) 3889 if (!eb) {
3840 goto out; 3890 spin_unlock(&tree->buffer_lock);
3891 return ret;
3892 }
3841 3893
3842 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3894 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3843 ret = 0; 3895 ret = 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1c6d4f342ef7..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 22
23/* flags for bio submission */ 23/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
24#define EXTENT_BIO_COMPRESSED 1 27#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
25 29
26/* these are bit numbers for test/set bit */ 30/* these are bit numbers for test/set bit */
27#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
135 wait_queue_head_t lock_wq; 139 wait_queue_head_t lock_wq;
136}; 140};
137 141
142static inline void extent_set_compress_type(unsigned long *bio_flags,
143 int compress_type)
144{
145 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
146}
147
148static inline int extent_compress_type(unsigned long bio_flags)
149{
150 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
151}
152
138struct extent_map_tree; 153struct extent_map_tree;
139 154
140static inline struct extent_state *extent_state_next(struct extent_state *state) 155static inline struct extent_state *extent_state_next(struct extent_state *state)
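
The inline helpers above make room for more than one compression algorithm in the bio flags word: the low bits keep the existing EXTENT_BIO_COMPRESSED marker and the compression type rides in the bits above EXTENT_BIO_FLAG_SHIFT. A self-contained illustration of the packing; the enum values mirror the BTRFS_COMPRESS_* numbering this series introduces:

#include <stdio.h>

#define EXTENT_BIO_COMPRESSED   1
#define EXTENT_BIO_FLAG_SHIFT   16

enum { COMPRESS_NONE = 0, COMPRESS_ZLIB = 1, COMPRESS_LZO = 2 };

int main(void)
{
        unsigned long bio_flags = EXTENT_BIO_COMPRESSED;

        /* extent_set_compress_type(): the type lives above the shift */
        bio_flags |= (unsigned long)COMPRESS_LZO << EXTENT_BIO_FLAG_SHIFT;

        /* extent_compress_type(): recover it without touching low flags */
        printf("compressed=%lu type=%lu\n",
               bio_flags & EXTENT_BIO_COMPRESSED,
               bio_flags >> EXTENT_BIO_FLAG_SHIFT);
        return 0;
}
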
@@ -310,4 +325,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
310 struct extent_io_tree *tree, 325 struct extent_io_tree *tree,
311 u64 start, u64 end, struct page *locked_page, 326 u64 start, u64 end, struct page *locked_page,
312 unsigned long op); 327 unsigned long op);
328struct bio *
329btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
330 gfp_t gfp_flags);
313#endif 331#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/hardirq.h> 5#include <linux/hardirq.h>
6#include "ctree.h"
6#include "extent_map.h" 7#include "extent_map.h"
7 8
8 9
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
54 return em; 55 return em;
55 em->in_tree = 0; 56 em->in_tree = 0;
56 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE;
57 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
58 return em; 60 return em;
59} 61}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 unsigned long flags; 26 unsigned long flags;
27 struct block_device *bdev; 27 struct block_device *bdev;
28 atomic_t refs; 28 atomic_t refs;
29 int in_tree; 29 unsigned int in_tree:1;
30 unsigned int compress_type:4;
30}; 31};
31 32
32struct extent_map_tree { 33struct extent_map_tree {
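
The extent_map change above narrows in_tree to a single bit and adds a 4-bit compress_type, room for 16 compression algorithms, without growing the struct. The packing in isolation:

#include <stdio.h>

struct em_like {                        /* shape of the fields above */
        unsigned int in_tree:1;
        unsigned int compress_type:4;   /* 16 possible compression types */
};

int main(void)
{
        struct em_like em = { .in_tree = 1, .compress_type = 2 /* lzo id */ };

        printf("in_tree=%u compress_type=%u sizeof=%zu\n",
               em.in_tree, em.compress_type, sizeof(em));
        return 0;
}
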
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df082..c800d58f3013 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -48,30 +49,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 struct page **prepared_pages, 49 struct page **prepared_pages,
49 struct iov_iter *i) 50 struct iov_iter *i)
50{ 51{
51 size_t copied; 52 size_t copied = 0;
52 int pg = 0; 53 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55 int total_copied = 0;
54 56
55 while (write_bytes > 0) { 57 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 59 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[pg]; 60 struct page *page = prepared_pages[pg];
59again: 61 /*
60 if (unlikely(iov_iter_fault_in_readable(i, count))) 62 * Copy data from userspace to the current page
61 return -EFAULT; 63 *
62 64 * Disable pagefault to avoid recursive lock since
63 /* Copy data from userspace to the current page */ 65 * the pages are already locked
64 copied = iov_iter_copy_from_user(page, i, offset, count); 66 */
67 pagefault_disable();
68 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
69 pagefault_enable();
65 70
66 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
67 flush_dcache_page(page); 72 flush_dcache_page(page);
68 iov_iter_advance(i, copied); 73 iov_iter_advance(i, copied);
69 write_bytes -= copied; 74 write_bytes -= copied;
75 total_copied += copied;
70 76
77 /* Return to btrfs_file_aio_write to fault page */
71 if (unlikely(copied == 0)) { 78 if (unlikely(copied == 0)) {
72 count = min_t(size_t, PAGE_CACHE_SIZE - offset, 79 break;
73 iov_iter_single_seg_count(i));
74 goto again;
75 } 80 }
76 81
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 82 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +86,7 @@ again:
81 offset = 0; 86 offset = 0;
82 } 87 }
83 } 88 }
84 return 0; 89 return total_copied;
85} 90}
86 91
87/* 92/*
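
The rewritten btrfs_copy_from_user() above copies with page faults disabled, because the destination pages are already locked and faulting inside the copy could recurse into the same locks. Instead of retrying in place, it returns however many bytes it managed so the caller can fault the source back in and come around again; the pre-faulting half of that contract appears further down in btrfs_file_aio_write(). A userspace analogue of the short-copy accounting, where copy_atomic() stands in for iov_iter_copy_from_user_atomic():

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

static size_t resident = PAGE_SZ;       /* pretend only page 0 is faulted in */

/* stand-in for iov_iter_copy_from_user_atomic(): it cannot fault, so it
 * copies at most what is resident and may return 0 */
static size_t copy_atomic(char *dst, const char *src, size_t want)
{
        size_t n = want < resident ? want : resident;

        memcpy(dst, src, n);
        resident -= n;
        return n;
}

int main(void)
{
        static char src[2 * PAGE_SZ], dst[2 * PAGE_SZ];
        size_t write_bytes = sizeof(src), total_copied = 0;

        memset(src, 'x', sizeof(src));
        while (write_bytes > 0) {
                size_t count = write_bytes < PAGE_SZ ? write_bytes : PAGE_SZ;
                size_t copied = copy_atomic(dst + total_copied,
                                            src + total_copied, count);

                total_copied += copied;
                write_bytes -= copied;
                if (copied == 0)
                        break;  /* caller re-faults the page and retries */
        }
        printf("copied %zu of %zu\n", total_copied, sizeof(src));
        return 0;
}
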
@@ -220,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
220 225
221 split->bdev = em->bdev; 226 split->bdev = em->bdev;
222 split->flags = flags; 227 split->flags = flags;
228 split->compress_type = em->compress_type;
223 ret = add_extent_mapping(em_tree, split); 229 ret = add_extent_mapping(em_tree, split);
224 BUG_ON(ret); 230 BUG_ON(ret);
225 free_extent_map(split); 231 free_extent_map(split);
@@ -234,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
234 split->len = em->start + em->len - (start + len); 240 split->len = em->start + em->len - (start + len);
235 split->bdev = em->bdev; 241 split->bdev = em->bdev;
236 split->flags = flags; 242 split->flags = flags;
243 split->compress_type = em->compress_type;
237 244
238 if (compressed) { 245 if (compressed) {
239 split->block_len = em->block_len; 246 split->block_len = em->block_len;
@@ -854,6 +861,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
854 unsigned long last_index; 861 unsigned long last_index;
855 int will_write; 862 int will_write;
856 int buffered = 0; 863 int buffered = 0;
864 int copied = 0;
865 int dirty_pages = 0;
857 866
858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 867 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
859 (file->f_flags & O_DIRECT)); 868 (file->f_flags & O_DIRECT));
@@ -884,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
884 if (err) 893 if (err)
885 goto out; 894 goto out;
886 895
896 /*
897 * If BTRFS flips readonly due to some impossible error
898 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
899 * although we have opened a file as writable, we have
900 * to stop this write operation to ensure FS consistency.
901 */
902 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
903 err = -EROFS;
904 goto out;
905 }
906
887 file_update_time(file); 907 file_update_time(file);
888 BTRFS_I(inode)->sequence++; 908 BTRFS_I(inode)->sequence++;
889 909
@@ -970,7 +990,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
970 WARN_ON(num_pages > nrptrs); 990 WARN_ON(num_pages > nrptrs);
971 memset(pages, 0, sizeof(struct page *) * nrptrs); 991 memset(pages, 0, sizeof(struct page *) * nrptrs);
972 992
973 ret = btrfs_delalloc_reserve_space(inode, write_bytes); 993 /*
994 * Fault pages before locking them in prepare_pages
995 * to avoid recursive lock
996 */
997 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
998 ret = -EFAULT;
999 goto out;
1000 }
1001
1002 ret = btrfs_delalloc_reserve_space(inode,
1003 num_pages << PAGE_CACHE_SHIFT);
974 if (ret) 1004 if (ret)
975 goto out; 1005 goto out;
976 1006
@@ -978,37 +1008,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
978 pos, first_index, last_index, 1008 pos, first_index, last_index,
979 write_bytes); 1009 write_bytes);
980 if (ret) { 1010 if (ret) {
981 btrfs_delalloc_release_space(inode, write_bytes); 1011 btrfs_delalloc_release_space(inode,
1012 num_pages << PAGE_CACHE_SHIFT);
982 goto out; 1013 goto out;
983 } 1014 }
984 1015
985 ret = btrfs_copy_from_user(pos, num_pages, 1016 copied = btrfs_copy_from_user(pos, num_pages,
986 write_bytes, pages, &i); 1017 write_bytes, pages, &i);
987 if (ret == 0) { 1018 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
1019 PAGE_CACHE_SHIFT;
1020
1021 if (num_pages > dirty_pages) {
1022 if (copied > 0)
1023 atomic_inc(
1024 &BTRFS_I(inode)->outstanding_extents);
1025 btrfs_delalloc_release_space(inode,
1026 (num_pages - dirty_pages) <<
1027 PAGE_CACHE_SHIFT);
1028 }
1029
1030 if (copied > 0) {
988 dirty_and_release_pages(NULL, root, file, pages, 1031 dirty_and_release_pages(NULL, root, file, pages,
989 num_pages, pos, write_bytes); 1032 dirty_pages, pos, copied);
990 } 1033 }
991 1034
992 btrfs_drop_pages(pages, num_pages); 1035 btrfs_drop_pages(pages, num_pages);
993 if (ret) {
994 btrfs_delalloc_release_space(inode, write_bytes);
995 goto out;
996 }
997 1036
998 if (will_write) { 1037 if (copied > 0) {
999 filemap_fdatawrite_range(inode->i_mapping, pos, 1038 if (will_write) {
1000 pos + write_bytes - 1); 1039 filemap_fdatawrite_range(inode->i_mapping, pos,
1001 } else { 1040 pos + copied - 1);
1002 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1041 } else {
1003 num_pages); 1042 balance_dirty_pages_ratelimited_nr(
1004 if (num_pages < 1043 inode->i_mapping,
1005 (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1044 dirty_pages);
1006 btrfs_btree_balance_dirty(root, 1); 1045 if (dirty_pages <
1007 btrfs_throttle(root); 1046 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1047 btrfs_btree_balance_dirty(root, 1);
1048 btrfs_throttle(root);
1049 }
1008 } 1050 }
1009 1051
1010 pos += write_bytes; 1052 pos += copied;
1011 num_written += write_bytes; 1053 num_written += copied;
1012 1054
1013 cond_resched(); 1055 cond_resched();
1014 } 1056 }
@@ -1047,8 +1089,14 @@ out:
1047 1089
1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1090 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1049 trans = btrfs_start_transaction(root, 0); 1091 trans = btrfs_start_transaction(root, 0);
1092 if (IS_ERR(trans)) {
1093 num_written = PTR_ERR(trans);
1094 goto done;
1095 }
1096 mutex_lock(&inode->i_mutex);
1050 ret = btrfs_log_dentry_safe(trans, root, 1097 ret = btrfs_log_dentry_safe(trans, root,
1051 file->f_dentry); 1098 file->f_dentry);
1099 mutex_unlock(&inode->i_mutex);
1052 if (ret == 0) { 1100 if (ret == 0) {
1053 ret = btrfs_sync_log(trans, root); 1101 ret = btrfs_sync_log(trans, root);
1054 if (ret == 0) 1102 if (ret == 0)
@@ -1067,6 +1115,7 @@ out:
1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1115 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1068 } 1116 }
1069 } 1117 }
1118done:
1070 current->backing_dev_info = NULL; 1119 current->backing_dev_info = NULL;
1071 return num_written ? num_written : err; 1120 return num_written ? num_written : err;
1072} 1121}
@@ -1202,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1202 return 0; 1251 return 0;
1203} 1252}
1204 1253
1254static long btrfs_fallocate(struct file *file, int mode,
1255 loff_t offset, loff_t len)
1256{
1257 struct inode *inode = file->f_path.dentry->d_inode;
1258 struct extent_state *cached_state = NULL;
1259 u64 cur_offset;
1260 u64 last_byte;
1261 u64 alloc_start;
1262 u64 alloc_end;
1263 u64 alloc_hint = 0;
1264 u64 locked_end;
1265 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1266 struct extent_map *em;
1267 int ret;
1268
1269 alloc_start = offset & ~mask;
1270 alloc_end = (offset + len + mask) & ~mask;
1271
1272 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1273 if (mode & ~FALLOC_FL_KEEP_SIZE)
1274 return -EOPNOTSUPP;
1275
1276 /*
1277 * wait for ordered IO before we have any locks. We'll loop again
1278 * below with the locks held.
1279 */
1280 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1281
1282 mutex_lock(&inode->i_mutex);
1283 ret = inode_newsize_ok(inode, alloc_end);
1284 if (ret)
1285 goto out;
1286
1287 if (alloc_start > inode->i_size) {
1288 ret = btrfs_cont_expand(inode, alloc_start);
1289 if (ret)
1290 goto out;
1291 }
1292
1293 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1294 if (ret)
1295 goto out;
1296
1297 locked_end = alloc_end - 1;
1298 while (1) {
1299 struct btrfs_ordered_extent *ordered;
1300
1301 /* the extent lock is ordered inside the running
1302 * transaction
1303 */
1304 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1305 locked_end, 0, &cached_state, GFP_NOFS);
1306 ordered = btrfs_lookup_first_ordered_extent(inode,
1307 alloc_end - 1);
1308 if (ordered &&
1309 ordered->file_offset + ordered->len > alloc_start &&
1310 ordered->file_offset < alloc_end) {
1311 btrfs_put_ordered_extent(ordered);
1312 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1313 alloc_start, locked_end,
1314 &cached_state, GFP_NOFS);
1315 /*
1316 * we can't wait on the range with the transaction
1317 * running or with the extent lock held
1318 */
1319 btrfs_wait_ordered_range(inode, alloc_start,
1320 alloc_end - alloc_start);
1321 } else {
1322 if (ordered)
1323 btrfs_put_ordered_extent(ordered);
1324 break;
1325 }
1326 }
1327
1328 cur_offset = alloc_start;
1329 while (1) {
1330 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1331 alloc_end - cur_offset, 0);
1332 BUG_ON(IS_ERR(em) || !em);
1333 last_byte = min(extent_map_end(em), alloc_end);
1334 last_byte = (last_byte + mask) & ~mask;
1335 if (em->block_start == EXTENT_MAP_HOLE ||
1336 (cur_offset >= inode->i_size &&
1337 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1338 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1339 last_byte - cur_offset,
1340 1 << inode->i_blkbits,
1341 offset + len,
1342 &alloc_hint);
1343 if (ret < 0) {
1344 free_extent_map(em);
1345 break;
1346 }
1347 }
1348 free_extent_map(em);
1349
1350 cur_offset = last_byte;
1351 if (cur_offset >= alloc_end) {
1352 ret = 0;
1353 break;
1354 }
1355 }
1356 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1357 &cached_state, GFP_NOFS);
1358
1359 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1360out:
1361 mutex_unlock(&inode->i_mutex);
1362 return ret;
1363}
1364
1205const struct file_operations btrfs_file_operations = { 1365const struct file_operations btrfs_file_operations = {
1206 .llseek = generic_file_llseek, 1366 .llseek = generic_file_llseek,
1207 .read = do_sync_read, 1367 .read = do_sync_read,
@@ -1213,6 +1373,7 @@ const struct file_operations btrfs_file_operations = {
1213 .open = generic_file_open, 1373 .open = generic_file_open,
1214 .release = btrfs_release_file, 1374 .release = btrfs_release_file,
1215 .fsync = btrfs_sync_file, 1375 .fsync = btrfs_sync_file,
1376 .fallocate = btrfs_fallocate,
1216 .unlocked_ioctl = btrfs_ioctl, 1377 .unlocked_ioctl = btrfs_ioctl,
1217#ifdef CONFIG_COMPAT 1378#ifdef CONFIG_COMPAT
1218 .compat_ioctl = btrfs_ioctl, 1379 .compat_ioctl = btrfs_ioctl,
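
The new btrfs_fallocate() wires preallocation into the VFS through the file_operations table above: it rounds the range to sector boundaries, waits out ordered IO, then walks the extent maps and preallocates only the holes (and anything past i_size), honoring FALLOC_FL_KEEP_SIZE and rejecting every other mode bit. Exercising it from userspace is just the plain fallocate(2) call:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_CREAT | O_RDWR, 0644);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* preallocate 1 MiB without growing i_size; any mode bit other
         * than FALLOC_FL_KEEP_SIZE makes this implementation return
         * -EOPNOTSUPP */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
                perror("fallocate");

        close(fd);
        return 0;
}
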
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b8..60d684266959 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
290 (unsigned long long)BTRFS_I(inode)->generation, 290 (unsigned long long)BTRFS_I(inode)->generation,
291 (unsigned long long)generation, 291 (unsigned long long)generation,
292 (unsigned long long)block_group->key.objectid); 292 (unsigned long long)block_group->key.objectid);
293 goto out; 293 goto free_cache;
294 } 294 }
295 295
296 if (!num_entries) 296 if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
524 return 0; 524 return 0;
525 } 525 }
526 526
527 node = rb_first(&block_group->free_space_offset);
528 if (!node) {
529 iput(inode);
530 return 0;
531 }
532
527 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
528 filemap_write_and_wait(inode->i_mapping); 534 filemap_write_and_wait(inode->i_mapping);
529 btrfs_wait_ordered_range(inode, inode->i_size & 535 btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
543 */ 549 */
544 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
545 551
546 node = rb_first(&block_group->free_space_offset);
547 if (!node)
548 goto out_free;
549
550 /* 552 /*
551 * Lock all pages first so we can lock the extent safely. 553 * Lock all pages first so we can lock the extent safely.
552 * 554 *
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 558cac2dfa54..160b55b3e132 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 122 size_t cur_size = size;
123 size_t datasize; 123 size_t datasize;
124 unsigned long offset; 124 unsigned long offset;
125 int use_compress = 0; 125 int compress_type = BTRFS_COMPRESS_NONE;
126 126
127 if (compressed_size && compressed_pages) { 127 if (compressed_size && compressed_pages) {
128 use_compress = 1; 128 compress_type = root->fs_info->compress_type;
129 cur_size = compressed_size; 129 cur_size = compressed_size;
130 } 130 }
131 131
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 159 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 ptr = btrfs_file_extent_inline_start(ei); 160 ptr = btrfs_file_extent_inline_start(ei);
161 161
162 if (use_compress) { 162 if (compress_type != BTRFS_COMPRESS_NONE) {
163 struct page *cpage; 163 struct page *cpage;
164 int i = 0; 164 int i = 0;
165 while (compressed_size > 0) { 165 while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 compressed_size -= cur_size; 176 compressed_size -= cur_size;
177 } 177 }
178 btrfs_set_file_extent_compression(leaf, ei, 178 btrfs_set_file_extent_compression(leaf, ei,
179 BTRFS_COMPRESS_ZLIB); 179 compress_type);
180 } else { 180 } else {
181 page = find_get_page(inode->i_mapping, 181 page = find_get_page(inode->i_mapping,
182 start >> PAGE_CACHE_SHIFT); 182 start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
263 u64 compressed_size; 263 u64 compressed_size;
264 struct page **pages; 264 struct page **pages;
265 unsigned long nr_pages; 265 unsigned long nr_pages;
266 int compress_type;
266 struct list_head list; 267 struct list_head list;
267}; 268};
268 269
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
280 u64 start, u64 ram_size, 281 u64 start, u64 ram_size,
281 u64 compressed_size, 282 u64 compressed_size,
282 struct page **pages, 283 struct page **pages,
283 unsigned long nr_pages) 284 unsigned long nr_pages,
285 int compress_type)
284{ 286{
285 struct async_extent *async_extent; 287 struct async_extent *async_extent;
286 288
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
290 async_extent->compressed_size = compressed_size; 292 async_extent->compressed_size = compressed_size;
291 async_extent->pages = pages; 293 async_extent->pages = pages;
292 async_extent->nr_pages = nr_pages; 294 async_extent->nr_pages = nr_pages;
295 async_extent->compress_type = compress_type;
293 list_add_tail(&async_extent->list, &cow->extents); 296 list_add_tail(&async_extent->list, &cow->extents);
294 return 0; 297 return 0;
295} 298}
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
332 unsigned long max_uncompressed = 128 * 1024; 335 unsigned long max_uncompressed = 128 * 1024;
333 int i; 336 int i;
334 int will_compress; 337 int will_compress;
338 int compress_type = root->fs_info->compress_type;
335 339
336 actual_end = min_t(u64, isize, end + 1); 340 actual_end = min_t(u64, isize, end + 1);
337again: 341again:
@@ -381,12 +385,16 @@ again:
381 WARN_ON(pages); 385 WARN_ON(pages);
382 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
383 387
384 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 388 if (BTRFS_I(inode)->force_compress)
385 total_compressed, pages, 389 compress_type = BTRFS_I(inode)->force_compress;
386 nr_pages, &nr_pages_ret, 390
387 &total_in, 391 ret = btrfs_compress_pages(compress_type,
388 &total_compressed, 392 inode->i_mapping, start,
389 max_compressed); 393 total_compressed, pages,
394 nr_pages, &nr_pages_ret,
395 &total_in,
396 &total_compressed,
397 max_compressed);
390 398
391 if (!ret) { 399 if (!ret) {
392 unsigned long offset = total_compressed & 400 unsigned long offset = total_compressed &
@@ -493,9 +501,10 @@ again:
493 * and will submit them to the elevator. 501 * and will submit them to the elevator.
494 */ 502 */
495 add_async_extent(async_cow, start, num_bytes, 503 add_async_extent(async_cow, start, num_bytes,
496 total_compressed, pages, nr_pages_ret); 504 total_compressed, pages, nr_pages_ret,
505 compress_type);
497 506
498 if (start + num_bytes < end && start + num_bytes < actual_end) { 507 if (start + num_bytes < end) {
499 start += num_bytes; 508 start += num_bytes;
500 pages = NULL; 509 pages = NULL;
501 cond_resched(); 510 cond_resched();
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
515 __set_page_dirty_nobuffers(locked_page); 524 __set_page_dirty_nobuffers(locked_page);
516 /* unlocked later on in the async handlers */ 525 /* unlocked later on in the async handlers */
517 } 526 }
518 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 527 add_async_extent(async_cow, start, end - start + 1,
528 0, NULL, 0, BTRFS_COMPRESS_NONE);
519 *num_added += 1; 529 *num_added += 1;
520 } 530 }
521 531
@@ -640,6 +650,7 @@ retry:
640 em->block_start = ins.objectid; 650 em->block_start = ins.objectid;
641 em->block_len = ins.offset; 651 em->block_len = ins.offset;
642 em->bdev = root->fs_info->fs_devices->latest_bdev; 652 em->bdev = root->fs_info->fs_devices->latest_bdev;
653 em->compress_type = async_extent->compress_type;
643 set_bit(EXTENT_FLAG_PINNED, &em->flags); 654 set_bit(EXTENT_FLAG_PINNED, &em->flags);
644 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 655 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
645 656
@@ -656,11 +667,13 @@ retry:
656 async_extent->ram_size - 1, 0); 667 async_extent->ram_size - 1, 0);
657 } 668 }
658 669
659 ret = btrfs_add_ordered_extent(inode, async_extent->start, 670 ret = btrfs_add_ordered_extent_compress(inode,
660 ins.objectid, 671 async_extent->start,
661 async_extent->ram_size, 672 ins.objectid,
662 ins.offset, 673 async_extent->ram_size,
663 BTRFS_ORDERED_COMPRESSED); 674 ins.offset,
675 BTRFS_ORDERED_COMPRESSED,
676 async_extent->compress_type);
664 BUG_ON(ret); 677 BUG_ON(ret);
665 678
666 /* 679 /*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1670 struct btrfs_ordered_extent *ordered_extent = NULL; 1683 struct btrfs_ordered_extent *ordered_extent = NULL;
1671 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1684 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1672 struct extent_state *cached_state = NULL; 1685 struct extent_state *cached_state = NULL;
1673 int compressed = 0; 1686 int compress_type = 0;
1674 int ret; 1687 int ret;
1675 bool nolock = false; 1688 bool nolock = false;
1676 1689
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1712 1725
1713 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1726 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1714 compressed = 1; 1727 compress_type = ordered_extent->compress_type;
1715 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1728 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1716 BUG_ON(compressed); 1729 BUG_ON(compress_type);
1717 ret = btrfs_mark_extent_written(trans, inode, 1730 ret = btrfs_mark_extent_written(trans, inode,
1718 ordered_extent->file_offset, 1731 ordered_extent->file_offset,
1719 ordered_extent->file_offset + 1732 ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1727 ordered_extent->disk_len, 1740 ordered_extent->disk_len,
1728 ordered_extent->len, 1741 ordered_extent->len,
1729 ordered_extent->len, 1742 ordered_extent->len,
1730 compressed, 0, 0, 1743 compress_type, 0, 0,
1731 BTRFS_FILE_EXTENT_REG); 1744 BTRFS_FILE_EXTENT_REG);
1732 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1745 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1733 ordered_extent->file_offset, 1746 ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1829 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1842 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1830 logical = em->block_start; 1843 logical = em->block_start;
1831 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1844 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1845 extent_set_compress_type(&failrec->bio_flags,
1846 em->compress_type);
1832 } 1847 }
1833 failrec->logical = logical; 1848 failrec->logical = logical;
1834 free_extent_map(em); 1849 free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3671static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3686static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3672{ 3687{
3673 struct inode *inode = dentry->d_inode; 3688 struct inode *inode = dentry->d_inode;
3689 struct btrfs_root *root = BTRFS_I(inode)->root;
3674 int err; 3690 int err;
3675 3691
3692 if (btrfs_root_readonly(root))
3693 return -EROFS;
3694
3676 err = inode_change_ok(inode, attr); 3695 err = inode_change_ok(inode, attr);
3677 if (err) 3696 if (err)
3678 return err; 3697 return err;
@@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4084 int index; 4103 int index;
4085 int ret; 4104 int ret;
4086 4105
4087 dentry->d_op = &btrfs_dentry_operations;
4088
4089 if (dentry->d_name.len > BTRFS_NAME_LEN) 4106 if (dentry->d_name.len > BTRFS_NAME_LEN)
4090 return ERR_PTR(-ENAMETOOLONG); 4107 return ERR_PTR(-ENAMETOOLONG);
4091 4108
@@ -4127,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4127 return inode; 4144 return inode;
4128} 4145}
4129 4146
4130static int btrfs_dentry_delete(struct dentry *dentry) 4147static int btrfs_dentry_delete(const struct dentry *dentry)
4131{ 4148{
4132 struct btrfs_root *root; 4149 struct btrfs_root *root;
4133 4150
@@ -4501,6 +4518,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4501 BTRFS_I(inode)->index_cnt = 2; 4518 BTRFS_I(inode)->index_cnt = 2;
4502 BTRFS_I(inode)->root = root; 4519 BTRFS_I(inode)->root = root;
4503 BTRFS_I(inode)->generation = trans->transid; 4520 BTRFS_I(inode)->generation = trans->transid;
4521 inode->i_generation = BTRFS_I(inode)->generation;
4504 btrfs_set_inode_space_info(root, inode); 4522 btrfs_set_inode_space_info(root, inode);
4505 4523
4506 if (mode & S_IFDIR) 4524 if (mode & S_IFDIR)
@@ -4622,12 +4640,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4622} 4640}
4623 4641
4624static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4642static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4625 struct dentry *dentry, struct inode *inode, 4643 struct inode *dir, struct dentry *dentry,
4626 int backref, u64 index) 4644 struct inode *inode, int backref, u64 index)
4627{ 4645{
4628 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4646 int err = btrfs_add_link(trans, dir, inode,
4629 inode, dentry->d_name.name, 4647 dentry->d_name.name, dentry->d_name.len,
4630 dentry->d_name.len, backref, index); 4648 backref, index);
4631 if (!err) { 4649 if (!err) {
4632 d_instantiate(dentry, inode); 4650 d_instantiate(dentry, inode);
4633 return 0; 4651 return 0;
@@ -4668,8 +4686,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4668 btrfs_set_trans_block_group(trans, dir); 4686 btrfs_set_trans_block_group(trans, dir);
4669 4687
4670 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4688 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4671 dentry->d_name.len, 4689 dentry->d_name.len, dir->i_ino, objectid,
4672 dentry->d_parent->d_inode->i_ino, objectid,
4673 BTRFS_I(dir)->block_group, mode, &index); 4690 BTRFS_I(dir)->block_group, mode, &index);
4674 err = PTR_ERR(inode); 4691 err = PTR_ERR(inode);
4675 if (IS_ERR(inode)) 4692 if (IS_ERR(inode))
@@ -4682,7 +4699,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4682 } 4699 }
4683 4700
4684 btrfs_set_trans_block_group(trans, inode); 4701 btrfs_set_trans_block_group(trans, inode);
4685 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4702 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4686 if (err) 4703 if (err)
4687 drop_inode = 1; 4704 drop_inode = 1;
4688 else { 4705 else {
@@ -4730,10 +4747,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4730 btrfs_set_trans_block_group(trans, dir); 4747 btrfs_set_trans_block_group(trans, dir);
4731 4748
4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4749 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4733 dentry->d_name.len, 4750 dentry->d_name.len, dir->i_ino, objectid,
4734 dentry->d_parent->d_inode->i_ino, 4751 BTRFS_I(dir)->block_group, mode, &index);
4735 objectid, BTRFS_I(dir)->block_group, mode,
4736 &index);
4737 err = PTR_ERR(inode); 4752 err = PTR_ERR(inode);
4738 if (IS_ERR(inode)) 4753 if (IS_ERR(inode))
4739 goto out_unlock; 4754 goto out_unlock;
@@ -4745,7 +4760,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4745 } 4760 }
4746 4761
4747 btrfs_set_trans_block_group(trans, inode); 4762 btrfs_set_trans_block_group(trans, inode);
4748 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4763 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4749 if (err) 4764 if (err)
4750 drop_inode = 1; 4765 drop_inode = 1;
4751 else { 4766 else {
@@ -4787,6 +4802,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4787 return -EPERM; 4802 return -EPERM;
4788 4803
4789 btrfs_inc_nlink(inode); 4804 btrfs_inc_nlink(inode);
4805 inode->i_ctime = CURRENT_TIME;
4790 4806
4791 err = btrfs_set_inode_index(dir, &index); 4807 err = btrfs_set_inode_index(dir, &index);
4792 if (err) 4808 if (err)
@@ -4805,15 +4821,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4805 btrfs_set_trans_block_group(trans, dir); 4821 btrfs_set_trans_block_group(trans, dir);
4806 ihold(inode); 4822 ihold(inode);
4807 4823
4808 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4824 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4809 4825
4810 if (err) { 4826 if (err) {
4811 drop_inode = 1; 4827 drop_inode = 1;
4812 } else { 4828 } else {
4829 struct dentry *parent = dget_parent(dentry);
4813 btrfs_update_inode_block_group(trans, dir); 4830 btrfs_update_inode_block_group(trans, dir);
4814 err = btrfs_update_inode(trans, root, inode); 4831 err = btrfs_update_inode(trans, root, inode);
4815 BUG_ON(err); 4832 BUG_ON(err);
4816 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4833 btrfs_log_new_name(trans, inode, NULL, parent);
4834 dput(parent);
4817 } 4835 }
4818 4836
4819 nr = trans->blocks_used; 4837 nr = trans->blocks_used;
@@ -4853,8 +4871,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4853 btrfs_set_trans_block_group(trans, dir); 4871 btrfs_set_trans_block_group(trans, dir);
4854 4872
4855 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4873 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4856 dentry->d_name.len, 4874 dentry->d_name.len, dir->i_ino, objectid,
4857 dentry->d_parent->d_inode->i_ino, objectid,
4858 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4875 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4859 &index); 4876 &index);
4860 if (IS_ERR(inode)) { 4877 if (IS_ERR(inode)) {
@@ -4877,9 +4894,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4877 if (err) 4894 if (err)
4878 goto out_fail; 4895 goto out_fail;
4879 4896
4880 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4897 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4881 inode, dentry->d_name.name, 4898 dentry->d_name.len, 0, index);
4882 dentry->d_name.len, 0, index);
4883 if (err) 4899 if (err)
4884 goto out_fail; 4900 goto out_fail;
4885 4901
@@ -4931,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4931 size_t max_size; 4947 size_t max_size;
4932 unsigned long inline_size; 4948 unsigned long inline_size;
4933 unsigned long ptr; 4949 unsigned long ptr;
4950 int compress_type;
4934 4951
4935 WARN_ON(pg_offset != 0); 4952 WARN_ON(pg_offset != 0);
4953 compress_type = btrfs_file_extent_compression(leaf, item);
4936 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4954 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4937 inline_size = btrfs_file_extent_inline_item_len(leaf, 4955 inline_size = btrfs_file_extent_inline_item_len(leaf,
4938 btrfs_item_nr(leaf, path->slots[0])); 4956 btrfs_item_nr(leaf, path->slots[0]));
@@ -4942,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4942 read_extent_buffer(leaf, tmp, ptr, inline_size); 4960 read_extent_buffer(leaf, tmp, ptr, inline_size);
4943 4961
4944 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4962 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4945 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4963 ret = btrfs_decompress(compress_type, tmp, page,
4946 inline_size, max_size); 4964 extent_offset, inline_size, max_size);
4947 if (ret) { 4965 if (ret) {
4948 char *kaddr = kmap_atomic(page, KM_USER0); 4966 char *kaddr = kmap_atomic(page, KM_USER0);
4949 unsigned long copy_size = min_t(u64, 4967 unsigned long copy_size = min_t(u64,
@@ -4985,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4985 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5003 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4986 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5004 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4987 struct btrfs_trans_handle *trans = NULL; 5005 struct btrfs_trans_handle *trans = NULL;
4988 int compressed; 5006 int compress_type;
4989 5007
4990again: 5008again:
4991 read_lock(&em_tree->lock); 5009 read_lock(&em_tree->lock);
@@ -5044,7 +5062,7 @@ again:
5044 5062
5045 found_type = btrfs_file_extent_type(leaf, item); 5063 found_type = btrfs_file_extent_type(leaf, item);
5046 extent_start = found_key.offset; 5064 extent_start = found_key.offset;
5047 compressed = btrfs_file_extent_compression(leaf, item); 5065 compress_type = btrfs_file_extent_compression(leaf, item);
5048 if (found_type == BTRFS_FILE_EXTENT_REG || 5066 if (found_type == BTRFS_FILE_EXTENT_REG ||
5049 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5067 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5050 extent_end = extent_start + 5068 extent_end = extent_start +
@@ -5090,8 +5108,9 @@ again:
5090 em->block_start = EXTENT_MAP_HOLE; 5108 em->block_start = EXTENT_MAP_HOLE;
5091 goto insert; 5109 goto insert;
5092 } 5110 }
5093 if (compressed) { 5111 if (compress_type != BTRFS_COMPRESS_NONE) {
5094 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5112 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5113 em->compress_type = compress_type;
5095 em->block_start = bytenr; 5114 em->block_start = bytenr;
5096 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5115 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5097 item); 5116 item);
@@ -5125,12 +5144,14 @@ again:
5125 em->len = (copy_size + root->sectorsize - 1) & 5144 em->len = (copy_size + root->sectorsize - 1) &
5126 ~((u64)root->sectorsize - 1); 5145 ~((u64)root->sectorsize - 1);
5127 em->orig_start = EXTENT_MAP_INLINE; 5146 em->orig_start = EXTENT_MAP_INLINE;
5128 if (compressed) 5147 if (compress_type) {
5129 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5148 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5149 em->compress_type = compress_type;
5150 }
5130 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5151 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5131 if (create == 0 && !PageUptodate(page)) { 5152 if (create == 0 && !PageUptodate(page)) {
5132 if (btrfs_file_extent_compression(leaf, item) == 5153 if (btrfs_file_extent_compression(leaf, item) !=
5133 BTRFS_COMPRESS_ZLIB) { 5154 BTRFS_COMPRESS_NONE) {
5134 ret = uncompress_inline(path, inode, page, 5155 ret = uncompress_inline(path, inode, page,
5135 pg_offset, 5156 pg_offset,
5136 extent_offset, item); 5157 extent_offset, item);
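These hunks widen the read path from a zlib-only boolean ("compressed") to the compress_type carried in the file extent item and propagated into the extent_map, so btrfs_decompress() can select the matching implementation. A tiny illustrative helper, hypothetical but using the constants the patch dispatches on:

    /* hypothetical debugging helper showing the set of on-disk
     * compression types the new code distinguishes */
    static const char *compress_type_name(int type)
    {
            switch (type) {
            case BTRFS_COMPRESS_NONE: return "none";
            case BTRFS_COMPRESS_ZLIB: return "zlib";
            case BTRFS_COMPRESS_LZO:  return "lzo";
            default:                  return "unknown";
            }
    }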
@@ -5535,13 +5556,21 @@ struct btrfs_dio_private {
5535 u64 bytes; 5556 u64 bytes;
5536 u32 *csums; 5557 u32 *csums;
5537 void *private; 5558 void *private;
5559
5560 /* number of bios pending for this dio */
5561 atomic_t pending_bios;
5562
5563 /* IO errors */
5564 int errors;
5565
5566 struct bio *orig_bio;
5538}; 5567};
5539 5568
5540static void btrfs_endio_direct_read(struct bio *bio, int err) 5569static void btrfs_endio_direct_read(struct bio *bio, int err)
5541{ 5570{
5571 struct btrfs_dio_private *dip = bio->bi_private;
5542 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5572 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5543 struct bio_vec *bvec = bio->bi_io_vec; 5573 struct bio_vec *bvec = bio->bi_io_vec;
5544 struct btrfs_dio_private *dip = bio->bi_private;
5545 struct inode *inode = dip->inode; 5574 struct inode *inode = dip->inode;
5546 struct btrfs_root *root = BTRFS_I(inode)->root; 5575 struct btrfs_root *root = BTRFS_I(inode)->root;
5547 u64 start; 5576 u64 start;
@@ -5595,15 +5624,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5595 struct btrfs_trans_handle *trans; 5624 struct btrfs_trans_handle *trans;
5596 struct btrfs_ordered_extent *ordered = NULL; 5625 struct btrfs_ordered_extent *ordered = NULL;
5597 struct extent_state *cached_state = NULL; 5626 struct extent_state *cached_state = NULL;
5627 u64 ordered_offset = dip->logical_offset;
5628 u64 ordered_bytes = dip->bytes;
5598 int ret; 5629 int ret;
5599 5630
5600 if (err) 5631 if (err)
5601 goto out_done; 5632 goto out_done;
5602 5633again:
5603 ret = btrfs_dec_test_ordered_pending(inode, &ordered, 5634 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5604 dip->logical_offset, dip->bytes); 5635 &ordered_offset,
5636 ordered_bytes);
5605 if (!ret) 5637 if (!ret)
5606 goto out_done; 5638 goto out_test;
5607 5639
5608 BUG_ON(!ordered); 5640 BUG_ON(!ordered);
5609 5641
@@ -5663,8 +5695,20 @@ out_unlock:
5663out: 5695out:
5664 btrfs_delalloc_release_metadata(inode, ordered->len); 5696 btrfs_delalloc_release_metadata(inode, ordered->len);
5665 btrfs_end_transaction(trans, root); 5697 btrfs_end_transaction(trans, root);
5698 ordered_offset = ordered->file_offset + ordered->len;
5666 btrfs_put_ordered_extent(ordered); 5699 btrfs_put_ordered_extent(ordered);
5667 btrfs_put_ordered_extent(ordered); 5700 btrfs_put_ordered_extent(ordered);
5701
5702out_test:
5703 /*
5704 * our bio might span multiple ordered extents. If we haven't
5705 * completed the accounting for the whole dio, go back and try again
5706 */
5707 if (ordered_offset < dip->logical_offset + dip->bytes) {
5708 ordered_bytes = dip->logical_offset + dip->bytes -
5709 ordered_offset;
5710 goto again;
5711 }
5668out_done: 5712out_done:
5669 bio->bi_private = dip->private; 5713 bio->bi_private = dip->private;
5670 5714
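The rewritten write-completion path above may be handed a bio that covers several ordered extents, so it loops: each pass completes at most one ordered extent and advances ordered_offset past it. A simplified sketch of that shape (finish_one_ordered() is hypothetical, and the kernel retries rather than breaking until the whole range is accounted):

    u64 off = dip->logical_offset;
    u64 end = dip->logical_offset + dip->bytes;

    while (off < end) {
            struct btrfs_ordered_extent *ordered = NULL;

            /* returns 1 with a referenced extent once that extent's
             * outstanding byte count reaches zero; advances "off"
             * past the range it accounted */
            if (!btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                      &off, end - off))
                    break;  /* simplified: nothing completed here */
            finish_one_ordered(ordered);            /* hypothetical */
            off = ordered->file_offset + ordered->len;
            btrfs_put_ordered_extent(ordered);
    }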
@@ -5684,6 +5728,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5684 return 0; 5728 return 0;
5685} 5729}
5686 5730
5731static void btrfs_end_dio_bio(struct bio *bio, int err)
5732{
5733 struct btrfs_dio_private *dip = bio->bi_private;
5734
5735 if (err) {
5736 printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
5737 "sector %#Lx len %u err no %d\n",
5738 dip->inode->i_ino, bio->bi_rw,
5739 (unsigned long long)bio->bi_sector, bio->bi_size, err);
5740 dip->errors = 1;
5741
5742 /*
5743 * before the atomic variable goes to zero, we must make sure
5744 * dip->errors is perceived to be set.
5745 */
5746 smp_mb__before_atomic_dec();
5747 }
5748
5749 /* if there are more bios still pending for this dio, just exit */
5750 if (!atomic_dec_and_test(&dip->pending_bios))
5751 goto out;
5752
5753 if (dip->errors)
5754 bio_io_error(dip->orig_bio);
5755 else {
5756 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5757 bio_endio(dip->orig_bio, 0);
5758 }
5759out:
5760 bio_put(bio);
5761}
5762
5763static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5764 u64 first_sector, gfp_t gfp_flags)
5765{
5766 int nr_vecs = bio_get_nr_vecs(bdev);
5767 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5768}
5769
5770static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5771 int rw, u64 file_offset, int skip_sum,
5772 u32 *csums)
5773{
5774 int write = rw & REQ_WRITE;
5775 struct btrfs_root *root = BTRFS_I(inode)->root;
5776 int ret;
5777
5778 bio_get(bio);
5779 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5780 if (ret)
5781 goto err;
5782
5783 if (write && !skip_sum) {
5784 ret = btrfs_wq_submit_bio(root->fs_info,
5785 inode, rw, bio, 0, 0,
5786 file_offset,
5787 __btrfs_submit_bio_start_direct_io,
5788 __btrfs_submit_bio_done);
5789 goto err;
5790 } else if (!skip_sum)
5791 btrfs_lookup_bio_sums_dio(root, inode, bio,
5792 file_offset, csums);
5793
5794 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5795err:
5796 bio_put(bio);
5797 return ret;
5798}
5799
5800static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5801 int skip_sum)
5802{
5803 struct inode *inode = dip->inode;
5804 struct btrfs_root *root = BTRFS_I(inode)->root;
5805 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5806 struct bio *bio;
5807 struct bio *orig_bio = dip->orig_bio;
5808 struct bio_vec *bvec = orig_bio->bi_io_vec;
5809 u64 start_sector = orig_bio->bi_sector;
5810 u64 file_offset = dip->logical_offset;
5811 u64 submit_len = 0;
5812 u64 map_length;
5813 int nr_pages = 0;
5814 u32 *csums = dip->csums;
5815 int ret = 0;
5816
5817 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5818 if (!bio)
5819 return -ENOMEM;
5820 bio->bi_private = dip;
5821 bio->bi_end_io = btrfs_end_dio_bio;
5822 atomic_inc(&dip->pending_bios);
5823
5824 map_length = orig_bio->bi_size;
5825 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5826 &map_length, NULL, 0);
5827 if (ret) {
5828 bio_put(bio);
5829 return -EIO;
5830 }
5831
5832 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5833 if (unlikely(map_length < submit_len + bvec->bv_len ||
5834 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5835 bvec->bv_offset) < bvec->bv_len)) {
5836 /*
5837 * inc the count before we submit the bio so
5838 * we know the end IO handler won't happen before
5839 * we inc the count. Otherwise, the dip might get freed
5840 * before we're done setting it up
5841 */
5842 atomic_inc(&dip->pending_bios);
5843 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5844 file_offset, skip_sum,
5845 csums);
5846 if (ret) {
5847 bio_put(bio);
5848 atomic_dec(&dip->pending_bios);
5849 goto out_err;
5850 }
5851
5852 if (!skip_sum)
5853 csums = csums + nr_pages;
5854 start_sector += submit_len >> 9;
5855 file_offset += submit_len;
5856
5857 submit_len = 0;
5858 nr_pages = 0;
5859
5860 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
5861 start_sector, GFP_NOFS);
5862 if (!bio)
5863 goto out_err;
5864 bio->bi_private = dip;
5865 bio->bi_end_io = btrfs_end_dio_bio;
5866
5867 map_length = orig_bio->bi_size;
5868 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5869 &map_length, NULL, 0);
5870 if (ret) {
5871 bio_put(bio);
5872 goto out_err;
5873 }
5874 } else {
5875 submit_len += bvec->bv_len;
5876 nr_pages++;
5877 bvec++;
5878 }
5879 }
5880
5881 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5882 csums);
5883 if (!ret)
5884 return 0;
5885
5886 bio_put(bio);
5887out_err:
5888 dip->errors = 1;
5889 /*
5890 * before the atomic variable goes to zero, we must
5891 * make sure dip->errors is perceived to be set.
5892 */
5893 smp_mb__before_atomic_dec();
5894 if (atomic_dec_and_test(&dip->pending_bios))
5895 bio_io_error(dip->orig_bio);
5896
5897 /* bio_end_io() will handle error, so we needn't return it */
5898 return 0;
5899}
5900
5687static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 5901static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5688 loff_t file_offset) 5902 loff_t file_offset)
5689{ 5903{
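btrfs_submit_direct_hook() splits the original DIO bio wherever btrfs_map_block() reports that the contiguous physical mapping (map_length) would be exceeded, submitting one sub-bio per mapped run. The dip stays alive through pending_bios: one count is taken up front to cover the final submission, one more before each intermediate submit, and btrfs_end_dio_bio() completes orig_bio when the count drops to zero. Reduced to pseudo-C (the condition and submit helpers are hypothetical):

    atomic_inc(&dip->pending_bios);  /* covers the last sub-bio */
    while (pages_remain()) {
            if (next_page_would_cross_map_length()) {
                    atomic_inc(&dip->pending_bios); /* this sub-bio */
                    submit_sub_bio(bio);
                    bio = start_new_sub_bio();
            } else {
                    add_page_to_bio(bio);
            }
    }
    submit_sub_bio(bio);  /* consumes the up-front count */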
@@ -5723,36 +5937,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5723 5937
5724 dip->disk_bytenr = (u64)bio->bi_sector << 9; 5938 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5725 bio->bi_private = dip; 5939 bio->bi_private = dip;
5940 dip->errors = 0;
5941 dip->orig_bio = bio;
5942 atomic_set(&dip->pending_bios, 0);
5726 5943
5727 if (write) 5944 if (write)
5728 bio->bi_end_io = btrfs_endio_direct_write; 5945 bio->bi_end_io = btrfs_endio_direct_write;
5729 else 5946 else
5730 bio->bi_end_io = btrfs_endio_direct_read; 5947 bio->bi_end_io = btrfs_endio_direct_read;
5731 5948
5732 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 5949 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
5733 if (ret) 5950 if (!ret)
5734 goto out_err;
5735
5736 if (write && !skip_sum) {
5737 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5738 inode, rw, bio, 0, 0,
5739 dip->logical_offset,
5740 __btrfs_submit_bio_start_direct_io,
5741 __btrfs_submit_bio_done);
5742 if (ret)
5743 goto out_err;
5744 return; 5951 return;
5745 } else if (!skip_sum)
5746 btrfs_lookup_bio_sums_dio(root, inode, bio,
5747 dip->logical_offset, dip->csums);
5748
5749 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5750 if (ret)
5751 goto out_err;
5752 return;
5753out_err:
5754 kfree(dip->csums);
5755 kfree(dip);
5756free_ordered: 5952free_ordered:
5757 /* 5953 /*
5758 * If this is a write, we need to clean up the reserved space and kill 5954 * If this is a write, we need to clean up the reserved space and kill
@@ -5760,8 +5956,7 @@ free_ordered:
5760 */ 5956 */
5761 if (write) { 5957 if (write) {
5762 struct btrfs_ordered_extent *ordered; 5958 struct btrfs_ordered_extent *ordered;
5763 ordered = btrfs_lookup_ordered_extent(inode, 5959 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
5764 dip->logical_offset);
5765 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 5960 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5766 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 5961 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5767 btrfs_free_reserved_extent(root, ordered->start, 5962 btrfs_free_reserved_extent(root, ordered->start,
@@ -6306,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6306 ei->ordered_data_close = 0; 6501 ei->ordered_data_close = 0;
6307 ei->orphan_meta_reserved = 0; 6502 ei->orphan_meta_reserved = 0;
6308 ei->dummy_inode = 0; 6503 ei->dummy_inode = 0;
6309 ei->force_compress = 0; 6504 ei->force_compress = BTRFS_COMPRESS_NONE;
6310 6505
6311 inode = &ei->vfs_inode; 6506 inode = &ei->vfs_inode;
6312 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6507 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -6322,6 +6517,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6322 return inode; 6517 return inode;
6323} 6518}
6324 6519
6520static void btrfs_i_callback(struct rcu_head *head)
6521{
6522 struct inode *inode = container_of(head, struct inode, i_rcu);
6523 INIT_LIST_HEAD(&inode->i_dentry);
6524 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6525}
6526
6325void btrfs_destroy_inode(struct inode *inode) 6527void btrfs_destroy_inode(struct inode *inode)
6326{ 6528{
6327 struct btrfs_ordered_extent *ordered; 6529 struct btrfs_ordered_extent *ordered;
@@ -6391,7 +6593,7 @@ void btrfs_destroy_inode(struct inode *inode)
6391 inode_tree_del(inode); 6593 inode_tree_del(inode);
6392 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6594 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6393free: 6595free:
6394 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6596 call_rcu(&inode->i_rcu, btrfs_i_callback);
6395} 6597}
6396 6598
6397int btrfs_drop_inode(struct inode *inode) 6599int btrfs_drop_inode(struct inode *inode)
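Freeing the in-core inode now goes through call_rcu(), so lockless (RCU-walk) path lookups that may still hold a pointer to the inode never see freed slab memory; btrfs_i_callback() runs after a grace period and returns the object to btrfs_inode_cachep. The generic kernel pattern, of which the hunk above is one instance:

    struct foo {
            struct rcu_head rcu;
            int data;
    };

    static void foo_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct foo, rcu));
    }

    static void foo_release(struct foo *f)
    {
            /* readers under rcu_read_lock() stay safe: the free runs
             * only after every current read-side section finishes */
            call_rcu(&f->rcu, foo_free_rcu);
    }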
@@ -6607,8 +6809,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6607 BUG_ON(ret); 6809 BUG_ON(ret);
6608 6810
6609 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 6811 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
6610 btrfs_log_new_name(trans, old_inode, old_dir, 6812 struct dentry *parent = dget_parent(new_dentry);
6611 new_dentry->d_parent); 6813 btrfs_log_new_name(trans, old_inode, old_dir, parent);
6814 dput(parent);
6612 btrfs_end_log_trans(root); 6815 btrfs_end_log_trans(root);
6613 } 6816 }
6614out_fail: 6817out_fail:
@@ -6758,8 +6961,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6758 btrfs_set_trans_block_group(trans, dir); 6961 btrfs_set_trans_block_group(trans, dir);
6759 6962
6760 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6963 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6761 dentry->d_name.len, 6964 dentry->d_name.len, dir->i_ino, objectid,
6762 dentry->d_parent->d_inode->i_ino, objectid,
6763 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 6965 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
6764 &index); 6966 &index);
6765 err = PTR_ERR(inode); 6967 err = PTR_ERR(inode);
@@ -6773,7 +6975,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6773 } 6975 }
6774 6976
6775 btrfs_set_trans_block_group(trans, inode); 6977 btrfs_set_trans_block_group(trans, inode);
6776 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 6978 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6777 if (err) 6979 if (err)
6778 drop_inode = 1; 6980 drop_inode = 1;
6779 else { 6981 else {
@@ -6844,6 +7046,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6844 struct btrfs_root *root = BTRFS_I(inode)->root; 7046 struct btrfs_root *root = BTRFS_I(inode)->root;
6845 struct btrfs_key ins; 7047 struct btrfs_key ins;
6846 u64 cur_offset = start; 7048 u64 cur_offset = start;
7049 u64 i_size;
6847 int ret = 0; 7050 int ret = 0;
6848 bool own_trans = true; 7051 bool own_trans = true;
6849 7052
@@ -6885,11 +7088,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6885 (actual_len > inode->i_size) && 7088 (actual_len > inode->i_size) &&
6886 (cur_offset > inode->i_size)) { 7089 (cur_offset > inode->i_size)) {
6887 if (cur_offset > actual_len) 7090 if (cur_offset > actual_len)
6888 i_size_write(inode, actual_len); 7091 i_size = actual_len;
6889 else 7092 else
6890 i_size_write(inode, cur_offset); 7093 i_size = cur_offset;
6891 i_size_write(inode, cur_offset); 7094 i_size_write(inode, i_size);
6892 btrfs_ordered_update_i_size(inode, cur_offset, NULL); 7095 btrfs_ordered_update_i_size(inode, i_size, NULL);
6893 } 7096 }
6894 7097
6895 ret = btrfs_update_inode(trans, root, inode); 7098 ret = btrfs_update_inode(trans, root, inode);
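Worth spelling out: in the removed lines the second, unconditional i_size_write(inode, cur_offset) clobbered the value just clamped to actual_len, and the ordered-size update then used cur_offset as well. Computing i_size once and feeding the same value to both i_size_write() and btrfs_ordered_update_i_size() is what actually fixes the over-extension.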
@@ -6919,118 +7122,20 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
6919 min_size, actual_len, alloc_hint, trans); 7122 min_size, actual_len, alloc_hint, trans);
6920} 7123}
6921 7124
6922static long btrfs_fallocate(struct inode *inode, int mode,
6923 loff_t offset, loff_t len)
6924{
6925 struct extent_state *cached_state = NULL;
6926 u64 cur_offset;
6927 u64 last_byte;
6928 u64 alloc_start;
6929 u64 alloc_end;
6930 u64 alloc_hint = 0;
6931 u64 locked_end;
6932 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
6933 struct extent_map *em;
6934 int ret;
6935
6936 alloc_start = offset & ~mask;
6937 alloc_end = (offset + len + mask) & ~mask;
6938
6939 /*
6940 * wait for ordered IO before we have any locks. We'll loop again
6941 * below with the locks held.
6942 */
6943 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
6944
6945 mutex_lock(&inode->i_mutex);
6946 if (alloc_start > inode->i_size) {
6947 ret = btrfs_cont_expand(inode, alloc_start);
6948 if (ret)
6949 goto out;
6950 }
6951
6952 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
6953 if (ret)
6954 goto out;
6955
6956 locked_end = alloc_end - 1;
6957 while (1) {
6958 struct btrfs_ordered_extent *ordered;
6959
6960 /* the extent lock is ordered inside the running
6961 * transaction
6962 */
6963 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
6964 locked_end, 0, &cached_state, GFP_NOFS);
6965 ordered = btrfs_lookup_first_ordered_extent(inode,
6966 alloc_end - 1);
6967 if (ordered &&
6968 ordered->file_offset + ordered->len > alloc_start &&
6969 ordered->file_offset < alloc_end) {
6970 btrfs_put_ordered_extent(ordered);
6971 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
6972 alloc_start, locked_end,
6973 &cached_state, GFP_NOFS);
6974 /*
6975 * we can't wait on the range with the transaction
6976 * running or with the extent lock held
6977 */
6978 btrfs_wait_ordered_range(inode, alloc_start,
6979 alloc_end - alloc_start);
6980 } else {
6981 if (ordered)
6982 btrfs_put_ordered_extent(ordered);
6983 break;
6984 }
6985 }
6986
6987 cur_offset = alloc_start;
6988 while (1) {
6989 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
6990 alloc_end - cur_offset, 0);
6991 BUG_ON(IS_ERR(em) || !em);
6992 last_byte = min(extent_map_end(em), alloc_end);
6993 last_byte = (last_byte + mask) & ~mask;
6994 if (em->block_start == EXTENT_MAP_HOLE ||
6995 (cur_offset >= inode->i_size &&
6996 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6997 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
6998 last_byte - cur_offset,
6999 1 << inode->i_blkbits,
7000 offset + len,
7001 &alloc_hint);
7002 if (ret < 0) {
7003 free_extent_map(em);
7004 break;
7005 }
7006 }
7007 free_extent_map(em);
7008
7009 cur_offset = last_byte;
7010 if (cur_offset >= alloc_end) {
7011 ret = 0;
7012 break;
7013 }
7014 }
7015 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7016 &cached_state, GFP_NOFS);
7017
7018 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7019out:
7020 mutex_unlock(&inode->i_mutex);
7021 return ret;
7022}
7023
7024static int btrfs_set_page_dirty(struct page *page) 7125static int btrfs_set_page_dirty(struct page *page)
7025{ 7126{
7026 return __set_page_dirty_nobuffers(page); 7127 return __set_page_dirty_nobuffers(page);
7027} 7128}
7028 7129
7029static int btrfs_permission(struct inode *inode, int mask) 7130static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7030{ 7131{
7132 struct btrfs_root *root = BTRFS_I(inode)->root;
7133
7134 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7135 return -EROFS;
7031 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7136 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7032 return -EACCES; 7137 return -EACCES;
7033 return generic_permission(inode, mask, btrfs_check_acl); 7138 return generic_permission(inode, mask, flags, btrfs_check_acl);
7034} 7139}
7035 7140
7036static const struct inode_operations btrfs_dir_inode_operations = { 7141static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7123,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7123 .listxattr = btrfs_listxattr, 7228 .listxattr = btrfs_listxattr,
7124 .removexattr = btrfs_removexattr, 7229 .removexattr = btrfs_removexattr,
7125 .permission = btrfs_permission, 7230 .permission = btrfs_permission,
7126 .fallocate = btrfs_fallocate,
7127 .fiemap = btrfs_fiemap, 7231 .fiemap = btrfs_fiemap,
7128}; 7232};
7129static const struct inode_operations btrfs_special_inode_operations = { 7233static const struct inode_operations btrfs_special_inode_operations = {
@@ -7139,6 +7243,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7139 .readlink = generic_readlink, 7243 .readlink = generic_readlink,
7140 .follow_link = page_follow_link_light, 7244 .follow_link = page_follow_link_light,
7141 .put_link = page_put_link, 7245 .put_link = page_put_link,
7246 .getattr = btrfs_getattr,
7142 .permission = btrfs_permission, 7247 .permission = btrfs_permission,
7143 .setxattr = btrfs_setxattr, 7248 .setxattr = btrfs_setxattr,
7144 .getxattr = btrfs_getxattr, 7249 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 463d91b4dd3a..a506a22b522a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 unsigned int flags, oldflags; 147 unsigned int flags, oldflags;
148 int ret; 148 int ret;
149 149
150 if (btrfs_root_readonly(root))
151 return -EROFS;
152
150 if (copy_from_user(&flags, arg, sizeof(flags))) 153 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT; 154 return -EFAULT;
152 155
@@ -233,7 +236,8 @@ static noinline int create_subvol(struct btrfs_root *root,
233 struct btrfs_inode_item *inode_item; 236 struct btrfs_inode_item *inode_item;
234 struct extent_buffer *leaf; 237 struct extent_buffer *leaf;
235 struct btrfs_root *new_root; 238 struct btrfs_root *new_root;
236 struct inode *dir = dentry->d_parent->d_inode; 239 struct dentry *parent = dget_parent(dentry);
240 struct inode *dir;
237 int ret; 241 int ret;
238 int err; 242 int err;
239 u64 objectid; 243 u64 objectid;
@@ -242,8 +246,13 @@ static noinline int create_subvol(struct btrfs_root *root,
242 246
243 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 247 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
244 0, &objectid); 248 0, &objectid);
245 if (ret) 249 if (ret) {
250 dput(parent);
246 return ret; 251 return ret;
252 }
253
254 dir = parent->d_inode;
255
247 /* 256 /*
248 * 1 - inode item 257 * 1 - inode item
249 * 2 - refs 258 * 2 - refs
@@ -251,8 +260,10 @@ static noinline int create_subvol(struct btrfs_root *root,
251 * 2 - dir items 260 * 2 - dir items
252 */ 261 */
253 trans = btrfs_start_transaction(root, 6); 262 trans = btrfs_start_transaction(root, 6);
254 if (IS_ERR(trans)) 263 if (IS_ERR(trans)) {
264 dput(parent);
255 return PTR_ERR(trans); 265 return PTR_ERR(trans);
266 }
256 267
257 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 268 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
258 0, objectid, NULL, 0, 0, 0); 269 0, objectid, NULL, 0, 0, 0);
@@ -339,6 +350,7 @@ static noinline int create_subvol(struct btrfs_root *root,
339 350
340 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 351 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
341fail: 352fail:
353 dput(parent);
342 if (async_transid) { 354 if (async_transid) {
343 *async_transid = trans->transid; 355 *async_transid = trans->transid;
344 err = btrfs_commit_transaction_async(trans, root, 1); 356 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -351,9 +363,11 @@ fail:
351} 363}
352 364
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 365static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
354 char *name, int namelen, u64 *async_transid) 366 char *name, int namelen, u64 *async_transid,
367 bool readonly)
355{ 368{
356 struct inode *inode; 369 struct inode *inode;
370 struct dentry *parent;
357 struct btrfs_pending_snapshot *pending_snapshot; 371 struct btrfs_pending_snapshot *pending_snapshot;
358 struct btrfs_trans_handle *trans; 372 struct btrfs_trans_handle *trans;
359 int ret; 373 int ret;
@@ -368,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
368 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 382 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
369 pending_snapshot->dentry = dentry; 383 pending_snapshot->dentry = dentry;
370 pending_snapshot->root = root; 384 pending_snapshot->root = root;
385 pending_snapshot->readonly = readonly;
371 386
372 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 387 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
373 if (IS_ERR(trans)) { 388 if (IS_ERR(trans)) {
@@ -396,7 +411,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
396 411
397 btrfs_orphan_cleanup(pending_snapshot->snap); 412 btrfs_orphan_cleanup(pending_snapshot->snap);
398 413
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 414 parent = dget_parent(dentry);
415 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
416 dput(parent);
400 if (IS_ERR(inode)) { 417 if (IS_ERR(inode)) {
401 ret = PTR_ERR(inode); 418 ret = PTR_ERR(inode);
402 goto fail; 419 goto fail;
@@ -497,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
497static noinline int btrfs_mksubvol(struct path *parent, 514static noinline int btrfs_mksubvol(struct path *parent,
498 char *name, int namelen, 515 char *name, int namelen,
499 struct btrfs_root *snap_src, 516 struct btrfs_root *snap_src,
500 u64 *async_transid) 517 u64 *async_transid, bool readonly)
501{ 518{
502 struct inode *dir = parent->dentry->d_inode; 519 struct inode *dir = parent->dentry->d_inode;
503 struct dentry *dentry; 520 struct dentry *dentry;
@@ -529,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
529 546
530 if (snap_src) { 547 if (snap_src) {
531 error = create_snapshot(snap_src, dentry, 548 error = create_snapshot(snap_src, dentry,
532 name, namelen, async_transid); 549 name, namelen, async_transid, readonly);
533 } else { 550 } else {
534 error = create_subvol(BTRFS_I(dir)->root, dentry, 551 error = create_subvol(BTRFS_I(dir)->root, dentry,
535 name, namelen, async_transid); 552 name, namelen, async_transid);
@@ -626,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
626 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 643 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
627 struct btrfs_ordered_extent *ordered; 644 struct btrfs_ordered_extent *ordered;
628 struct page *page; 645 struct page *page;
646 struct btrfs_super_block *disk_super;
629 unsigned long last_index; 647 unsigned long last_index;
630 unsigned long ra_pages = root->fs_info->bdi.ra_pages; 648 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
631 unsigned long total_read = 0; 649 unsigned long total_read = 0;
650 u64 features;
632 u64 page_start; 651 u64 page_start;
633 u64 page_end; 652 u64 page_end;
634 u64 last_len = 0; 653 u64 last_len = 0;
@@ -636,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
636 u64 defrag_end = 0; 655 u64 defrag_end = 0;
637 unsigned long i; 656 unsigned long i;
638 int ret; 657 int ret;
658 int compress_type = BTRFS_COMPRESS_ZLIB;
659
660 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
661 if (range->compress_type > BTRFS_COMPRESS_TYPES)
662 return -EINVAL;
663 if (range->compress_type)
664 compress_type = range->compress_type;
665 }
639 666
640 if (inode->i_size == 0) 667 if (inode->i_size == 0)
641 return 0; 668 return 0;
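With the new compress_type field validated above, userspace can request a specific algorithm when forcing recompression during defrag. A hypothetical userspace sketch (fd is an open descriptor on the file to defragment; error handling reduced to perror):

    struct btrfs_ioctl_defrag_range_args range;

    memset(&range, 0, sizeof(range));
    range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
    range.compress_type = BTRFS_COMPRESS_LZO;  /* new in this patch */
    range.len = (__u64)-1;                     /* whole file */
    if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
            perror("BTRFS_IOC_DEFRAG_RANGE");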
@@ -671,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
671 total_read++; 698 total_read++;
672 mutex_lock(&inode->i_mutex); 699 mutex_lock(&inode->i_mutex);
673 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 700 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
674 BTRFS_I(inode)->force_compress = 1; 701 BTRFS_I(inode)->force_compress = compress_type;
675 702
676 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 703 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
677 if (ret) 704 if (ret)
@@ -769,10 +796,17 @@ loop_unlock:
769 atomic_dec(&root->fs_info->async_submit_draining); 796 atomic_dec(&root->fs_info->async_submit_draining);
770 797
771 mutex_lock(&inode->i_mutex); 798 mutex_lock(&inode->i_mutex);
772 BTRFS_I(inode)->force_compress = 0; 799 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
773 mutex_unlock(&inode->i_mutex); 800 mutex_unlock(&inode->i_mutex);
774 } 801 }
775 802
803 disk_super = &root->fs_info->super_copy;
804 features = btrfs_super_incompat_flags(disk_super);
805 if (range->compress_type == BTRFS_COMPRESS_LZO) {
806 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
807 btrfs_set_super_incompat_flags(disk_super, features);
808 }
809
776 return 0; 810 return 0;
777 811
778err_reservations: 812err_reservations:
@@ -889,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
889 char *name, 923 char *name,
890 unsigned long fd, 924 unsigned long fd,
891 int subvol, 925 int subvol,
892 u64 *transid) 926 u64 *transid,
927 bool readonly)
893{ 928{
894 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 929 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
895 struct file *src_file; 930 struct file *src_file;
@@ -907,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
907 942
908 if (subvol) { 943 if (subvol) {
909 ret = btrfs_mksubvol(&file->f_path, name, namelen, 944 ret = btrfs_mksubvol(&file->f_path, name, namelen,
910 NULL, transid); 945 NULL, transid, readonly);
911 } else { 946 } else {
912 struct inode *src_inode; 947 struct inode *src_inode;
913 src_file = fget(fd); 948 src_file = fget(fd);
@@ -926,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
926 } 961 }
927 ret = btrfs_mksubvol(&file->f_path, name, namelen, 962 ret = btrfs_mksubvol(&file->f_path, name, namelen,
928 BTRFS_I(src_inode)->root, 963 BTRFS_I(src_inode)->root,
929 transid); 964 transid, readonly);
930 fput(src_file); 965 fput(src_file);
931 } 966 }
932out: 967out:
@@ -934,49 +969,142 @@ out:
934} 969}
935 970
936static noinline int btrfs_ioctl_snap_create(struct file *file, 971static noinline int btrfs_ioctl_snap_create(struct file *file,
937 void __user *arg, int subvol, 972 void __user *arg, int subvol)
938 int async)
939{ 973{
940 struct btrfs_ioctl_vol_args *vol_args = NULL; 974 struct btrfs_ioctl_vol_args *vol_args;
941 struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
942 char *name;
943 u64 fd;
944 u64 transid = 0;
945 int ret; 975 int ret;
946 976
947 if (async) { 977 vol_args = memdup_user(arg, sizeof(*vol_args));
948 async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); 978 if (IS_ERR(vol_args))
949 if (IS_ERR(async_vol_args)) 979 return PTR_ERR(vol_args);
950 return PTR_ERR(async_vol_args); 980 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
951 981
952 name = async_vol_args->name; 982 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
953 fd = async_vol_args->fd; 983 vol_args->fd, subvol,
954 async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; 984 NULL, false);
955 } else {
956 vol_args = memdup_user(arg, sizeof(*vol_args));
957 if (IS_ERR(vol_args))
958 return PTR_ERR(vol_args);
959 name = vol_args->name;
960 fd = vol_args->fd;
961 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
962 }
963 985
964 ret = btrfs_ioctl_snap_create_transid(file, name, fd, 986 kfree(vol_args);
965 subvol, &transid); 987 return ret;
988}
966 989
967 if (!ret && async) { 990static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
968 if (copy_to_user(arg + 991 void __user *arg, int subvol)
969 offsetof(struct btrfs_ioctl_async_vol_args, 992{
970 transid), &transid, sizeof(transid))) 993 struct btrfs_ioctl_vol_args_v2 *vol_args;
971 return -EFAULT; 994 int ret;
995 u64 transid = 0;
996 u64 *ptr = NULL;
997 bool readonly = false;
998
999 vol_args = memdup_user(arg, sizeof(*vol_args));
1000 if (IS_ERR(vol_args))
1001 return PTR_ERR(vol_args);
1002 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1003
1004 if (vol_args->flags &
1005 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
1006 ret = -EOPNOTSUPP;
1007 goto out;
972 } 1008 }
973 1009
1010 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1011 ptr = &transid;
1012 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1013 readonly = true;
1014
1015 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1016 vol_args->fd, subvol,
1017 ptr, readonly);
1018
1019 if (ret == 0 && ptr &&
1020 copy_to_user(arg +
1021 offsetof(struct btrfs_ioctl_vol_args_v2,
1022 transid), ptr, sizeof(*ptr)))
1023 ret = -EFAULT;
1024out:
974 kfree(vol_args); 1025 kfree(vol_args);
975 kfree(async_vol_args); 1026 return ret;
1027}
1028
1029static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1030 void __user *arg)
1031{
1032 struct inode *inode = fdentry(file)->d_inode;
1033 struct btrfs_root *root = BTRFS_I(inode)->root;
1034 int ret = 0;
1035 u64 flags = 0;
1036
1037 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1038 return -EINVAL;
1039
1040 down_read(&root->fs_info->subvol_sem);
1041 if (btrfs_root_readonly(root))
1042 flags |= BTRFS_SUBVOL_RDONLY;
1043 up_read(&root->fs_info->subvol_sem);
1044
1045 if (copy_to_user(arg, &flags, sizeof(flags)))
1046 ret = -EFAULT;
976 1047
977 return ret; 1048 return ret;
978} 1049}
979 1050
1051static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1052 void __user *arg)
1053{
1054 struct inode *inode = fdentry(file)->d_inode;
1055 struct btrfs_root *root = BTRFS_I(inode)->root;
1056 struct btrfs_trans_handle *trans;
1057 u64 root_flags;
1058 u64 flags;
1059 int ret = 0;
1060
1061 if (root->fs_info->sb->s_flags & MS_RDONLY)
1062 return -EROFS;
1063
1064 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1065 return -EINVAL;
1066
1067 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT;
1069
1070 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL;
1072
1073 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP;
1075
1076 down_write(&root->fs_info->subvol_sem);
1077
1078 /* nothing to do */
1079 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1080 goto out;
1081
1082 root_flags = btrfs_root_flags(&root->root_item);
1083 if (flags & BTRFS_SUBVOL_RDONLY)
1084 btrfs_set_root_flags(&root->root_item,
1085 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1086 else
1087 btrfs_set_root_flags(&root->root_item,
1088 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1089
1090 trans = btrfs_start_transaction(root, 1);
1091 if (IS_ERR(trans)) {
1092 ret = PTR_ERR(trans);
1093 goto out_reset;
1094 }
1095
1096 ret = btrfs_update_root(trans, root,
1097 &root->root_key, &root->root_item);
1098
1099 btrfs_commit_transaction(trans, root);
1100out_reset:
1101 if (ret)
1102 btrfs_set_root_flags(&root->root_item, root_flags);
1103out:
1104 up_write(&root->fs_info->subvol_sem);
1105 return ret;
1106}
1107
980/* 1108/*
981 * helper to check if the subvolume references other subvolumes 1109 * helper to check if the subvolume references other subvolumes
982 */ 1110 */
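Taken together, the two new handlers let userspace toggle a subvolume between read-write and read-only. A hypothetical sketch (assumes <fcntl.h>, <unistd.h>, <sys/ioctl.h> and the btrfs ioctl header; error handling omitted):

    int make_subvol_readonly(const char *path)
    {
            int fd = open(path, O_RDONLY);
            __u64 flags = 0;

            ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags);
            flags |= BTRFS_SUBVOL_RDONLY;
            ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags);
            close(fd);
            return 0;
    }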
@@ -1485,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1485 struct btrfs_ioctl_defrag_range_args *range; 1613 struct btrfs_ioctl_defrag_range_args *range;
1486 int ret; 1614 int ret;
1487 1615
1616 if (btrfs_root_readonly(root))
1617 return -EROFS;
1618
1488 ret = mnt_want_write(file->f_path.mnt); 1619 ret = mnt_want_write(file->f_path.mnt);
1489 if (ret) 1620 if (ret)
1490 return ret; 1621 return ret;
@@ -1613,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1613 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 1744 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1614 return -EINVAL; 1745 return -EINVAL;
1615 1746
1747 if (btrfs_root_readonly(root))
1748 return -EROFS;
1749
1616 ret = mnt_want_write(file->f_path.mnt); 1750 ret = mnt_want_write(file->f_path.mnt);
1617 if (ret) 1751 if (ret)
1618 return ret; 1752 return ret;
@@ -1669,12 +1803,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1669 olen = len = src->i_size - off; 1803 olen = len = src->i_size - off;
1670 /* if we extend to eof, continue to block boundary */ 1804 /* if we extend to eof, continue to block boundary */
1671 if (off + len == src->i_size) 1805 if (off + len == src->i_size)
1672 len = ((src->i_size + bs-1) & ~(bs-1)) 1806 len = ALIGN(src->i_size, bs) - off;
1673 - off;
1674 1807
1675 /* verify the end result is block aligned */ 1808 /* verify the end result is block aligned */
1676 if ((off & (bs-1)) || 1809 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
1677 ((off + len) & (bs-1))) 1810 !IS_ALIGNED(destoff, bs))
1678 goto out_unlock; 1811 goto out_unlock;
1679 1812
1680 /* do any pending delalloc/csum calc on src, one way or 1813 /* do any pending delalloc/csum calc on src, one way or
@@ -1874,8 +2007,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1874 * but shouldn't round up the file size 2007 * but shouldn't round up the file size
1875 */ 2008 */
1876 endoff = new_key.offset + datal; 2009 endoff = new_key.offset + datal;
1877 if (endoff > off+olen) 2010 if (endoff > destoff+olen)
1878 endoff = off+olen; 2011 endoff = destoff+olen;
1879 if (endoff > inode->i_size) 2012 if (endoff > inode->i_size)
1880 btrfs_i_size_write(inode, endoff); 2013 btrfs_i_size_write(inode, endoff);
1881 2014
@@ -1935,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1935 if (file->private_data) 2068 if (file->private_data)
1936 goto out; 2069 goto out;
1937 2070
2071 ret = -EROFS;
2072 if (btrfs_root_readonly(root))
2073 goto out;
2074
1938 ret = mnt_want_write(file->f_path.mnt); 2075 ret = mnt_want_write(file->f_path.mnt);
1939 if (ret) 2076 if (ret)
1940 goto out; 2077 goto out;
@@ -2234,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
2234 case FS_IOC_GETVERSION: 2371 case FS_IOC_GETVERSION:
2235 return btrfs_ioctl_getversion(file, argp); 2372 return btrfs_ioctl_getversion(file, argp);
2236 case BTRFS_IOC_SNAP_CREATE: 2373 case BTRFS_IOC_SNAP_CREATE:
2237 return btrfs_ioctl_snap_create(file, argp, 0, 0); 2374 return btrfs_ioctl_snap_create(file, argp, 0);
2238 case BTRFS_IOC_SNAP_CREATE_ASYNC: 2375 case BTRFS_IOC_SNAP_CREATE_V2:
2239 return btrfs_ioctl_snap_create(file, argp, 0, 1); 2376 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2240 case BTRFS_IOC_SUBVOL_CREATE: 2377 case BTRFS_IOC_SUBVOL_CREATE:
2241 return btrfs_ioctl_snap_create(file, argp, 1, 0); 2378 return btrfs_ioctl_snap_create(file, argp, 1);
2242 case BTRFS_IOC_SNAP_DESTROY: 2379 case BTRFS_IOC_SNAP_DESTROY:
2243 return btrfs_ioctl_snap_destroy(file, argp); 2380 return btrfs_ioctl_snap_destroy(file, argp);
2381 case BTRFS_IOC_SUBVOL_GETFLAGS:
2382 return btrfs_ioctl_subvol_getflags(file, argp);
2383 case BTRFS_IOC_SUBVOL_SETFLAGS:
2384 return btrfs_ioctl_subvol_setflags(file, argp);
2244 case BTRFS_IOC_DEFAULT_SUBVOL: 2385 case BTRFS_IOC_DEFAULT_SUBVOL:
2245 return btrfs_ioctl_default_subvol(file, argp); 2386 return btrfs_ioctl_default_subvol(file, argp);
2246 case BTRFS_IOC_DEFRAG: 2387 case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf960..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,16 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SNAPSHOT_NAME_MAX 4079 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34struct btrfs_ioctl_async_vol_args { 34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35
36#define BTRFS_SUBVOL_NAME_MAX 4039
37struct btrfs_ioctl_vol_args_v2 {
35 __s64 fd; 38 __s64 fd;
36 __u64 transid; 39 __u64 transid;
37 char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; 40 __u64 flags;
41 __u64 unused[4];
42 char name[BTRFS_SUBVOL_NAME_MAX + 1];
38}; 43};
39 44
40#define BTRFS_INO_LOOKUP_PATH_MAX 4080 45#define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -129,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
129 */ 134 */
130 __u32 extent_thresh; 135 __u32 extent_thresh;
131 136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
132 /* spare for later */ 144 /* spare for later */
133 __u32 unused[5]; 145 __u32 unused[4];
134}; 146};
135 147
136struct btrfs_ioctl_space_info { 148struct btrfs_ioctl_space_info {
@@ -187,6 +199,8 @@ struct btrfs_ioctl_space_args {
187 struct btrfs_ioctl_space_args) 199 struct btrfs_ioctl_space_args)
188#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) 200#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
189#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
190#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ 202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
191 struct btrfs_ioctl_async_vol_args) 203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
192#endif 206#endif
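The v2 args structure replaces the async-only variant and carries a flags word, so one ioctl covers plain, async, and read-only snapshot creation. A hypothetical userspace sketch creating a read-only snapshot (srcfd is an open fd on the source subvolume, dirfd on the destination directory):

    int snap_create_readonly(int dirfd, int srcfd, const char *name)
    {
            struct btrfs_ioctl_vol_args_v2 args;

            memset(&args, 0, sizeof(args));
            args.fd = srcfd;
            args.flags = BTRFS_SUBVOL_RDONLY;
            strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);
            return ioctl(dirfd, BTRFS_IOC_SNAP_CREATE_V2, &args);
    }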
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..cc9b450399df
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
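These two helpers define the on-disk framing for lzo extents: a 4-byte little-endian total length for the whole compressed stream, then a sequence of (4-byte segment length, segment data) records, one record per input page, padded so a length field never has fewer than 4 bytes left in a page. A reader for that layout, sketched as if the stream were contiguous in memory (read_le32 and handle_segment are hypothetical; the in-kernel reader additionally skips the zero padding at page boundaries):

    size_t off = 0;
    uint32_t total = read_le32(data);        /* whole-stream length */

    off += 4;
    while (off < total) {
            uint32_t seg_len = read_le32(data + off);

            off += 4;
            handle_segment(data + off, seg_len);  /* one page's worth */
            off += seg_len;
    }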
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note: if there are fewer than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283
284 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in);
286
287 tot_in = LZO_LEN;
288 in_offset = LZO_LEN;
289 tot_len = min_t(size_t, srclen, tot_len);
290 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
291
292 tot_out = 0;
293 pg_offset = 0;
294
295 while (tot_in < tot_len) {
296 in_len = read_compress_length(data_in + in_offset);
297 in_page_bytes_left -= LZO_LEN;
298 in_offset += LZO_LEN;
299 tot_in += LZO_LEN;
300
301 tot_in += in_len;
302 working_bytes = in_len;
303
304 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset;
307 bytes = in_len;
308 goto cont;
309 }
310
311 /* copy bytes from the pages into the working buffer */
312 buf = workspace->cbuf;
313 buf_offset = 0;
314 while (working_bytes) {
315 bytes = min(working_bytes, in_page_bytes_left);
316
317 memcpy(buf + buf_offset, data_in + in_offset, bytes);
318 buf_offset += bytes;
319cont:
320 working_bytes -= bytes;
321 in_page_bytes_left -= bytes;
322 in_offset += bytes;
323
324 /* check if we need to pick another page */
325 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
326 || in_page_bytes_left == 0) {
327 tot_in += in_page_bytes_left;
328
329 if (working_bytes == 0 && tot_in >= tot_len)
330 break;
331
332 kunmap(pages_in[page_in_index]);
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1;
336 data_in = NULL;
337 goto done;
338 }
339 data_in = kmap(pages_in[page_in_index]);
340
341 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0;
343 }
344 }
345
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len);
349 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1;
352 break;
353 }
354
355 buf_start = tot_out;
356 tot_out += out_len;
357
358 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
359 tot_out, disk_start,
360 bvec, vcnt,
361 &page_out_index, &pg_offset);
362 if (ret2 == 0)
363 break;
364 }
365done:
366 if (data_in)
367 kunmap(pages_in[page_in_index]);
368 return ret;
369}
370
371static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
372 struct page *dest_page,
373 unsigned long start_byte,
374 size_t srclen, size_t destlen)
375{
376 struct workspace *workspace = list_entry(ws, struct workspace, list);
377 size_t in_len;
378 size_t out_len;
379 size_t tot_len;
380 int ret = 0;
381 char *kaddr;
382 unsigned long bytes;
383
384 BUG_ON(srclen < LZO_LEN);
385
386 tot_len = read_compress_length(data_in);
387 data_in += LZO_LEN;
388
389 in_len = read_compress_length(data_in);
390 data_in += LZO_LEN;
391
392 out_len = PAGE_CACHE_SIZE;
393 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
394 if (ret != LZO_E_OK) {
395 printk(KERN_WARNING "btrfs decompress failed!\n");
396 ret = -1;
397 goto out;
398 }
399
400 if (out_len < start_byte) {
401 ret = -1;
402 goto out;
403 }
404
405 bytes = min_t(unsigned long, destlen, out_len - start_byte);
406
407 kaddr = kmap_atomic(dest_page, KM_USER0);
408 memcpy(kaddr, workspace->buf + start_byte, bytes);
409 kunmap_atomic(kaddr, KM_USER0);
410out:
411 return ret;
412}
413
414struct btrfs_compress_op btrfs_lzo_compress = {
415 .alloc_workspace = lzo_alloc_workspace,
416 .free_workspace = lzo_free_workspace,
417 .compress_pages = lzo_compress_pages,
418 .decompress_biovec = lzo_decompress_biovec,
419 .decompress = lzo_decompress,
420};
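Like zlib.c, this file exports a struct btrfs_compress_op with the five hooks the generic compression code calls. A sketch of the dispatch this enables, mirroring how compression.c is expected to index implementations by type (the table and indexing shown here are illustrative):

    static struct btrfs_compress_op *btrfs_compress_ops[] = {
            &btrfs_zlib_compress,   /* BTRFS_COMPRESS_ZLIB */
            &btrfs_lzo_compress,    /* BTRFS_COMPRESS_LZO  */
    };

    /* e.g. btrfs_decompress(type, ...) would resolve to
     * btrfs_compress_ops[type - 1]->decompress(...) */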
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f4621f6deca1..2b61e1ddcd99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio) 175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 221 u64 start, u64 len, u64 disk_len, int type)
221{ 222{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 223 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0); 224 disk_len, type, 0,
225 BTRFS_COMPRESS_NONE);
224} 226}
225 227
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 228int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 229 u64 start, u64 len, u64 disk_len, int type)
228{ 230{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 231 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1); 232 disk_len, type, 1,
233 BTRFS_COMPRESS_NONE);
234}
235
236int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
237 u64 start, u64 len, u64 disk_len,
238 int type, int compress_type)
239{
240 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
241 disk_len, type, 0,
242 compress_type);
231} 243}
232 244
233/* 245/*
@@ -250,6 +262,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
250 262
251/* 263/*
252 * this is used to account for finished IO across a given range 264 * this is used to account for finished IO across a given range
265 * of the file. The IO may span ordered extents. If
266 * a given ordered_extent is completely done, 1 is returned, otherwise
267 * 0.
268 *
269 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
270 * to make sure this function only returns 1 once for a given ordered extent.
271 *
272 * file_offset is updated to one byte past the range that is recorded as
273 * complete. This allows you to walk forward in the file.
274 */
275int btrfs_dec_test_first_ordered_pending(struct inode *inode,
276 struct btrfs_ordered_extent **cached,
277 u64 *file_offset, u64 io_size)
278{
279 struct btrfs_ordered_inode_tree *tree;
280 struct rb_node *node;
281 struct btrfs_ordered_extent *entry = NULL;
282 int ret;
283 u64 dec_end;
284 u64 dec_start;
285 u64 to_dec;
286
287 tree = &BTRFS_I(inode)->ordered_tree;
288 spin_lock(&tree->lock);
289 node = tree_search(tree, *file_offset);
290 if (!node) {
291 ret = 1;
292 goto out;
293 }
294
295 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
296 if (!offset_in_entry(entry, *file_offset)) {
297 ret = 1;
298 goto out;
299 }
300
301 dec_start = max(*file_offset, entry->file_offset);
302 dec_end = min(*file_offset + io_size, entry->file_offset +
303 entry->len);
304 *file_offset = dec_end;
305 if (dec_start > dec_end) {
306 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
307 (unsigned long long)dec_start,
308 (unsigned long long)dec_end);
309 }
310 to_dec = dec_end - dec_start;
311 if (to_dec > entry->bytes_left) {
312 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
313 (unsigned long long)entry->bytes_left,
314 (unsigned long long)to_dec);
315 }
316 entry->bytes_left -= to_dec;
317 if (entry->bytes_left == 0)
318 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
319 else
320 ret = 1;
321out:
322 if (!ret && cached && entry) {
323 *cached = entry;
324 atomic_inc(&entry->refs);
325 }
326 spin_unlock(&tree->lock);
327 return ret == 0;
328}
329
330/*
331 * this is used to account for finished IO across a given range
253 * of the file. The IO should not span ordered extents. If 332 * of the file. The IO should not span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise 333 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0. 334 * 0.
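
The new btrfs_dec_test_first_ordered_pending() differs from the existing variant in that it advances *file_offset past whatever it accounted, so a caller can walk an IO that spans several ordered extents. A rough user-space model of that walk, with a sorted array standing in for the rbtree and all sizes chosen for illustration (the kernel uses test_and_set_bit so completion is reported only once):

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t file_offset, len, bytes_left; };

/* account io_size bytes starting at *off against one extent, advance *off */
static int dec_first_pending(struct extent *e, uint64_t *off, uint64_t io_size)
{
	uint64_t end = e->file_offset + e->len;
	uint64_t dec_start = *off > e->file_offset ? *off : e->file_offset;
	uint64_t dec_end = *off + io_size < end ? *off + io_size : end;

	*off = dec_end;			/* one byte past what was recorded */
	e->bytes_left -= dec_end - dec_start;
	return e->bytes_left == 0;	/* the kernel reports this only once */
}

int main(void)
{
	struct extent ex[2] = { { 0, 4096, 4096 }, { 4096, 4096, 4096 } };
	uint64_t off = 0, io_end = 8192;
	int i;

	for (i = 0; i < 2 && off < io_end; i++) {
		int done = dec_first_pending(&ex[i], &off, io_end - off);

		printf("extent %d done=%d next off=%llu\n",
		       i, done, (unsigned long long)off);
	}
	return 0;
}
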
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3f..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -141,10 +144,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
141int btrfs_dec_test_ordered_pending(struct inode *inode, 144int btrfs_dec_test_ordered_pending(struct inode *inode,
142 struct btrfs_ordered_extent **cached, 145 struct btrfs_ordered_extent **cached,
143 u64 file_offset, u64 io_size); 146 u64 file_offset, u64 io_size);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size);
144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
145 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
148int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
149 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
150 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
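
The header now exposes three thin wrappers over one worker, differing only in the dio and compress_type defaults they pass down. A stand-alone sketch of that delegation pattern, with the worker stubbed out and the COMPRESS_* values mirroring the patch's BTRFS_COMPRESS_* constants:

#include <stdio.h>

enum { COMPRESS_NONE, COMPRESS_ZLIB, COMPRESS_LZO };

/* stand-in for __btrfs_add_ordered_extent(): just reports its arguments */
static int add_ordered(unsigned long long off, int dio, int compress_type)
{
	printf("off=%llu dio=%d compress_type=%d\n", off, dio, compress_type);
	return 0;
}

static int add_ordered_extent(unsigned long long off)
{
	return add_ordered(off, 0, COMPRESS_NONE);
}

static int add_ordered_extent_dio(unsigned long long off)
{
	return add_ordered(off, 1, COMPRESS_NONE);
}

static int add_ordered_extent_compress(unsigned long long off, int type)
{
	return add_ordered(off, 0, type);
}

int main(void)
{
	add_ordered_extent(0);
	add_ordered_extent_dio(4096);
	add_ordered_extent_compress(8192, COMPRESS_LZO);
	return 0;
}
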
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28e..f8be250963a0 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret) 59 if (ret < 0)
60 goto out; 60 goto out;
61 if (ret) {
62 ret = -ENOENT;
63 goto out;
64 }
61 65
62 ret = btrfs_del_item(trans, root, path); 66 ret = btrfs_del_item(trans, root, path);
63 67
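
The orphan.c fix above hinges on btrfs_search_slot()'s three-way return: negative on error, 0 when the key is found, positive when it is not. A small model of the corrected caller logic, with the search itself faked:

#include <errno.h>
#include <stdio.h>

/* fake search: <0 error, 0 found, >0 not present, as btrfs_search_slot() */
static int search_slot(int key)
{
	if (key < 0)
		return -EIO;
	return key == 42 ? 0 : 1;
}

static int del_orphan_item(int key)
{
	int ret = search_slot(key);

	if (ret < 0)		/* a real error: pass it through */
		return ret;
	if (ret)		/* not found: the old code leaked 1 here */
		return -ENOENT;
	return 0;		/* found: go on and delete the item */
}

int main(void)
{
	printf("%d %d %d\n", del_orphan_item(42), del_orphan_item(7),
	       del_orphan_item(-1));
	return 0;
}
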
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8299a25ffc8f..b2130c46fdb5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
54 54
55static const struct super_operations btrfs_super_ops; 55static const struct super_operations btrfs_super_ops;
56 56
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
58 char nbuf[16])
59{
60 char *errstr = NULL;
61
62 switch (errno) {
63 case -EIO:
64 errstr = "IO failure";
65 break;
66 case -ENOMEM:
67 errstr = "Out of memory";
68 break;
69 case -EROFS:
70 errstr = "Readonly filesystem";
71 break;
72 default:
73 if (nbuf) {
74 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
75 errstr = nbuf;
76 }
77 break;
78 }
79
80 return errstr;
81}
82
83static void __save_error_info(struct btrfs_fs_info *fs_info)
84{
85 /*
86 * today we only save the error info into RAM. Long term we'll
87 * also send it down to the disk.
88 */
89 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
90}
91
92/* NOTE:
93 * We move the write_super stuff to umount in order to avoid a deadlock,
94 * since umount holds all the locks.
95 */
96static void save_error_info(struct btrfs_fs_info *fs_info)
97{
98 __save_error_info(fs_info);
99}
100
101/* btrfs handle error by forcing the filesystem readonly */
102static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
103{
104 struct super_block *sb = fs_info->sb;
105
106 if (sb->s_flags & MS_RDONLY)
107 return;
108
109 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
110 sb->s_flags |= MS_RDONLY;
111 printk(KERN_INFO "btrfs is forced readonly\n");
112 }
113}
114
115/*
116 * __btrfs_std_error decodes expected errors from the caller and
117 * invokes the appropriate error response.
118 */
119void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
120 unsigned int line, int errno)
121{
122 struct super_block *sb = fs_info->sb;
123 char nbuf[16];
124 const char *errstr;
125
126 /*
127 * Special case: if the error is EROFS, and we're already
128 * under MS_RDONLY, then it is safe here.
129 */
130 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
131 return;
132
133 errstr = btrfs_decode_error(fs_info, errno, nbuf);
134 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
135 sb->s_id, function, line, errstr);
136 save_error_info(fs_info);
137
138 btrfs_handle_error(fs_info);
139}
140
57static void btrfs_put_super(struct super_block *sb) 141static void btrfs_put_super(struct super_block *sb)
58{ 142{
59 struct btrfs_root *root = btrfs_sb(sb); 143 struct btrfs_root *root = btrfs_sb(sb);
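
The new error path is a decode-print-mark pipeline: turn the errno into a string, log it with function and line context, record the error state, then force the filesystem read-only. A compressed user-space model of that flow; the flag names are stand-ins and everything outside the control flow is simplified:

#include <errno.h>
#include <stdio.h>

#define SUPER_FLAG_ERROR 0x1UL	/* stands in for BTRFS_SUPER_FLAG_ERROR */
#define SB_FLAG_RDONLY   0x2UL	/* stands in for MS_RDONLY */

struct fake_fs { unsigned long fs_state, s_flags; };

static const char *decode_error(int err, char nbuf[16])
{
	switch (err) {
	case -EIO:    return "IO failure";
	case -ENOMEM: return "Out of memory";
	case -EROFS:  return "Readonly filesystem";
	default:
		snprintf(nbuf, 16, "error %d", -err);
		return nbuf;
	}
}

static void std_error(struct fake_fs *fs, const char *func, int line, int err)
{
	char nbuf[16];

	/* EROFS while already read-only is harmless, as in the patch */
	if (err == -EROFS && (fs->s_flags & SB_FLAG_RDONLY))
		return;
	fprintf(stderr, "BTRFS error in %s:%d: %s\n",
		func, line, decode_error(err, nbuf));
	fs->fs_state = SUPER_FLAG_ERROR;	/* save_error_info() */
	if (fs->fs_state & SUPER_FLAG_ERROR)	/* btrfs_handle_error() */
		fs->s_flags |= SB_FLAG_RDONLY;
}

int main(void)
{
	struct fake_fs fs = { 0, 0 };

	std_error(&fs, __func__, __LINE__, -EIO);
	printf("forced readonly: %d\n", !!(fs.s_flags & SB_FLAG_RDONLY));
	return 0;
}
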
@@ -69,9 +153,9 @@ enum {
69 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 153 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 154 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, 157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
74 Opt_user_subvol_rm_allowed, 158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
75}; 159};
76 160
77static match_table_t tokens = { 161static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
86 {Opt_alloc_start, "alloc_start=%s"}, 170 {Opt_alloc_start, "alloc_start=%s"},
87 {Opt_thread_pool, "thread_pool=%d"}, 171 {Opt_thread_pool, "thread_pool=%d"},
88 {Opt_compress, "compress"}, 172 {Opt_compress, "compress"},
173 {Opt_compress_type, "compress=%s"},
89 {Opt_compress_force, "compress-force"}, 174 {Opt_compress_force, "compress-force"},
175 {Opt_compress_force_type, "compress-force=%s"},
90 {Opt_ssd, "ssd"}, 176 {Opt_ssd, "ssd"},
91 {Opt_ssd_spread, "ssd_spread"}, 177 {Opt_ssd_spread, "ssd_spread"},
92 {Opt_nossd, "nossd"}, 178 {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
112 char *p, *num, *orig; 198 char *p, *num, *orig;
113 int intarg; 199 int intarg;
114 int ret = 0; 200 int ret = 0;
201 char *compress_type;
202 bool compress_force = false;
115 203
116 if (!options) 204 if (!options)
117 return 0; 205 return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, NODATACOW); 242 btrfs_set_opt(info->mount_opt, NODATACOW);
155 btrfs_set_opt(info->mount_opt, NODATASUM); 243 btrfs_set_opt(info->mount_opt, NODATASUM);
156 break; 244 break;
157 case Opt_compress:
158 printk(KERN_INFO "btrfs: use compression\n");
159 btrfs_set_opt(info->mount_opt, COMPRESS);
160 break;
161 case Opt_compress_force: 245 case Opt_compress_force:
162 printk(KERN_INFO "btrfs: forcing compression\n"); 246 case Opt_compress_force_type:
163 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 247 compress_force = true;
248 case Opt_compress:
249 case Opt_compress_type:
250 if (token == Opt_compress ||
251 token == Opt_compress_force ||
252 strcmp(args[0].from, "zlib") == 0) {
253 compress_type = "zlib";
254 info->compress_type = BTRFS_COMPRESS_ZLIB;
255 } else if (strcmp(args[0].from, "lzo") == 0) {
256 compress_type = "lzo";
257 info->compress_type = BTRFS_COMPRESS_LZO;
258 } else {
259 ret = -EINVAL;
260 goto out;
261 }
262
164 btrfs_set_opt(info->mount_opt, COMPRESS); 263 btrfs_set_opt(info->mount_opt, COMPRESS);
264 if (compress_force) {
265 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
266 pr_info("btrfs: force %s compression\n",
267 compress_type);
268 } else
269 pr_info("btrfs: use %s compression\n",
270 compress_type);
165 break; 271 break;
166 case Opt_ssd: 272 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 273 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
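
The reworked option parsing folds four tokens into one case ladder: bare compress and compress-force select zlib, while compress=%s and compress-force=%s pick the algorithm by name, rejecting anything else with -EINVAL. A stand-alone sketch of the same dispatch, using plain string handling instead of the kernel's match_table_t:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum { COMPRESS_ZLIB, COMPRESS_LZO };

/* parse one "compress[=x]" / "compress-force[=x]" mount option */
static int parse_compress(const char *opt, bool *force, int *type)
{
	const char *arg = strchr(opt, '=');

	*force = strncmp(opt, "compress-force", 14) == 0;
	if (arg)
		arg++;			/* step over '=' to the name */

	if (!arg || strcmp(arg, "zlib") == 0)
		*type = COMPRESS_ZLIB;	/* the bare options default to zlib */
	else if (strcmp(arg, "lzo") == 0)
		*type = COMPRESS_LZO;
	else
		return -1;		/* -EINVAL in the patch */
	return 0;
}

int main(void)
{
	const char *opts[] = { "compress", "compress=lzo",
			       "compress-force=zlib", "compress=xz" };
	bool force = false;
	int type = COMPRESS_ZLIB;
	int i;

	for (i = 0; i < 4; i++) {
		int ret = parse_compress(opts[i], &force, &type);

		printf("%-20s ret=%d force=%d type=%d\n",
		       opts[i], ret, force, type);
	}
	return 0;
}
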
@@ -244,6 +350,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
244 case Opt_space_cache: 350 case Opt_space_cache:
245 printk(KERN_INFO "btrfs: enabling disk space caching\n"); 351 printk(KERN_INFO "btrfs: enabling disk space caching\n");
246 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 352 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
353 break;
247 case Opt_clear_cache: 354 case Opt_clear_cache:
248 printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 355 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
249 btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 356 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -459,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb,
459 sb->s_maxbytes = MAX_LFS_FILESIZE; 566 sb->s_maxbytes = MAX_LFS_FILESIZE;
460 sb->s_magic = BTRFS_SUPER_MAGIC; 567 sb->s_magic = BTRFS_SUPER_MAGIC;
461 sb->s_op = &btrfs_super_ops; 568 sb->s_op = &btrfs_super_ops;
569 sb->s_d_op = &btrfs_dentry_operations;
462 sb->s_export_op = &btrfs_export_ops; 570 sb->s_export_op = &btrfs_export_ops;
463 sb->s_xattr = btrfs_xattr_handlers; 571 sb->s_xattr = btrfs_xattr_handlers;
464 sb->s_time_gran = 1; 572 sb->s_time_gran = 1;
@@ -562,12 +670,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
562 670
563static int btrfs_test_super(struct super_block *s, void *data) 671static int btrfs_test_super(struct super_block *s, void *data)
564{ 672{
565 struct btrfs_fs_devices *test_fs_devices = data; 673 struct btrfs_root *test_root = data;
566 struct btrfs_root *root = btrfs_sb(s); 674 struct btrfs_root *root = btrfs_sb(s);
567 675
568 return root->fs_info->fs_devices == test_fs_devices; 676 /*
677 * If this super block is going away, return false as it
678 * can't match as an existing super block.
679 */
680 if (!atomic_read(&s->s_active))
681 return 0;
682 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
569} 683}
570 684
685static int btrfs_set_super(struct super_block *s, void *data)
686{
687 s->s_fs_info = data;
688
689 return set_anon_super(s, data);
690}
691
692
571/* 693/*
572 * Find a superblock for the given device / mount point. 694 * Find a superblock for the given device / mount point.
573 * 695 *
@@ -581,6 +703,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
581 struct super_block *s; 703 struct super_block *s;
582 struct dentry *root; 704 struct dentry *root;
583 struct btrfs_fs_devices *fs_devices = NULL; 705 struct btrfs_fs_devices *fs_devices = NULL;
706 struct btrfs_root *tree_root = NULL;
707 struct btrfs_fs_info *fs_info = NULL;
584 fmode_t mode = FMODE_READ; 708 fmode_t mode = FMODE_READ;
585 char *subvol_name = NULL; 709 char *subvol_name = NULL;
586 u64 subvol_objectid = 0; 710 u64 subvol_objectid = 0;
@@ -608,8 +732,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
608 goto error_close_devices; 732 goto error_close_devices;
609 } 733 }
610 734
735 /*
736 * Setup a dummy root and fs_info for test/set super. This is because
737 * we don't actually fill this stuff out until open_ctree, but we need
738 * it for searching for existing supers, so this lets us do that and
739 * then open_ctree will properly initialize everything later.
740 */
741 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
742 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
743 if (!fs_info || !tree_root) {
744 error = -ENOMEM;
745 goto error_close_devices;
746 }
747 fs_info->tree_root = tree_root;
748 fs_info->fs_devices = fs_devices;
749 tree_root->fs_info = fs_info;
750
611 bdev = fs_devices->latest_bdev; 751 bdev = fs_devices->latest_bdev;
612 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); 752 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
613 if (IS_ERR(s)) 753 if (IS_ERR(s))
614 goto error_s; 754 goto error_s;
615 755
@@ -652,9 +792,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
652 mutex_unlock(&root->d_inode->i_mutex); 792 mutex_unlock(&root->d_inode->i_mutex);
653 793
654 if (IS_ERR(new_root)) { 794 if (IS_ERR(new_root)) {
795 dput(root);
655 deactivate_locked_super(s); 796 deactivate_locked_super(s);
656 error = PTR_ERR(new_root); 797 error = PTR_ERR(new_root);
657 dput(root);
658 goto error_free_subvol_name; 798 goto error_free_subvol_name;
659 } 799 }
660 if (!new_root->d_inode) { 800 if (!new_root->d_inode) {
@@ -675,6 +815,8 @@ error_s:
675 error = PTR_ERR(s); 815 error = PTR_ERR(s);
676error_close_devices: 816error_close_devices:
677 btrfs_close_devices(fs_devices); 817 btrfs_close_devices(fs_devices);
818 kfree(fs_info);
819 kfree(tree_root);
678error_free_subvol_name: 820error_free_subvol_name:
679 kfree(subvol_name); 821 kfree(subvol_name);
680 return ERR_PTR(error); 822 return ERR_PTR(error);
@@ -717,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
717 return 0; 859 return 0;
718} 860}
719 861
862/*
863 * The helper to calc the free space on the devices that can be used to store
864 * file data.
865 */
866static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
867{
868 struct btrfs_fs_info *fs_info = root->fs_info;
869 struct btrfs_device_info *devices_info;
870 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
871 struct btrfs_device *device;
872 u64 skip_space;
873 u64 type;
874 u64 avail_space;
875 u64 used_space;
876 u64 min_stripe_size;
877 int min_stripes = 1;
878 int i = 0, nr_devices;
879 int ret;
880
881 nr_devices = fs_info->fs_devices->rw_devices;
882 BUG_ON(!nr_devices);
883
884 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
885 GFP_NOFS);
886 if (!devices_info)
887 return -ENOMEM;
888
889 /* calc min stripe number for data space allocation */
890 type = btrfs_get_alloc_profile(root, 1);
891 if (type & BTRFS_BLOCK_GROUP_RAID0)
892 min_stripes = 2;
893 else if (type & BTRFS_BLOCK_GROUP_RAID1)
894 min_stripes = 2;
895 else if (type & BTRFS_BLOCK_GROUP_RAID10)
896 min_stripes = 4;
897
898 if (type & BTRFS_BLOCK_GROUP_DUP)
899 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
900 else
901 min_stripe_size = BTRFS_STRIPE_LEN;
902
903 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
904 if (!device->in_fs_metadata)
905 continue;
906
907 avail_space = device->total_bytes - device->bytes_used;
908
909 /* align with stripe_len */
910 do_div(avail_space, BTRFS_STRIPE_LEN);
911 avail_space *= BTRFS_STRIPE_LEN;
912
913 /*
914 * In order to avoid overwriting the superblock on the drive,
915 * btrfs starts at an offset of at least 1MB when doing chunk
916 * allocation.
917 */
918 skip_space = 1024 * 1024;
919
920 /* user can set the offset in fs_info->alloc_start. */
921 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
922 device->total_bytes)
923 skip_space = max(fs_info->alloc_start, skip_space);
924
925 /*
926 * btrfs can not use the free space in [0, skip_space - 1],
927 * we must subtract it from the total. In order to implement
928 * it, we account the used space in this range first.
929 */
930 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
931 &used_space);
932 if (ret) {
933 kfree(devices_info);
934 return ret;
935 }
936
937 /* calc the free space in [0, skip_space - 1] */
938 skip_space -= used_space;
939
940 /*
941 * we can not use the free space in [0, skip_space - 1], so
942 * subtract it from the total.
943 */
944 if (avail_space && avail_space >= skip_space)
945 avail_space -= skip_space;
946 else
947 avail_space = 0;
948
949 if (avail_space < min_stripe_size)
950 continue;
951
952 devices_info[i].dev = device;
953 devices_info[i].max_avail = avail_space;
954
955 i++;
956 }
957
958 nr_devices = i;
959
960 btrfs_descending_sort_devices(devices_info, nr_devices);
961
962 i = nr_devices - 1;
963 avail_space = 0;
964 while (nr_devices >= min_stripes) {
965 if (devices_info[i].max_avail >= min_stripe_size) {
966 int j;
967 u64 alloc_size;
968
969 avail_space += devices_info[i].max_avail * min_stripes;
970 alloc_size = devices_info[i].max_avail;
971 for (j = i + 1 - min_stripes; j <= i; j++)
972 devices_info[j].max_avail -= alloc_size;
973 }
974 i--;
975 nr_devices--;
976 }
977
978 kfree(devices_info);
979 *free_bytes = avail_space;
980 return 0;
981}
982
720static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 983static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
721{ 984{
722 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 985 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -724,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
724 struct list_head *head = &root->fs_info->space_info; 987 struct list_head *head = &root->fs_info->space_info;
725 struct btrfs_space_info *found; 988 struct btrfs_space_info *found;
726 u64 total_used = 0; 989 u64 total_used = 0;
727 u64 total_used_data = 0; 990 u64 total_free_data = 0;
728 int bits = dentry->d_sb->s_blocksize_bits; 991 int bits = dentry->d_sb->s_blocksize_bits;
729 __be32 *fsid = (__be32 *)root->fs_info->fsid; 992 __be32 *fsid = (__be32 *)root->fs_info->fsid;
993 int ret;
730 994
995 /* holding chunk_mutex to avoid allocating new chunks */
996 mutex_lock(&root->fs_info->chunk_mutex);
731 rcu_read_lock(); 997 rcu_read_lock();
732 list_for_each_entry_rcu(found, head, list) { 998 list_for_each_entry_rcu(found, head, list) {
733 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | 999 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
734 BTRFS_BLOCK_GROUP_SYSTEM)) 1000 total_free_data += found->disk_total - found->disk_used;
735 total_used_data += found->disk_total; 1001 total_free_data -=
736 else 1002 btrfs_account_ro_block_groups_free_space(found);
737 total_used_data += found->disk_used; 1003 }
1004
738 total_used += found->disk_used; 1005 total_used += found->disk_used;
739 } 1006 }
740 rcu_read_unlock(); 1007 rcu_read_unlock();
@@ -742,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
742 buf->f_namelen = BTRFS_NAME_LEN; 1009 buf->f_namelen = BTRFS_NAME_LEN;
743 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1010 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
744 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1011 buf->f_bfree = buf->f_blocks - (total_used >> bits);
745 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
746 buf->f_bsize = dentry->d_sb->s_blocksize; 1012 buf->f_bsize = dentry->d_sb->s_blocksize;
747 buf->f_type = BTRFS_SUPER_MAGIC; 1013 buf->f_type = BTRFS_SUPER_MAGIC;
1014 buf->f_bavail = total_free_data;
1015 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1016 if (ret) {
1017 mutex_unlock(&root->fs_info->chunk_mutex);
1018 return ret;
1019 }
1020 buf->f_bavail += total_free_data;
1021 buf->f_bavail = buf->f_bavail >> bits;
1022 mutex_unlock(&root->fs_info->chunk_mutex);
748 1023
749 /* We treat it as constant endianness (it doesn't matter _which_) 1024 /* We treat it as constant endianness (it doesn't matter _which_)
750 because we want the fsid to come out the same whether mounted 1025 because we want the fsid to come out the same whether mounted
@@ -861,10 +1136,14 @@ static int __init init_btrfs_fs(void)
861 if (err) 1136 if (err)
862 return err; 1137 return err;
863 1138
864 err = btrfs_init_cachep(); 1139 err = btrfs_init_compress();
865 if (err) 1140 if (err)
866 goto free_sysfs; 1141 goto free_sysfs;
867 1142
1143 err = btrfs_init_cachep();
1144 if (err)
1145 goto free_compress;
1146
868 err = extent_io_init(); 1147 err = extent_io_init();
869 if (err) 1148 if (err)
870 goto free_cachep; 1149 goto free_cachep;
@@ -892,6 +1171,8 @@ free_extent_io:
892 extent_io_exit(); 1171 extent_io_exit();
893free_cachep: 1172free_cachep:
894 btrfs_destroy_cachep(); 1173 btrfs_destroy_cachep();
1174free_compress:
1175 btrfs_exit_compress();
895free_sysfs: 1176free_sysfs:
896 btrfs_exit_sysfs(); 1177 btrfs_exit_sysfs();
897 return err; 1178 return err;
@@ -906,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
906 unregister_filesystem(&btrfs_fs_type); 1187 unregister_filesystem(&btrfs_fs_type);
907 btrfs_exit_sysfs(); 1188 btrfs_exit_sysfs();
908 btrfs_cleanup_fs_uuids(); 1189 btrfs_cleanup_fs_uuids();
909 btrfs_zlib_exit(); 1190 btrfs_exit_compress();
910} 1191}
911 1192
912module_init(init_btrfs_fs) 1193module_init(init_btrfs_fs)
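
The tail of btrfs_calc_avail_data_space() is a greedy pass over the devices, sorted by free space in descending order, counting how much data the min_stripes-wide allocations can still cover. A toy model of just that loop; sizes are in illustrative GiB units and the BTRFS_STRIPE_LEN alignment is omitted:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t max_avail[] = { 6, 4, 3 };	/* GiB free, sorted descending */
	uint64_t min_stripe_size = 1, avail_space = 0;
	int nr_devices = 3, min_stripes = 2, i, j;

	i = nr_devices - 1;
	while (nr_devices >= min_stripes) {
		if (max_avail[i] >= min_stripe_size) {
			/* the smallest device bounds this stripe set */
			uint64_t alloc_size = max_avail[i];

			avail_space += alloc_size * min_stripes;
			for (j = i + 1 - min_stripes; j <= i; j++)
				max_avail[j] -= alloc_size;
		}
		i--;
		nr_devices--;
	}
	printf("allocatable data space: %llu GiB\n",
	       (unsigned long long)avail_space);
	return 0;
}
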
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1fffbc017bdf..bae5c7b8bbe2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
183 int ret; 183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
184again: 187again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 189 if (!h)
@@ -902,6 +905,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
902 struct btrfs_root *root = pending->root; 905 struct btrfs_root *root = pending->root;
903 struct btrfs_root *parent_root; 906 struct btrfs_root *parent_root;
904 struct inode *parent_inode; 907 struct inode *parent_inode;
908 struct dentry *parent;
905 struct dentry *dentry; 909 struct dentry *dentry;
906 struct extent_buffer *tmp; 910 struct extent_buffer *tmp;
907 struct extent_buffer *old; 911 struct extent_buffer *old;
@@ -909,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
909 u64 to_reserve = 0; 913 u64 to_reserve = 0;
910 u64 index = 0; 914 u64 index = 0;
911 u64 objectid; 915 u64 objectid;
916 u64 root_flags;
912 917
913 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 918 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
914 if (!new_root_item) { 919 if (!new_root_item) {
@@ -941,7 +946,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
941 trans->block_rsv = &pending->block_rsv; 946 trans->block_rsv = &pending->block_rsv;
942 947
943 dentry = pending->dentry; 948 dentry = pending->dentry;
944 parent_inode = dentry->d_parent->d_inode; 949 parent = dget_parent(dentry);
950 parent_inode = parent->d_inode;
945 parent_root = BTRFS_I(parent_inode)->root; 951 parent_root = BTRFS_I(parent_inode)->root;
946 record_root_in_trans(trans, parent_root); 952 record_root_in_trans(trans, parent_root);
947 953
@@ -965,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
965 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
966 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
967 973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
980
968 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
969 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
970 btrfs_set_lock_blocking(old); 983 btrfs_set_lock_blocking(old);
@@ -989,6 +1002,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
989 parent_inode->i_ino, index, 1002 parent_inode->i_ino, index,
990 dentry->d_name.name, dentry->d_name.len); 1003 dentry->d_name.name, dentry->d_name.len);
991 BUG_ON(ret); 1004 BUG_ON(ret);
1005 dput(parent);
992 1006
993 key.offset = (u64)-1; 1007 key.offset = (u64)-1;
994 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1008 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
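
create_pending_snapshot() now copies the source root item and flips a single flag word according to pending->readonly. The bit manipulation in isolation; the flag value below is an assumption for illustration, not necessarily BTRFS_ROOT_SUBVOL_RDONLY's real value:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ROOT_SUBVOL_RDONLY (1ULL << 0)	/* assumed flag value */

static uint64_t apply_readonly(uint64_t root_flags, bool readonly)
{
	if (readonly)
		root_flags |= ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~ROOT_SUBVOL_RDONLY;
	return root_flags;
}

int main(void)
{
	printf("%llu %llu\n",
	       (unsigned long long)apply_readonly(0, true),
	       (unsigned long long)apply_readonly(ROOT_SUBVOL_RDONLY, false));
	return 0;
}
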
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */ 63 /* extra metadata reservation for relocation */
64 int error; 64 int error;
65 bool readonly;
65 struct list_head list; 66 struct list_head list;
66}; 67};
67 68
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a29f19384a27..054744ac5719 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2869{ 2869{
2870 int ret = 0; 2870 int ret = 0;
2871 struct btrfs_root *root; 2871 struct btrfs_root *root;
2872 struct dentry *old_parent = NULL;
2872 2873
2873 /* 2874 /*
2874 * for regular files, if its inode is already on disk, we don't 2875 * for regular files, if its inode is already on disk, we don't
@@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2910 if (IS_ROOT(parent)) 2911 if (IS_ROOT(parent))
2911 break; 2912 break;
2912 2913
2913 parent = parent->d_parent; 2914 parent = dget_parent(parent);
2915 dput(old_parent);
2916 old_parent = parent;
2914 inode = parent->d_inode; 2917 inode = parent->d_inode;
2915 2918
2916 } 2919 }
2920 dput(old_parent);
2917out: 2921out:
2918 return ret; 2922 return ret;
2919} 2923}
@@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2945{ 2949{
2946 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2950 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2947 struct super_block *sb; 2951 struct super_block *sb;
2952 struct dentry *old_parent = NULL;
2948 int ret = 0; 2953 int ret = 0;
2949 u64 last_committed = root->fs_info->last_trans_committed; 2954 u64 last_committed = root->fs_info->last_trans_committed;
2950 2955
@@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3016 if (IS_ROOT(parent)) 3021 if (IS_ROOT(parent))
3017 break; 3022 break;
3018 3023
3019 parent = parent->d_parent; 3024 parent = dget_parent(parent);
3025 dput(old_parent);
3026 old_parent = parent;
3020 } 3027 }
3021 ret = 0; 3028 ret = 0;
3022end_trans: 3029end_trans:
3030 dput(old_parent);
3023 if (ret < 0) { 3031 if (ret < 0) {
3024 BUG_ON(ret != -ENOSPC); 3032 BUG_ON(ret != -ENOSPC);
3025 root->fs_info->last_trans_log_full_commit = trans->transid; 3033 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3039,8 +3047,13 @@ end_no_trans:
3039int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3047int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root, struct dentry *dentry) 3048 struct btrfs_root *root, struct dentry *dentry)
3041{ 3049{
3042 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 3050 struct dentry *parent = dget_parent(dentry);
3043 dentry->d_parent, 0); 3051 int ret;
3052
3053 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3054 dput(parent);
3055
3056 return ret;
3044} 3057}
3045 3058
3046/* 3059/*
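
Every tree-log change above follows one pattern: replace a bare d_parent dereference with dget_parent(), and drop the previous hold as the loop climbs toward the root. A user-space model of that reference discipline, with plain counters standing in for dentry refcounts:

#include <stdio.h>

struct dentry {
	struct dentry *d_parent;
	int refs;
	const char *name;
};

static struct dentry *dget_parent(struct dentry *d)
{
	d->d_parent->refs++;		/* take a stable hold on the parent */
	return d->d_parent;
}

static void dput(struct dentry *d)
{
	if (d)
		d->refs--;
}

int main(void)
{
	struct dentry root = { &root, 1, "/" };
	struct dentry a = { &root, 1, "a" };
	struct dentry b = { &a, 1, "b" };
	struct dentry *parent = &b, *old_parent = NULL;

	while (parent != &root) {	/* IS_ROOT() in the kernel */
		parent = dget_parent(parent);
		dput(old_parent);	/* release the hold from last round */
		old_parent = parent;
		printf("visiting %s (refs=%d)\n", parent->name, parent->refs);
	}
	dput(old_parent);		/* drop the final hold */
	printf("a.refs=%d root.refs=%d\n", a.refs, root.refs);
	return 0;
}
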
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d6..d158530233b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -412,12 +413,16 @@ static noinline int device_list_add(const char *path,
412 413
413 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
414 fs_devices->num_devices++; 415 fs_devices->num_devices++;
415 } else if (strcmp(device->name, path)) { 416 } else if (!device->name || strcmp(device->name, path)) {
416 name = kstrdup(path, GFP_NOFS); 417 name = kstrdup(path, GFP_NOFS);
417 if (!name) 418 if (!name)
418 return -ENOMEM; 419 return -ENOMEM;
419 kfree(device->name); 420 kfree(device->name);
420 device->name = name; 421 device->name = name;
422 if (device->missing) {
423 fs_devices->missing_devices--;
424 device->missing = 0;
425 }
421 } 426 }
422 427
423 if (found_transid > fs_devices->latest_trans) { 428 if (found_transid > fs_devices->latest_trans) {
@@ -489,7 +494,7 @@ again:
489 continue; 494 continue;
490 495
491 if (device->bdev) { 496 if (device->bdev) {
492 close_bdev_exclusive(device->bdev, device->mode); 497 blkdev_put(device->bdev, device->mode);
493 device->bdev = NULL; 498 device->bdev = NULL;
494 fs_devices->open_devices--; 499 fs_devices->open_devices--;
495 } 500 }
@@ -523,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
523 528
524 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
525 if (device->bdev) { 530 if (device->bdev) {
526 close_bdev_exclusive(device->bdev, device->mode); 531 blkdev_put(device->bdev, device->mode);
527 fs_devices->open_devices--; 532 fs_devices->open_devices--;
528 } 533 }
529 if (device->writeable) { 534 if (device->writeable) {
@@ -580,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
580 int seeding = 1; 585 int seeding = 1;
581 int ret = 0; 586 int ret = 0;
582 587
588 flags |= FMODE_EXCL;
589
583 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
584 if (device->bdev) 591 if (device->bdev)
585 continue; 592 continue;
586 if (!device->name) 593 if (!device->name)
587 continue; 594 continue;
588 595
589 bdev = open_bdev_exclusive(device->name, flags, holder); 596 bdev = blkdev_get_by_path(device->name, flags, holder);
590 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
591 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
592 goto error; 599 goto error;
@@ -594,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
594 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
595 602
596 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
597 if (!bh) 604 if (!bh) {
605 ret = -EINVAL;
598 goto error_close; 606 goto error_close;
607 }
599 608
600 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
601 devid = btrfs_stack_device_id(&disk_super->dev_item); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -638,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
638error_brelse: 647error_brelse:
639 brelse(bh); 648 brelse(bh);
640error_close: 649error_close:
641 close_bdev_exclusive(bdev, FMODE_READ); 650 blkdev_put(bdev, flags);
642error: 651error:
643 continue; 652 continue;
644 } 653 }
@@ -684,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
684 693
685 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
686 695
687 bdev = open_bdev_exclusive(path, flags, holder); 696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
688 698
689 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
690 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -696,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
696 goto error_close; 706 goto error_close;
697 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
698 if (!bh) { 708 if (!bh) {
699 ret = -EIO; 709 ret = -EINVAL;
700 goto error_close; 710 goto error_close;
701 } 711 }
702 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -716,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
716 726
717 brelse(bh); 727 brelse(bh);
718error_close: 728error_close:
719 close_bdev_exclusive(bdev, flags); 729 blkdev_put(bdev, flags);
720error: 730error:
721 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
722 return ret; 732 return ret;
723} 733}
724 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
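
btrfs_account_dev_extents_size() accumulates, per device extent, its overlap with the inclusive range [start, end]; the four-way if/else chain above is a clamped interval intersection. The same arithmetic as one self-contained function, with made-up extents:

#include <stdint.h>
#include <stdio.h>

/* bytes of [off, off+len) that fall in the inclusive range [start, end] */
static uint64_t overlap(uint64_t off, uint64_t len,
			uint64_t start, uint64_t end)
{
	uint64_t extent_end = off + len;
	uint64_t lo = off > start ? off : start;
	uint64_t hi = extent_end < end + 1 ? extent_end : end + 1;

	return hi > lo ? hi - lo : 0;
}

int main(void)
{
	/* device extents: (offset, length); range under test: [0, 1MB - 1] */
	uint64_t start = 0, end = (1 << 20) - 1;

	printf("%llu\n", (unsigned long long)overlap(0, 4096, start, end));
	printf("%llu\n", (unsigned long long)overlap((1 << 20) - 4096, 8192,
						     start, end));
	printf("%llu\n", (unsigned long long)overlap(2 << 20, 4096,
						     start, end));
	return 0;
}
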
725/* 819/*
820 * find_free_dev_extent - find free space in the specified device
821 * @trans: transaction handle
822 * @device: the device in which we search for free space
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
825 * @len: the size of the free space that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
726 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
727 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
728 * of extents 830 * of extents
831 *
832 * @start is used to store the start of the free space if we find. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
729 */ 839 */
730int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
731 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
732 u64 *start, u64 *max_avail) 842 u64 *start, u64 *len)
733{ 843{
734 struct btrfs_key key; 844 struct btrfs_key key;
735 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
736 struct btrfs_dev_extent *dev_extent = NULL; 846 struct btrfs_dev_extent *dev_extent;
737 struct btrfs_path *path; 847 struct btrfs_path *path;
738 u64 hole_size = 0; 848 u64 hole_size;
739 u64 last_byte = 0; 849 u64 max_hole_start;
740 u64 search_start = 0; 850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
741 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
742 int ret; 854 int ret;
743 int slot = 0; 855 int slot;
744 int start_found;
745 struct extent_buffer *l; 856 struct extent_buffer *l;
746 857
747 path = btrfs_alloc_path();
748 if (!path)
749 return -ENOMEM;
750 path->reada = 2;
751 start_found = 0;
752
753 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
754 859
755 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
756 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
757 */ 862 */
758 search_start = max((u64)1024 * 1024, search_start); 863 search_start = 1024 * 1024;
759 864
760 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 865 if (root->fs_info->alloc_start + num_bytes <= search_end)
761 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
762 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
763 key.objectid = device->devid; 883 key.objectid = device->devid;
764 key.offset = search_start; 884 key.offset = search_start;
765 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
766 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
767 if (ret < 0) 888 if (ret < 0)
768 goto error; 889 goto out;
769 if (ret > 0) { 890 if (ret > 0) {
770 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
771 if (ret < 0) 892 if (ret < 0)
772 goto error; 893 goto out;
773 if (ret > 0)
774 start_found = 1;
775 } 894 }
776 l = path->nodes[0]; 895
777 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
778 while (1) { 896 while (1) {
779 l = path->nodes[0]; 897 l = path->nodes[0];
780 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -783,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
783 if (ret == 0) 901 if (ret == 0)
784 continue; 902 continue;
785 if (ret < 0) 903 if (ret < 0)
786 goto error; 904 goto out;
787no_more_items: 905
788 if (!start_found) { 906 break;
789 if (search_start >= search_end) {
790 ret = -ENOSPC;
791 goto error;
792 }
793 *start = search_start;
794 start_found = 1;
795 goto check_pending;
796 }
797 *start = last_byte > search_start ?
798 last_byte : search_start;
799 if (search_end <= *start) {
800 ret = -ENOSPC;
801 goto error;
802 }
803 goto check_pending;
804 } 907 }
805 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
806 909
@@ -808,48 +911,62 @@ no_more_items:
808 goto next; 911 goto next;
809 912
810 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
811 goto no_more_items; 914 break;
812 915
813 if (key.offset >= search_start && key.offset > last_byte && 916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
814 start_found) { 917 goto next;
815 if (last_byte < search_start)
816 last_byte = search_start;
817 hole_size = key.offset - last_byte;
818 918
819 if (hole_size > *max_avail) 919 if (key.offset > search_start) {
820 *max_avail = hole_size; 920 hole_size = key.offset - search_start;
921
922 if (hole_size > max_hole_size) {
923 max_hole_start = search_start;
924 max_hole_size = hole_size;
925 }
821 926
822 if (key.offset > last_byte && 927 /*
823 hole_size >= num_bytes) { 928 * If this free space is greater than what we need,
824 *start = last_byte; 929 * it must be the max free space that we have found
825 goto check_pending; 930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
826 } 939 }
827 } 940 }
828 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
829 goto next;
830 941
831 start_found = 1;
832 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
833 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
834next: 947next:
835 path->slots[0]++; 948 path->slots[0]++;
836 cond_resched(); 949 cond_resched();
837 } 950 }
838check_pending:
839 /* we have to make sure we didn't find an extent that has already
840 * been allocated by the map tree or the original allocation
841 */
842 BUG_ON(*start < search_start);
843 951
844 if (*start + num_bytes > search_end) { 952 hole_size = search_end - search_start;
845 ret = -ENOSPC; 953 if (hole_size > max_hole_size) {
846 goto error; 954 max_hole_start = search_start;
955 max_hole_size = hole_size;
847 } 956 }
848 /* check for pending inserts here */
849 ret = 0;
850 957
851error: 958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
852 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
853 return ret; 970 return ret;
854} 971}
855 972
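
The rewritten find_free_dev_extent() keeps two running values while it scans the device extents in key order: the search cursor and the best hole seen so far, so the caller gets the max hole even when the function returns -ENOSPC. The core scan in miniature; extents are assumed sorted by offset and all sizes are illustrative:

#include <stdint.h>
#include <stdio.h>

struct dev_extent { uint64_t offset, len; };

/* *start / *len always describe the largest hole found, as in the patch */
static int find_free(const struct dev_extent *ext, int n, uint64_t total,
		     uint64_t num_bytes, uint64_t *start, uint64_t *len)
{
	uint64_t search_start = 1 << 20;	/* skip the first 1MB */
	uint64_t max_hole_start = search_start, max_hole_size = 0;
	uint64_t hole;
	int i;

	for (i = 0; i < n; i++) {
		if (ext[i].offset > search_start) {
			hole = ext[i].offset - search_start;
			if (hole > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole;
			}
			if (hole >= num_bytes)	/* big enough: stop early */
				goto out;
		}
		if (ext[i].offset + ext[i].len > search_start)
			search_start = ext[i].offset + ext[i].len;
	}
	hole = total - search_start;		/* the trailing hole */
	if (hole > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = hole;
	}
out:
	*start = max_hole_start;
	*len = max_hole_size;
	return max_hole_size >= num_bytes ? 0 : -1;
}

int main(void)
{
	struct dev_extent ext[] = { { 1 << 20, 4 << 20 }, { 8 << 20, 2 << 20 } };
	uint64_t start, len;
	int ret = find_free(ext, 2, 16 << 20, 2 << 20, &start, &len);

	printf("ret=%d start=%lluMB len=%lluMB\n", ret,
	       (unsigned long long)(start >> 20),
	       (unsigned long long)(len >> 20));
	return 0;
}
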
@@ -1179,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1179 goto out; 1296 goto out;
1180 } 1297 }
1181 } else { 1298 } else {
1182 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1299 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1183 root->fs_info->bdev_holder); 1300 root->fs_info->bdev_holder);
1184 if (IS_ERR(bdev)) { 1301 if (IS_ERR(bdev)) {
1185 ret = PTR_ERR(bdev); 1302 ret = PTR_ERR(bdev);
1186 goto out; 1303 goto out;
@@ -1189,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1189 set_blocksize(bdev, 4096); 1306 set_blocksize(bdev, 4096);
1190 bh = btrfs_read_dev_super(bdev); 1307 bh = btrfs_read_dev_super(bdev);
1191 if (!bh) { 1308 if (!bh) {
1192 ret = -EIO; 1309 ret = -EINVAL;
1193 goto error_close; 1310 goto error_close;
1194 } 1311 }
1195 disk_super = (struct btrfs_super_block *)bh->b_data; 1312 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1236,6 +1353,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1236 1353
1237 device->fs_devices->num_devices--; 1354 device->fs_devices->num_devices--;
1238 1355
1356 if (device->missing)
1357 root->fs_info->fs_devices->missing_devices--;
1358
1239 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1359 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1240 struct btrfs_device, dev_list); 1360 struct btrfs_device, dev_list);
1241 if (device->bdev == root->fs_info->sb->s_bdev) 1361 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -1244,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1244 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1364 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1245 1365
1246 if (device->bdev) { 1366 if (device->bdev) {
1247 close_bdev_exclusive(device->bdev, device->mode); 1367 blkdev_put(device->bdev, device->mode);
1248 device->bdev = NULL; 1368 device->bdev = NULL;
1249 device->fs_devices->open_devices--; 1369 device->fs_devices->open_devices--;
1250 } 1370 }
@@ -1287,7 +1407,7 @@ error_brelse:
1287 brelse(bh); 1407 brelse(bh);
1288error_close: 1408error_close:
1289 if (bdev) 1409 if (bdev)
1290 close_bdev_exclusive(bdev, FMODE_READ); 1410 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1291out: 1411out:
1292 mutex_unlock(&root->fs_info->volume_mutex); 1412 mutex_unlock(&root->fs_info->volume_mutex);
1293 mutex_unlock(&uuid_mutex); 1413 mutex_unlock(&uuid_mutex);
@@ -1439,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1439 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1559 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1440 return -EINVAL; 1560 return -EINVAL;
1441 1561
1442 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1562 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1563 root->fs_info->bdev_holder);
1443 if (IS_ERR(bdev)) 1564 if (IS_ERR(bdev))
1444 return PTR_ERR(bdev); 1565 return PTR_ERR(bdev);
1445 1566
@@ -1565,7 +1686,7 @@ out:
1565 mutex_unlock(&root->fs_info->volume_mutex); 1686 mutex_unlock(&root->fs_info->volume_mutex);
1566 return ret; 1687 return ret;
1567error: 1688error:
1568 close_bdev_exclusive(bdev, 0); 1689 blkdev_put(bdev, FMODE_EXCL);
1569 if (seeding_dev) { 1690 if (seeding_dev) {
1570 mutex_unlock(&uuid_mutex); 1691 mutex_unlock(&uuid_mutex);
1571 up_write(&sb->s_umount); 1692 up_write(&sb->s_umount);
@@ -1905,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1905 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2026 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1906 return -EROFS; 2027 return -EROFS;
1907 2028
2029 if (!capable(CAP_SYS_ADMIN))
2030 return -EPERM;
2031
1908 mutex_lock(&dev_root->fs_info->volume_mutex); 2032 mutex_lock(&dev_root->fs_info->volume_mutex);
1909 dev_root = dev_root->fs_info->dev_root; 2033 dev_root = dev_root->fs_info->dev_root;
1910 2034
@@ -2143,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2143 return calc_size * num_stripes; 2267 return calc_size * num_stripes;
2144} 2268}
2145 2269
2146static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2270/* Used to sort the devices by max_avail (descending sort) */
2147 struct btrfs_root *extent_root, 2271int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2148 struct map_lookup **map_ret,
2149 u64 *num_bytes, u64 *stripe_size,
2150 u64 start, u64 type)
2151{ 2272{
2152 struct btrfs_fs_info *info = extent_root->fs_info; 2273 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2153 struct btrfs_device *device = NULL; 2274 ((struct btrfs_device_info *)dev_info2)->max_avail)
2154 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2275 return -1;
2155 struct list_head *cur; 2276 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2156 struct map_lookup *map = NULL; 2277 ((struct btrfs_device_info *)dev_info2)->max_avail)
2157 struct extent_map_tree *em_tree; 2278 return 1;
2158 struct extent_map *em; 2279 else
2159 struct list_head private_devs; 2280 return 0;
2160 int min_stripe_size = 1 * 1024 * 1024; 2281}
2161 u64 calc_size = 1024 * 1024 * 1024;
2162 u64 max_chunk_size = calc_size;
2163 u64 min_free;
2164 u64 avail;
2165 u64 max_avail = 0;
2166 u64 dev_offset;
2167 int num_stripes = 1;
2168 int min_stripes = 1;
2169 int sub_stripes = 0;
2170 int looped = 0;
2171 int ret;
2172 int index;
2173 int stripe_len = 64 * 1024;
2174 2282
2175 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2283static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2176 (type & BTRFS_BLOCK_GROUP_DUP)) { 2284 int *num_stripes, int *min_stripes,
2177 WARN_ON(1); 2285 int *sub_stripes)
2178 type &= ~BTRFS_BLOCK_GROUP_DUP; 2286{
2179 } 2287 *num_stripes = 1;
2180 if (list_empty(&fs_devices->alloc_list)) 2288 *min_stripes = 1;
2181 return -ENOSPC; 2289 *sub_stripes = 0;
2182 2290
2183 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2291 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2184 num_stripes = fs_devices->rw_devices; 2292 *num_stripes = fs_devices->rw_devices;
2185 min_stripes = 2; 2293 *min_stripes = 2;
2186 } 2294 }
2187 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2295 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2188 num_stripes = 2; 2296 *num_stripes = 2;
2189 min_stripes = 2; 2297 *min_stripes = 2;
2190 } 2298 }
2191 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2299 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2192 if (fs_devices->rw_devices < 2) 2300 if (fs_devices->rw_devices < 2)
2193 return -ENOSPC; 2301 return -ENOSPC;
2194 num_stripes = 2; 2302 *num_stripes = 2;
2195 min_stripes = 2; 2303 *min_stripes = 2;
2196 } 2304 }
2197 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2305 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2198 num_stripes = fs_devices->rw_devices; 2306 *num_stripes = fs_devices->rw_devices;
2199 if (num_stripes < 4) 2307 if (*num_stripes < 4)
2200 return -ENOSPC; 2308 return -ENOSPC;
2201 num_stripes &= ~(u32)1; 2309 *num_stripes &= ~(u32)1;
2202 sub_stripes = 2; 2310 *sub_stripes = 2;
2203 min_stripes = 4; 2311 *min_stripes = 4;
2204 } 2312 }
2205 2313
2314 return 0;
2315}
2316
2317static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2318 u64 proposed_size, u64 type,
2319 int num_stripes, int small_stripe)
2320{
2321 int min_stripe_size = 1 * 1024 * 1024;
2322 u64 calc_size = proposed_size;
2323 u64 max_chunk_size = calc_size;
2324 int ncopies = 1;
2325
2326 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2327 BTRFS_BLOCK_GROUP_DUP |
2328 BTRFS_BLOCK_GROUP_RAID10))
2329 ncopies = 2;
2330
2206 if (type & BTRFS_BLOCK_GROUP_DATA) { 2331 if (type & BTRFS_BLOCK_GROUP_DATA) {
2207 max_chunk_size = 10 * calc_size; 2332 max_chunk_size = 10 * calc_size;
2208 min_stripe_size = 64 * 1024 * 1024; 2333 min_stripe_size = 64 * 1024 * 1024;
@@ -2219,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2219 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2344 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2220 max_chunk_size); 2345 max_chunk_size);
2221 2346
2222again: 2347 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2223 max_avail = 0; 2348 calc_size = max_chunk_size * ncopies;
2224 if (!map || map->num_stripes != num_stripes) {
2225 kfree(map);
2226 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2227 if (!map)
2228 return -ENOMEM;
2229 map->num_stripes = num_stripes;
2230 }
2231
2232 if (calc_size * num_stripes > max_chunk_size) {
2233 calc_size = max_chunk_size;
2234 do_div(calc_size, num_stripes); 2349 do_div(calc_size, num_stripes);
2235 do_div(calc_size, stripe_len); 2350 do_div(calc_size, BTRFS_STRIPE_LEN);
2236 calc_size *= stripe_len; 2351 calc_size *= BTRFS_STRIPE_LEN;
2237 } 2352 }
2238 2353
2239 /* we don't want tiny stripes */ 2354 /* we don't want tiny stripes */
2240 if (!looped) 2355 if (!small_stripe)
2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2356 calc_size = max_t(u64, min_stripe_size, calc_size);
2242 2357
2243 /* 2358 /*
2244 * we're about to do_div by the stripe_len so lets make sure 2359 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2245 * we end up with something bigger than a stripe 2360 * we end up with something bigger than a stripe
2246 */ 2361 */
2247 calc_size = max_t(u64, calc_size, stripe_len * 4); 2362 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2363
2364 do_div(calc_size, BTRFS_STRIPE_LEN);
2365 calc_size *= BTRFS_STRIPE_LEN;
2366
2367 return calc_size;
2368}
2248 2369
2249 do_div(calc_size, stripe_len); 2370static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2250 calc_size *= stripe_len; 2371 int num_stripes)
2372{
2373 struct map_lookup *new;
2374 size_t len = map_lookup_size(num_stripes);
2375
2376 BUG_ON(map->num_stripes < num_stripes);
2377
2378 if (map->num_stripes == num_stripes)
2379 return map;
2380
2381 new = kmalloc(len, GFP_NOFS);
2382 if (!new) {
2383 /* just change map->num_stripes */
2384 map->num_stripes = num_stripes;
2385 return map;
2386 }
2387
2388 memcpy(new, map, len);
2389 new->num_stripes = num_stripes;
2390 kfree(map);
2391 return new;
2392}
2393
2394/*
2395 * helper to allocate device space from btrfs_device_info, in which we store
2396 * the max free space information of every device. It is used when we can not
2397 * allocate chunks of the default size.
2398 *
2399 * With this helper, we can allocate a new chunk as large as possible.
2400 */
2401static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2402 struct btrfs_fs_devices *fs_devices,
2403 struct btrfs_device_info *devices,
2404 int nr_device, u64 type,
2405 struct map_lookup **map_lookup,
2406 int min_stripes, u64 *stripe_size)
2407{
2408 int i, index, sort_again = 0;
2409 int min_devices = min_stripes;
2410 u64 max_avail, min_free;
2411 struct map_lookup *map = *map_lookup;
2412 int ret;
2413
2414 if (nr_device < min_stripes)
2415 return -ENOSPC;
2416
2417 btrfs_descending_sort_devices(devices, nr_device);
2418
2419 max_avail = devices[0].max_avail;
2420 if (!max_avail)
2421 return -ENOSPC;
2422
2423 for (i = 0; i < nr_device; i++) {
2424 /*
2425 * if dev_offset == 0, it means the free space of this device
2426 * is less than what we need, and we didn't search for the max avail
2427 * extent on this device, so do it now.
2428 */
2429 if (!devices[i].dev_offset) {
2430 ret = find_free_dev_extent(trans, devices[i].dev,
2431 max_avail,
2432 &devices[i].dev_offset,
2433 &devices[i].max_avail);
2434 if (ret != 0 && ret != -ENOSPC)
2435 return ret;
2436 sort_again = 1;
2437 }
2438 }
2439
 2440 	/* we updated the max avail free extent of each device, so sort again */
2441 if (sort_again)
2442 btrfs_descending_sort_devices(devices, nr_device);
2443
2444 if (type & BTRFS_BLOCK_GROUP_DUP)
2445 min_devices = 1;
2446
2447 if (!devices[min_devices - 1].max_avail)
2448 return -ENOSPC;
2449
2450 max_avail = devices[min_devices - 1].max_avail;
2451 if (type & BTRFS_BLOCK_GROUP_DUP)
2452 do_div(max_avail, 2);
2453
2454 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2455 min_stripes, 1);
2456 if (type & BTRFS_BLOCK_GROUP_DUP)
2457 min_free = max_avail * 2;
2458 else
2459 min_free = max_avail;
2460
2461 if (min_free > devices[min_devices - 1].max_avail)
2462 return -ENOSPC;
2463
2464 map = __shrink_map_lookup_stripes(map, min_stripes);
2465 *stripe_size = max_avail;
2466
2467 index = 0;
2468 for (i = 0; i < min_stripes; i++) {
2469 map->stripes[i].dev = devices[index].dev;
2470 map->stripes[i].physical = devices[index].dev_offset;
2471 if (type & BTRFS_BLOCK_GROUP_DUP) {
2472 i++;
2473 map->stripes[i].dev = devices[index].dev;
2474 map->stripes[i].physical = devices[index].dev_offset +
2475 max_avail;
2476 }
2477 index++;
2478 }
2479 *map_lookup = map;
2480
2481 return 0;
2482}
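
The selection logic above leans on the descending sort: once the devices are ordered by max_avail, devices[min_devices - 1].max_avail is the largest per-device size that every one of the first min_devices devices can still satisfy (DUP halves it, since both copies land on one device). A toy userspace illustration of that bound, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* toy model: free space per device, already sorted descending as
 * btrfs_descending_sort_devices() would leave it */
static uint64_t max_avail[] = { 900, 700, 300, 100 };

int main(void)
{
	int min_devices = 2;	/* e.g. RAID0 with min_stripes == 2 */

	/* each of the first min_devices devices can hold at least
	 * this much, so it bounds the per-device stripe size */
	uint64_t stripe = max_avail[min_devices - 1];
	printf("stripe size bounded by %llu\n",
	       (unsigned long long)stripe);	/* prints 700 */
	return 0;
}
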
2483
2484static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2485 struct btrfs_root *extent_root,
2486 struct map_lookup **map_ret,
2487 u64 *num_bytes, u64 *stripe_size,
2488 u64 start, u64 type)
2489{
2490 struct btrfs_fs_info *info = extent_root->fs_info;
2491 struct btrfs_device *device = NULL;
2492 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2493 struct list_head *cur;
2494 struct map_lookup *map;
2495 struct extent_map_tree *em_tree;
2496 struct extent_map *em;
2497 struct btrfs_device_info *devices_info;
2498 struct list_head private_devs;
2499 u64 calc_size = 1024 * 1024 * 1024;
2500 u64 min_free;
2501 u64 avail;
2502 u64 dev_offset;
2503 int num_stripes;
2504 int min_stripes;
2505 int sub_stripes;
2506 int min_devices; /* the min number of devices we need */
2507 int i;
2508 int ret;
2509 int index;
2510
2511 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2512 (type & BTRFS_BLOCK_GROUP_DUP)) {
2513 WARN_ON(1);
2514 type &= ~BTRFS_BLOCK_GROUP_DUP;
2515 }
2516 if (list_empty(&fs_devices->alloc_list))
2517 return -ENOSPC;
2518
2519 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2520 &min_stripes, &sub_stripes);
2521 if (ret)
2522 return ret;
2523
2524 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2525 GFP_NOFS);
2526 if (!devices_info)
2527 return -ENOMEM;
2528
2529 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2530 if (!map) {
2531 ret = -ENOMEM;
2532 goto error;
2533 }
2534 map->num_stripes = num_stripes;
2251 2535
2252 cur = fs_devices->alloc_list.next; 2536 cur = fs_devices->alloc_list.next;
2253 index = 0; 2537 index = 0;
2538 i = 0;
2254 2539
2255 if (type & BTRFS_BLOCK_GROUP_DUP) 2540 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2541 num_stripes, 0);
2542
2543 if (type & BTRFS_BLOCK_GROUP_DUP) {
2256 min_free = calc_size * 2; 2544 min_free = calc_size * 2;
2257 else 2545 min_devices = 1;
2546 } else {
2258 min_free = calc_size; 2547 min_free = calc_size;
2259 2548 min_devices = min_stripes;
2260 /* 2549 }
2261 * we add 1MB because we never use the first 1MB of the device, unless
2262 * we've looped, then we are likely allocating the maximum amount of
2263 * space left already
2264 */
2265 if (!looped)
2266 min_free += 1024 * 1024;
2267 2550
2268 INIT_LIST_HEAD(&private_devs); 2551 INIT_LIST_HEAD(&private_devs);
2269 while (index < num_stripes) { 2552 while (index < num_stripes) {
@@ -2276,27 +2559,39 @@ again:
2276 cur = cur->next; 2559 cur = cur->next;
2277 2560
2278 if (device->in_fs_metadata && avail >= min_free) { 2561 if (device->in_fs_metadata && avail >= min_free) {
2279 ret = find_free_dev_extent(trans, device, 2562 ret = find_free_dev_extent(trans, device, min_free,
2280 min_free, &dev_offset, 2563 &devices_info[i].dev_offset,
2281 &max_avail); 2564 &devices_info[i].max_avail);
2282 if (ret == 0) { 2565 if (ret == 0) {
2283 list_move_tail(&device->dev_alloc_list, 2566 list_move_tail(&device->dev_alloc_list,
2284 &private_devs); 2567 &private_devs);
2285 map->stripes[index].dev = device; 2568 map->stripes[index].dev = device;
2286 map->stripes[index].physical = dev_offset; 2569 map->stripes[index].physical =
2570 devices_info[i].dev_offset;
2287 index++; 2571 index++;
2288 if (type & BTRFS_BLOCK_GROUP_DUP) { 2572 if (type & BTRFS_BLOCK_GROUP_DUP) {
2289 map->stripes[index].dev = device; 2573 map->stripes[index].dev = device;
2290 map->stripes[index].physical = 2574 map->stripes[index].physical =
2291 dev_offset + calc_size; 2575 devices_info[i].dev_offset +
2576 calc_size;
2292 index++; 2577 index++;
2293 } 2578 }
2294 } 2579 } else if (ret != -ENOSPC)
2295 } else if (device->in_fs_metadata && avail > max_avail) 2580 goto error;
2296 max_avail = avail; 2581
2582 devices_info[i].dev = device;
2583 i++;
2584 } else if (device->in_fs_metadata &&
2585 avail >= BTRFS_STRIPE_LEN) {
2586 devices_info[i].dev = device;
2587 devices_info[i].max_avail = avail;
2588 i++;
2589 }
2590
2297 if (cur == &fs_devices->alloc_list) 2591 if (cur == &fs_devices->alloc_list)
2298 break; 2592 break;
2299 } 2593 }
2594
2300 list_splice(&private_devs, &fs_devices->alloc_list); 2595 list_splice(&private_devs, &fs_devices->alloc_list);
2301 if (index < num_stripes) { 2596 if (index < num_stripes) {
2302 if (index >= min_stripes) { 2597 if (index >= min_stripes) {
@@ -2305,34 +2600,36 @@ again:
2305 num_stripes /= sub_stripes; 2600 num_stripes /= sub_stripes;
2306 num_stripes *= sub_stripes; 2601 num_stripes *= sub_stripes;
2307 } 2602 }
2308 looped = 1; 2603
2309 goto again; 2604 map = __shrink_map_lookup_stripes(map, num_stripes);
2310 } 2605 } else if (i >= min_devices) {
2311 if (!looped && max_avail > 0) { 2606 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2312 looped = 1; 2607 devices_info, i, type,
2313 calc_size = max_avail; 2608 &map, min_stripes,
2314 goto again; 2609 &calc_size);
2610 if (ret)
2611 goto error;
2612 } else {
2613 ret = -ENOSPC;
2614 goto error;
2315 } 2615 }
2316 kfree(map);
2317 return -ENOSPC;
2318 } 2616 }
2319 map->sector_size = extent_root->sectorsize; 2617 map->sector_size = extent_root->sectorsize;
2320 map->stripe_len = stripe_len; 2618 map->stripe_len = BTRFS_STRIPE_LEN;
2321 map->io_align = stripe_len; 2619 map->io_align = BTRFS_STRIPE_LEN;
2322 map->io_width = stripe_len; 2620 map->io_width = BTRFS_STRIPE_LEN;
2323 map->type = type; 2621 map->type = type;
2324 map->num_stripes = num_stripes;
2325 map->sub_stripes = sub_stripes; 2622 map->sub_stripes = sub_stripes;
2326 2623
2327 *map_ret = map; 2624 *map_ret = map;
2328 *stripe_size = calc_size; 2625 *stripe_size = calc_size;
2329 *num_bytes = chunk_bytes_by_type(type, calc_size, 2626 *num_bytes = chunk_bytes_by_type(type, calc_size,
2330 num_stripes, sub_stripes); 2627 map->num_stripes, sub_stripes);
2331 2628
2332 em = alloc_extent_map(GFP_NOFS); 2629 em = alloc_extent_map(GFP_NOFS);
2333 if (!em) { 2630 if (!em) {
2334 kfree(map); 2631 ret = -ENOMEM;
2335 return -ENOMEM; 2632 goto error;
2336 } 2633 }
2337 em->bdev = (struct block_device *)map; 2634 em->bdev = (struct block_device *)map;
2338 em->start = start; 2635 em->start = start;
@@ -2365,7 +2662,13 @@ again:
2365 index++; 2662 index++;
2366 } 2663 }
2367 2664
2665 kfree(devices_info);
2368 return 0; 2666 return 0;
2667
2668error:
2669 kfree(map);
2670 kfree(devices_info);
2671 return ret;
2369} 2672}
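
One detail of __btrfs_alloc_chunk() worth calling out: for DUP both stripes live on the same device, so the device walk demands min_free = calc_size * 2 from a single device while min_devices drops to 1. A tiny sketch of that accounting (the values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t calc_size = 256 * 1024 * 1024;	/* candidate stripe size */
	int is_dup = 1;

	/* both DUP copies must fit on one device */
	uint64_t min_free = is_dup ? calc_size * 2 : calc_size;
	int min_devices = is_dup ? 1 : 2;

	printf("need %llu bytes free on at least %d device(s)\n",
	       (unsigned long long)min_free, min_devices);
	return 0;
}
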
2370 2673
2371static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2674static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -3080,7 +3383,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3080 device->devid = devid; 3383 device->devid = devid;
3081 device->work.func = pending_bios_fn; 3384 device->work.func = pending_bios_fn;
3082 device->fs_devices = fs_devices; 3385 device->fs_devices = fs_devices;
3386 device->missing = 1;
3083 fs_devices->num_devices++; 3387 fs_devices->num_devices++;
3388 fs_devices->missing_devices++;
3084 spin_lock_init(&device->io_lock); 3389 spin_lock_init(&device->io_lock);
3085 INIT_LIST_HEAD(&device->dev_alloc_list); 3390 INIT_LIST_HEAD(&device->dev_alloc_list);
3086 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3391 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3278,6 +3583,15 @@ static int read_one_dev(struct btrfs_root *root,
3278 device = add_missing_dev(root, devid, dev_uuid); 3583 device = add_missing_dev(root, devid, dev_uuid);
3279 if (!device) 3584 if (!device)
3280 return -ENOMEM; 3585 return -ENOMEM;
3586 } else if (!device->missing) {
 3587 			/*
 3588 			 * this happens when a device that was properly set up
 3589 			 * in the device info lists suddenly goes bad.
 3590 			 * device->bdev is NULL, and so we have to set
 3591 			 * device->missing to 1 here
 3592 			 */
3593 root->fs_info->fs_devices->missing_devices++;
3594 device->missing = 1;
3281 } 3595 }
3282 } 3596 }
3283 3597
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4eea..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -44,12 +47,13 @@ struct btrfs_device {
44 47
45 int writeable; 48 int writeable;
46 int in_fs_metadata; 49 int in_fs_metadata;
50 int missing;
47 51
48 spinlock_t io_lock; 52 spinlock_t io_lock;
49 53
50 struct block_device *bdev; 54 struct block_device *bdev;
51 55
52 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
53 fmode_t mode; 57 fmode_t mode;
54 58
55 char *name; 59 char *name;
@@ -93,6 +97,7 @@ struct btrfs_fs_devices {
93 u64 num_devices; 97 u64 num_devices;
94 u64 open_devices; 98 u64 open_devices;
95 u64 rw_devices; 99 u64 rw_devices;
100 u64 missing_devices;
96 u64 total_rw_bytes; 101 u64 total_rw_bytes;
97 struct block_device *latest_bdev; 102 struct block_device *latest_bdev;
98 103
@@ -134,6 +139,30 @@ struct btrfs_multi_bio {
134 struct btrfs_bio_stripe stripes[]; 139 struct btrfs_bio_stripe stripes[];
135}; 140};
136 141
142struct btrfs_device_info {
143 struct btrfs_device *dev;
144 u64 dev_offset;
145 u64 max_avail;
146};
147
 148/* Used to sort the devices by max_avail (descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150
 151/*
 152 * sort the devices by max_avail, in which the max free extent size of each
 153 * device is stored (descending sort).
 154 */
155static inline void btrfs_descending_sort_devices(
156 struct btrfs_device_info *devices,
157 size_t nr_devices)
158{
159 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
160 btrfs_cmp_device_free_bytes, NULL);
161}
162
163int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
164 u64 end, u64 *length);
165
137#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 166#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
138 (sizeof(struct btrfs_bio_stripe) * (n))) 167 (sizeof(struct btrfs_bio_stripe) * (n)))
139 168
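
btrfs_cmp_device_free_bytes() is only declared in this header; its body lives in volumes.c and is not shown in this hunk. Below is a plausible sketch of such a comparator, written to the contract sort() expects (negative/zero/positive); the body is an assumption, only the name and the max_avail field come from the header:

/* assumed implementation: compare two btrfs_device_info entries so
 * that sort() orders them by max_avail, largest first */
int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
{
	const struct btrfs_device_info *a = dev_info1;
	const struct btrfs_device_info *b = dev_info2;

	if (a->max_avail > b->max_avail)
		return -1;	/* a sorts before b */
	if (a->max_avail < b->max_avail)
		return 1;
	return 0;
}
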
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
317 size_t size, int flags) 317 size_t size, int flags)
318{ 318{
319 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
320
321 /*
322 * The permission on security.* and system.* is not checked
323 * in permission().
324 */
325 if (btrfs_root_readonly(root))
326 return -EROFS;
327
319 /* 328 /*
320 * If this is a request for a synthetic attribute in the system.* 329 * If this is a request for a synthetic attribute in the system.*
321 * namespace use the generic infrastructure to resolve a handler 330 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
336 345
337int btrfs_removexattr(struct dentry *dentry, const char *name) 346int btrfs_removexattr(struct dentry *dentry, const char *name)
338{ 347{
348 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
349
350 /*
351 * The permission on security.* and system.* is not checked
352 * in permission().
353 */
354 if (btrfs_root_readonly(root))
355 return -EROFS;
356
339 /* 357 /*
340 * If this is a request for a synthetic attribute in the system.* 358 * If this is a request for a synthetic attribute in the system.*
341 * namespace use the generic infrastructure to resolve a handler 359 * namespace use the generic infrastructure to resolve a handler
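
Both xattr hunks gate on btrfs_root_readonly(), which comes from the read-only subvolume work elsewhere in this merge. A sketch of the flag test it presumably performs (the inline-helper form is an assumption; the BTRFS_ROOT_SUBVOL_RDONLY flag name comes from that same series):

/* assumed ctree.h-style helper: a subvolume is read-only when the
 * RDONLY bit is set in its root item flags */
static inline bool btrfs_root_readonly(struct btrfs_root *root)
{
	return root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY);
}
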
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock); 43{
53static unsigned long num_workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56 45
57/* 46 vfree(workspace->def_strm.workspace);
58 * this finds an available zlib workspace or allocates a new one 47 vfree(workspace->inf_strm.workspace);
59 * NULL or an ERR_PTR is returned if things go bad. 48 kfree(workspace->buf);
60 */ 49 kfree(workspace);
61static struct workspace *find_zlib_workspace(void) 50}
51
52static struct list_head *zlib_alloc_workspace(void)
62{ 53{
63 struct workspace *workspace; 54 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76 55
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) { 57 if (!workspace)
90 ret = -ENOMEM; 58 return ERR_PTR(-ENOMEM);
91 goto fail;
92 }
93 59
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) { 63 if (!workspace->def_strm.workspace ||
106 ret = -ENOMEM; 64 !workspace->inf_strm.workspace || !workspace->buf)
107 goto fail_kmalloc; 65 goto fail;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142 66
143 atomic_dec(&alloc_workspace); 67 INIT_LIST_HEAD(&workspace->list);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148 68
149/* 69 return &workspace->list;
150 * cleanup function for module exit 70fail:
151 */ 71 zlib_free_workspace(&workspace->list);
152static void free_workspaces(void) 72 return ERR_PTR(-ENOMEM);
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165} 73}
166 74
167/* 75static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 76 struct address_space *mapping,
169 * 77 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 78 struct page **pages,
171 * in 'pages' 79 unsigned long nr_dest_pages,
172 * 80 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 81 unsigned long *total_in,
174 * may be pages allocated even if we return an error 82 unsigned long *total_out,
175 * 83 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
 177 * may be smaller than len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 84{
85 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 86 int ret;
196 struct workspace *workspace;
197 char *data_in; 87 char *data_in;
198 char *cpage_out; 88 char *cpage_out;
199 int nr_pages = 0; 89 int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
205 *total_out = 0; 95 *total_out = 0;
206 *total_in = 0; 96 *total_in = 0;
207 97
208 workspace = find_zlib_workspace();
209 if (IS_ERR(workspace))
210 return -1;
211
212 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 98 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
213 printk(KERN_WARNING "deflateInit failed\n"); 99 printk(KERN_WARNING "deflateInit failed\n");
214 ret = -1; 100 ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
222 data_in = kmap(in_page); 108 data_in = kmap(in_page);
223 109
224 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 110 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
111 if (out_page == NULL) {
112 ret = -1;
113 goto out;
114 }
225 cpage_out = kmap(out_page); 115 cpage_out = kmap(out_page);
226 pages[0] = out_page; 116 pages[0] = out_page;
227 nr_pages = 1; 117 nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
260 goto out; 150 goto out;
261 } 151 }
262 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 152 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
153 if (out_page == NULL) {
154 ret = -1;
155 goto out;
156 }
263 cpage_out = kmap(out_page); 157 cpage_out = kmap(out_page);
264 pages[nr_pages] = out_page; 158 pages[nr_pages] = out_page;
265 nr_pages++; 159 nr_pages++;
@@ -314,55 +208,26 @@ out:
314 kunmap(in_page); 208 kunmap(in_page);
315 page_cache_release(in_page); 209 page_cache_release(in_page);
316 } 210 }
317 free_workspace(workspace);
318 return ret; 211 return ret;
319} 212}
320 213
321/* 214static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
322 * pages_in is an array of pages with compressed data. 215 u64 disk_start,
323 * 216 struct bio_vec *bvec,
324 * disk_start is the starting logical offset of this array in the file 217 int vcnt,
325 * 218 size_t srclen)
326 * bvec is a bio_vec of pages from the file that we want to decompress into
327 *
328 * vcnt is the count of pages in the biovec
329 *
330 * srclen is the number of bytes in pages_in
331 *
332 * The basic idea is that we have a bio that was created by readpages.
333 * The pages in the bio are for the uncompressed data, and they may not
334 * be contiguous. They all correspond to the range of bytes covered by
335 * the compressed extent.
336 */
337int btrfs_zlib_decompress_biovec(struct page **pages_in,
338 u64 disk_start,
339 struct bio_vec *bvec,
340 int vcnt,
341 size_t srclen)
342{ 219{
343 int ret = 0; 220 struct workspace *workspace = list_entry(ws, struct workspace, list);
221 int ret = 0, ret2;
344 int wbits = MAX_WBITS; 222 int wbits = MAX_WBITS;
345 struct workspace *workspace;
346 char *data_in; 223 char *data_in;
347 size_t total_out = 0; 224 size_t total_out = 0;
348 unsigned long page_bytes_left;
349 unsigned long page_in_index = 0; 225 unsigned long page_in_index = 0;
350 unsigned long page_out_index = 0; 226 unsigned long page_out_index = 0;
351 struct page *page_out;
352 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 227 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
353 PAGE_CACHE_SIZE; 228 PAGE_CACHE_SIZE;
354 unsigned long buf_start; 229 unsigned long buf_start;
355 unsigned long buf_offset;
356 unsigned long bytes;
357 unsigned long working_bytes;
358 unsigned long pg_offset; 230 unsigned long pg_offset;
359 unsigned long start_byte;
360 unsigned long current_buf_start;
361 char *kaddr;
362
363 workspace = find_zlib_workspace();
364 if (IS_ERR(workspace))
365 return -ENOMEM;
366 231
367 data_in = kmap(pages_in[page_in_index]); 232 data_in = kmap(pages_in[page_in_index]);
368 workspace->inf_strm.next_in = data_in; 233 workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
372 workspace->inf_strm.total_out = 0; 237 workspace->inf_strm.total_out = 0;
373 workspace->inf_strm.next_out = workspace->buf; 238 workspace->inf_strm.next_out = workspace->buf;
374 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 239 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
375 page_out = bvec[page_out_index].bv_page;
376 page_bytes_left = PAGE_CACHE_SIZE;
377 pg_offset = 0; 240 pg_offset = 0;
378 241
379 /* If it's deflate, and it's got no preset dictionary, then 242 /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
389 252
390 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 253 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
391 printk(KERN_WARNING "inflateInit failed\n"); 254 printk(KERN_WARNING "inflateInit failed\n");
392 ret = -1; 255 return -1;
393 goto out;
394 } 256 }
395 while (workspace->inf_strm.total_in < srclen) { 257 while (workspace->inf_strm.total_in < srclen) {
396 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 258 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
397 if (ret != Z_OK && ret != Z_STREAM_END) 259 if (ret != Z_OK && ret != Z_STREAM_END)
398 break; 260 break;
399 /*
 400 		 * buf_start is the offset of the start of our
 401 		 * workspace buffer within the uncompressed data
402 */
403 buf_start = total_out;
404 261
405 /* total_out is the last byte of the workspace buffer */ 262 buf_start = total_out;
406 total_out = workspace->inf_strm.total_out; 263 total_out = workspace->inf_strm.total_out;
407 264
408 working_bytes = total_out - buf_start; 265 /* we didn't make progress in this inflate call, we're done */
409 266 if (buf_start == total_out)
410 /*
411 * start byte is the first byte of the page we're currently
412 * copying into relative to the start of the compressed data.
413 */
414 start_byte = page_offset(page_out) - disk_start;
415
416 if (working_bytes == 0) {
417 /* we didn't make progress in this inflate
418 * call, we're done
419 */
420 if (ret != Z_STREAM_END)
421 ret = -1;
422 break; 267 break;
423 }
424 268
425 /* we haven't yet hit data corresponding to this page */ 269 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
426 if (total_out <= start_byte) 270 total_out, disk_start,
427 goto next; 271 bvec, vcnt,
428 272 &page_out_index, &pg_offset);
429 /* 273 if (ret2 == 0) {
430 * the start of the data we care about is offset into 274 ret = 0;
431 * the middle of our working buffer 275 goto done;
432 */
433 if (total_out > start_byte && buf_start < start_byte) {
434 buf_offset = start_byte - buf_start;
435 working_bytes -= buf_offset;
436 } else {
437 buf_offset = 0;
438 }
439 current_buf_start = buf_start;
440
441 /* copy bytes from the working buffer into the pages */
442 while (working_bytes > 0) {
443 bytes = min(PAGE_CACHE_SIZE - pg_offset,
444 PAGE_CACHE_SIZE - buf_offset);
445 bytes = min(bytes, working_bytes);
446 kaddr = kmap_atomic(page_out, KM_USER0);
447 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
448 bytes);
449 kunmap_atomic(kaddr, KM_USER0);
450 flush_dcache_page(page_out);
451
452 pg_offset += bytes;
453 page_bytes_left -= bytes;
454 buf_offset += bytes;
455 working_bytes -= bytes;
456 current_buf_start += bytes;
457
458 /* check if we need to pick another page */
459 if (page_bytes_left == 0) {
460 page_out_index++;
461 if (page_out_index >= vcnt) {
462 ret = 0;
463 goto done;
464 }
465
466 page_out = bvec[page_out_index].bv_page;
467 pg_offset = 0;
468 page_bytes_left = PAGE_CACHE_SIZE;
469 start_byte = page_offset(page_out) - disk_start;
470
471 /*
472 * make sure our new page is covered by this
473 * working buffer
474 */
475 if (total_out <= start_byte)
476 goto next;
477
478 /* the next page in the biovec might not
479 * be adjacent to the last page, but it
480 * might still be found inside this working
481 * buffer. bump our offset pointer
482 */
483 if (total_out > start_byte &&
484 current_buf_start < start_byte) {
485 buf_offset = start_byte - buf_start;
486 working_bytes = total_out - start_byte;
487 current_buf_start = buf_start +
488 buf_offset;
489 }
490 }
491 } 276 }
492next: 277
493 workspace->inf_strm.next_out = workspace->buf; 278 workspace->inf_strm.next_out = workspace->buf;
494 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 279 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
495 280
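
The page-walking loop that used to live here has moved into a shared btrfs_decompress_buf2page() in compression.c, so zlib and the new lzo code can reuse it. Its contract can be inferred from the call site above; the exact prototype below is an assumption:

/* assumed prototype of the shared helper: copy the bytes of the
 * decompressed window [buf_start, total_out) that overlap the bio
 * pages, advancing the page cursor (page_index and pg_offset)
 * across calls; return 0 once every page in bvec has been filled,
 * nonzero to keep inflating */
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
			      unsigned long total_out, u64 disk_start,
			      struct bio_vec *bvec, int vcnt,
			      unsigned long *page_index,
			      unsigned long *pg_offset);
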
@@ -516,35 +301,21 @@ done:
516 zlib_inflateEnd(&workspace->inf_strm); 301 zlib_inflateEnd(&workspace->inf_strm);
517 if (data_in) 302 if (data_in)
518 kunmap(pages_in[page_in_index]); 303 kunmap(pages_in[page_in_index]);
519out:
520 free_workspace(workspace);
521 return ret; 304 return ret;
522} 305}
523 306
524/* 307static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
525 * a less complex decompression routine. Our compressed data fits in a 308 struct page *dest_page,
526 * single page, and we want to read a single page out of it. 309 unsigned long start_byte,
527 * start_byte tells us the offset into the compressed data we're interested in 310 size_t srclen, size_t destlen)
528 */
529int btrfs_zlib_decompress(unsigned char *data_in,
530 struct page *dest_page,
531 unsigned long start_byte,
532 size_t srclen, size_t destlen)
533{ 311{
312 struct workspace *workspace = list_entry(ws, struct workspace, list);
534 int ret = 0; 313 int ret = 0;
535 int wbits = MAX_WBITS; 314 int wbits = MAX_WBITS;
536 struct workspace *workspace;
537 unsigned long bytes_left = destlen; 315 unsigned long bytes_left = destlen;
538 unsigned long total_out = 0; 316 unsigned long total_out = 0;
539 char *kaddr; 317 char *kaddr;
540 318
541 if (destlen > PAGE_CACHE_SIZE)
542 return -ENOMEM;
543
544 workspace = find_zlib_workspace();
545 if (IS_ERR(workspace))
546 return -ENOMEM;
547
548 workspace->inf_strm.next_in = data_in; 319 workspace->inf_strm.next_in = data_in;
549 workspace->inf_strm.avail_in = srclen; 320 workspace->inf_strm.avail_in = srclen;
550 workspace->inf_strm.total_in = 0; 321 workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
565 336
566 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 337 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
567 printk(KERN_WARNING "inflateInit failed\n"); 338 printk(KERN_WARNING "inflateInit failed\n");
568 ret = -1; 339 return -1;
569 goto out;
570 } 340 }
571 341
572 while (bytes_left > 0) { 342 while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
616 ret = 0; 386 ret = 0;
617 387
618 zlib_inflateEnd(&workspace->inf_strm); 388 zlib_inflateEnd(&workspace->inf_strm);
619out:
620 free_workspace(workspace);
621 return ret; 389 return ret;
622} 390}
623 391
624void btrfs_zlib_exit(void) 392struct btrfs_compress_op btrfs_zlib_compress = {
625{ 393 .alloc_workspace = zlib_alloc_workspace,
626 free_workspaces(); 394 .free_workspace = zlib_free_workspace,
627} 395 .compress_pages = zlib_compress_pages,
396 .decompress_biovec = zlib_decompress_biovec,
397 .decompress = zlib_decompress,
398};
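
With every zlib entry point now behind btrfs_compress_op, compression.c can dispatch by compression type instead of calling zlib directly. A sketch of how that dispatch plausibly looks (the array name, the find_workspace/free_workspace helpers, and the type - 1 indexing are assumptions based on this interface and the lzo.c addition in the diffstat):

/* sketch: one table of operations, indexed by the on-disk
 * compression type, lets zlib and lzo plug in identically */
static struct btrfs_compress_op *btrfs_compress_op[] = {
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
};

static int compress_pages(int type, struct address_space *mapping,
			  u64 start, unsigned long len, struct page **pages,
			  unsigned long nr_dest_pages, unsigned long *out_pages,
			  unsigned long *total_in, unsigned long *total_out,
			  unsigned long max_out)
{
	struct list_head *workspace;
	int ret;

	/* assumed helper: hand out a cached per-type workspace */
	workspace = find_workspace(type);
	if (IS_ERR(workspace))
		return -1;

	ret = btrfs_compress_op[type - 1]->compress_pages(workspace, mapping,
							  start, len, pages,
							  nr_dest_pages,
							  out_pages, total_in,
							  total_out, max_out);
	free_workspace(type, workspace);
	return ret;
}
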