Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig            2
-rw-r--r--  fs/btrfs/Makefile           2
-rw-r--r--  fs/btrfs/acl.c              4
-rw-r--r--  fs/btrfs/btrfs_inode.h      2
-rw-r--r--  fs/btrfs/compression.c    329
-rw-r--r--  fs/btrfs/compression.h     72
-rw-r--r--  fs/btrfs/ctree.c            8
-rw-r--r--  fs/btrfs/ctree.h           48
-rw-r--r--  fs/btrfs/disk-io.c        412
-rw-r--r--  fs/btrfs/disk-io.h          1
-rw-r--r--  fs/btrfs/extent-tree.c     90
-rw-r--r--  fs/btrfs/extent_io.c        7
-rw-r--r--  fs/btrfs/extent_io.h       17
-rw-r--r--  fs/btrfs/extent_map.c       2
-rw-r--r--  fs/btrfs/extent_map.h       3
-rw-r--r--  fs/btrfs/file.c            13
-rw-r--r--  fs/btrfs/inode.c           90
-rw-r--r--  fs/btrfs/ioctl.c          220
-rw-r--r--  fs/btrfs/ioctl.h           12
-rw-r--r--  fs/btrfs/lzo.c            420
-rw-r--r--  fs/btrfs/ordered-data.c    18
-rw-r--r--  fs/btrfs/ordered-data.h     8
-rw-r--r--  fs/btrfs/super.c          281
-rw-r--r--  fs/btrfs/transaction.c     11
-rw-r--r--  fs/btrfs/transaction.h      1
-rw-r--r--  fs/btrfs/volumes.c        626
-rw-r--r--  fs/btrfs/volumes.h         27
-rw-r--r--  fs/btrfs/xattr.c           18
-rw-r--r--  fs/btrfs/zlib.c           369
29 files changed, 2490 insertions(+), 623 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ae2c8cac9d5..15b5ca2a2606 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	size = __btrfs_getxattr(inode, name, value, size);
 	if (size > 0) {
 		acl = posix_acl_from_xattr(value, size);
-		if (IS_ERR(acl))
+		if (IS_ERR(acl)) {
+			kfree(value);
 			return acl;
+		}
 		set_cached_acl(inode, type, acl);
 	}
 	kfree(value);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
 	/*
 	 * always compress this one file
 	 */
-	unsigned force_compress:1;
+	unsigned force_compress:4;

 	struct inode vfs_inode;
 };
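
Reviewer note: widening force_compress from one bit to four turns the field from
a yes/no flag into a slot that can hold a whole enum btrfs_compression_type
value (ZLIB = 1, LZO = 2), presumably so callers such as the defrag ioctl
(ioctl.c is in the diffstat) can request a specific algorithm per file. A
minimal userspace sketch of the idea, with hypothetical names:

	enum compress_hint { HINT_NONE = 0, HINT_ZLIB = 1, HINT_LZO = 2 };

	struct inode_hints {
		unsigned force_compress:4;	/* room for type values up to 15 */
	};

	/* struct inode_hints h = { .force_compress = HINT_LZO };
	 * h.force_compress now records *which* algorithm, not just on/off. */
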
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b50bc4bd5c56..f745287fbf2e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
 	/* number of bytes on disk */
 	unsigned long compressed_len;

+	/* the compression algorithm for this bio */
+	int compress_type;
+
 	/* number of compressed pages in the array */
 	unsigned long nr_pages;

@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
-					cb->start,
-					cb->orig_bio->bi_io_vec,
-					cb->orig_bio->bi_vcnt,
-					cb->compressed_len);
+	ret = btrfs_decompress_biovec(cb->compress_type,
+				      cb->compressed_pages,
+				      cb->start,
+				      cb->orig_bio->bi_io_vec,
+				      cb->orig_bio->bi_vcnt,
+				      cb->compressed_len);
 csum_failed:
 	if (ret)
 		cb->errors = 1;
@@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

 	cb->len = uncompressed_len;
 	cb->compressed_len = compressed_len;
+	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;

 	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	bio_put(comp_bio);
 	return 0;
 }
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+	&btrfs_zlib_compress,
+	&btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		INIT_LIST_HEAD(&comp_idle_workspace[i]);
+		spin_lock_init(&comp_workspace_lock[i]);
+		atomic_set(&comp_alloc_workspace[i], 0);
+		init_waitqueue_head(&comp_workspace_wait[i]);
+	}
+	return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+	struct list_head *workspace;
+	int cpus = num_online_cpus();
+	int idx = type - 1;
+
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+again:
+	spin_lock(workspace_lock);
+	if (!list_empty(idle_workspace)) {
+		workspace = idle_workspace->next;
+		list_del(workspace);
+		(*num_workspace)--;
+		spin_unlock(workspace_lock);
+		return workspace;
+
+	}
+	if (atomic_read(alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+
+		spin_unlock(workspace_lock);
+		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+			schedule();
+		finish_wait(workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(alloc_workspace);
+	spin_unlock(workspace_lock);
+
+	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	if (IS_ERR(workspace)) {
+		atomic_dec(alloc_workspace);
+		wake_up(workspace_wait);
+	}
+	return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+	int idx = type - 1;
+	struct list_head *idle_workspace	= &comp_idle_workspace[idx];
+	spinlock_t *workspace_lock		= &comp_workspace_lock[idx];
+	atomic_t *alloc_workspace		= &comp_alloc_workspace[idx];
+	wait_queue_head_t *workspace_wait	= &comp_workspace_wait[idx];
+	int *num_workspace			= &comp_num_workspace[idx];
+
+	spin_lock(workspace_lock);
+	if (*num_workspace < num_online_cpus()) {
+		list_add_tail(workspace, idle_workspace);
+		(*num_workspace)++;
+		spin_unlock(workspace_lock);
+		goto wake;
+	}
+	spin_unlock(workspace_lock);
+
+	btrfs_compress_op[idx]->free_workspace(workspace);
+	atomic_dec(alloc_workspace);
+wake:
+	if (waitqueue_active(workspace_wait))
+		wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+	struct list_head *workspace;
+	int i;
+
+	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		while (!list_empty(&comp_idle_workspace[i])) {
+			workspace = comp_idle_workspace[i].next;
+			list_del(workspace);
+			btrfs_compress_op[i]->free_workspace(workspace);
+			atomic_dec(&comp_alloc_workspace[i]);
+		}
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -1;
+
+	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+						      start, len, pages,
+						      nr_dest_pages, out_pages,
+						      total_in, total_out,
+						      max_out);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+							 disk_start,
+							 bvec, vcnt, srclen);
+	free_workspace(type, workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen)
+{
+	struct list_head *workspace;
+	int ret;
+
+	workspace = find_workspace(type);
+	if (IS_ERR(workspace))
+		return -ENOMEM;
+
+	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+						  dest_page, start_byte,
+						  srclen, destlen);
+
+	free_workspace(type, workspace);
+	return ret;
+}
+
+void __exit btrfs_exit_compress(void)
+{
+	free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from the working buffer to the pages.
+ *
+ * buf_start is the byte offset of the working buffer within the
+ * uncompressed data.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset)
+{
+	unsigned long buf_offset;
+	unsigned long current_buf_start;
+	unsigned long start_byte;
+	unsigned long working_bytes = total_out - buf_start;
+	unsigned long bytes;
+	char *kaddr;
+	struct page *page_out = bvec[*page_index].bv_page;
+
+	/*
+	 * start byte is the first byte of the page we're currently
+	 * copying into relative to the start of the compressed data.
+	 */
+	start_byte = page_offset(page_out) - disk_start;
+
+	/* we haven't yet hit data corresponding to this page */
+	if (total_out <= start_byte)
+		return 1;
+
+	/*
+	 * the start of the data we care about is offset into
+	 * the middle of our working buffer
+	 */
+	if (total_out > start_byte && buf_start < start_byte) {
+		buf_offset = start_byte - buf_start;
+		working_bytes -= buf_offset;
+	} else {
+		buf_offset = 0;
+	}
+	current_buf_start = buf_start;
+
+	/* copy bytes from the working buffer into the pages */
+	while (working_bytes > 0) {
+		bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, working_bytes);
+		kaddr = kmap_atomic(page_out, KM_USER0);
+		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(page_out);
+
+		*pg_offset += bytes;
+		buf_offset += bytes;
+		working_bytes -= bytes;
+		current_buf_start += bytes;
+
+		/* check if we need to pick another page */
+		if (*pg_offset == PAGE_CACHE_SIZE) {
+			(*page_index)++;
+			if (*page_index >= vcnt)
+				return 0;
+
+			page_out = bvec[*page_index].bv_page;
+			*pg_offset = 0;
+			start_byte = page_offset(page_out) - disk_start;
+
+			/*
+			 * make sure our new page is covered by this
+			 * working buffer
+			 */
+			if (total_out <= start_byte)
+				return 1;
+
+			/*
+			 * the next page in the biovec might not be adjacent
+			 * to the last page, but it might still be found
+			 * inside this working buffer. bump our offset pointer
+			 */
+			if (total_out > start_byte &&
+			    current_buf_start < start_byte) {
+				buf_offset = start_byte - buf_start;
+				working_bytes = total_out - start_byte;
+				current_buf_start = buf_start + buf_offset;
+			}
+		}
+	}
+
+	return 1;
+}
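
Reviewer note: the workspace code above is a small bounded pool per algorithm:
find_workspace() reuses an idle entry when one exists, allocates a fresh one
otherwise, and sleeps once more than num_online_cpus() workspaces of that type
are outstanding; free_workspace() parks the entry back on the idle list (capped
at the CPU count) and wakes any waiter. Every exported helper follows the same
acquire/use/release shape, condensed here from btrfs_compress_pages() above:

	workspace = find_workspace(type);
	if (IS_ERR(workspace))
		return /* error */;
	ret = btrfs_compress_op[type - 1]->compress_pages(workspace, ...);
	free_workspace(type, workspace);
	return ret;
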
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_

-int btrfs_zlib_decompress(unsigned char *data_in,
-			  struct page *dest_page,
-			  unsigned long start_byte,
-			  size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-			      u64 start, unsigned long len,
-			      struct page **pages,
-			      unsigned long nr_dest_pages,
-			      unsigned long *out_pages,
-			      unsigned long *total_in,
-			      unsigned long *total_out,
-			      unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-			      u64 disk_start,
-			      struct bio_vec *bvec,
-			      int vcnt,
-			      size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+			 u64 start, unsigned long len,
+			 struct page **pages,
+			 unsigned long nr_dest_pages,
+			 unsigned long *out_pages,
+			 unsigned long *total_in,
+			 unsigned long *total_out,
+			 unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+			    struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+		     unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+			      unsigned long total_out, u64 disk_start,
+			      struct bio_vec *bvec, int vcnt,
+			      unsigned long *page_index,
+			      unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long len, u64 disk_start,
 				  unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+	struct list_head *(*alloc_workspace)(void);
+
+	void (*free_workspace)(struct list_head *workspace);
+
+	int (*compress_pages)(struct list_head *workspace,
+			      struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+
+	int (*decompress_biovec)(struct list_head *workspace,
+				 struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen);
+
+	int (*decompress)(struct list_head *workspace,
+			  unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
 #endif
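
Reviewer note: btrfs_compress_op is a plain vtable. The type values
BTRFS_COMPRESS_ZLIB == 1 and BTRFS_COMPRESS_LZO == 2 index
btrfs_compress_op[type - 1] in compression.c, so a third algorithm should only
need one more btrfs_compress_op instance, one array entry, and a bump of
BTRFS_COMPRESS_TYPES. A hypothetical sketch, not part of this patch:

	extern struct btrfs_compress_op btrfs_snappy_compress;	/* hypothetical */

	struct btrfs_compress_op *btrfs_compress_op[] = {
		&btrfs_zlib_compress,
		&btrfs_lzo_compress,
		&btrfs_snappy_compress,		/* would be type value 3 */
	};
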
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+	if (!p)
+		return;
 	btrfs_release_path(NULL, p);
 	kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);

 	right = read_node_slot(root, upper, slot + 1);
+	if (right == NULL)
+		return 1;
+
 	btrfs_tree_lock(right);
 	btrfs_set_lock_blocking(right);

@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_assert_tree_locked(path->nodes[1]);

 	left = read_node_slot(root, path->nodes[1], slot - 1);
+	if (left == NULL)
+		return 1;
+
 	btrfs_tree_lock(left);
 	btrfs_set_lock_blocking(left);

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b875d445ea81..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -295,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
+
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)

@@ -399,13 +407,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)

 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
-	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)

 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -552,9 +562,11 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));

 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_LAST  = 2,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
 };

 struct btrfs_inode_item {
@@ -598,6 +610,8 @@ struct btrfs_dir_item {
 	u8 type;
 } __attribute__ ((__packed__));

+#define BTRFS_ROOT_SUBVOL_RDONLY	(1ULL << 0)
+
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 generation;
@@ -896,7 +910,8 @@ struct btrfs_fs_info {
 	 */
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
-	unsigned long mount_opt;
+	unsigned long mount_opt:20;
+	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -1051,6 +1066,9 @@ struct btrfs_fs_info {
 	unsigned metadata_ratio;

 	void *bdev_holder;
+
+	/* filesystem state */
+	u64 fs_state;
 };

 /*
@@ -1894,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);

+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
 /* struct btrfs_super_block */

 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2146,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2189,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+				   u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes);
+
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2542,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno)				\
+do {								\
+	if ((errno))						\
+		__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)

 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
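
Reviewer note: btrfs_std_error() is the new funnel for fatal errors; it is a
no-op when errno == 0, so call sites can report unconditionally and let
__btrfs_std_error() (in super.c, not shown in this excerpt) record the error
state that is intended to flip the filesystem read-only. A hypothetical call
site, with an illustrative helper name:

	ret = btrfs_do_something_critical(trans, root);	/* name illustrative */
	btrfs_std_error(root->fs_info, ret);		/* reports with __func__
							   and __LINE__ context */
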
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34eb..b531c36455d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);

 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	WARN_ON(len == 0);

 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		WARN_ON(1);
+		goto out;
+	}
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
@@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	WARN_ON(len == 0);

 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		ret = -EIO;
+		goto out;
+	}

 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
@@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	}
 	btrfs_free_path(path);
 	if (ret) {
+		kfree(root);
 		if (ret > 0)
 			ret = -ENOENT;
 		return ERR_PTR(ret);
@@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   fs_info, BTRFS_ROOT_TREE_OBJECTID);

 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-	if (!bh)
+	if (!bh) {
+		err = -EINVAL;
 		goto fail_iput;
+	}

 	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
 	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_iput;

+	/* check FS state, whether FS is broken. */
+	fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
 	ret = btrfs_parse_options(tree_root, options);
 	if (ret) {
 		err = ret;
@@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}

 	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
+	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+	if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+	btrfs_set_super_incompat_flags(disk_super, features);

 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}

-	if (btrfs_super_log_root(disk_super) != 0) {
+	/* do not make disk changes in broken FS */
+	if (btrfs_super_log_root(disk_super) != 0 &&
+	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
 		u64 bytenr = btrfs_super_log_root(disk_super);

 		if (fs_devices->rw_devices == 0) {
@@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
 	smp_mb();

 	btrfs_put_block_group_cache(fs_info);
+
+	/*
+	 * Here come 2 situations when btrfs is broken to flip readonly:
+	 *
+	 * 1. when btrfs flips readonly somewhere else before
+	 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+	 * and btrfs will skip to write sb directly to keep
+	 * ERROR state on disk.
+	 *
+	 * 2. when btrfs flips readonly just in btrfs_commit_super,
+	 * and in such case, btrfs cannot write sb via btrfs_commit_super,
+	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+	 * btrfs will cleanup all FS resources first and write sb then.
+	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+	}
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		ret = btrfs_error_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
@@ -2619,6 +2671,352 @@ out:
 	return 0;
 }

+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only)
+{
+	if (read_only)
+		return;
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		printk(KERN_WARNING "warning: mount fs with errors, "
+		       "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+	int ret;
+
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(root);
+
+	ret = write_ctree_super(NULL, root, 0);
+
+	return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 ordered_operations);
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while (!list_empty(&splice)) {
+		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+				     root_extent_list);
+
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+
+		/* the inode may be getting freed (in sys_unlink path). */
+		inode = igrab(ordered->inode);
+
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+		if (inode)
+			iput(inode);
+
+		atomic_set(&ordered->refs, 1);
+		btrfs_put_ordered_extent(ordered);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	int ret = 0;
+
+	delayed_refs = &trans->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	if (delayed_refs->num_entries == 0) {
+		printk(KERN_INFO "delayed_refs has NO entry\n");
+		return ret;
+	}
+
+	node = rb_first(&delayed_refs->root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
+
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+
+		atomic_set(&ref->refs, 1);
+		if (btrfs_delayed_ref_is_head(ref)) {
+			struct btrfs_delayed_ref_head *head;
+
+			head = btrfs_delayed_node_to_head(ref);
+			mutex_lock(&head->mutex);
+			kfree(head->extent_op);
+			delayed_refs->num_heads--;
+			if (list_empty(&head->cluster))
+				delayed_refs->num_heads_ready--;
+			list_del_init(&head->cluster);
+			mutex_unlock(&head->mutex);
+		}
+
+		spin_unlock(&delayed_refs->lock);
+		btrfs_put_delayed_ref(ref);
+
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+
+	spin_unlock(&delayed_refs->lock);
+
+	return ret;
+}
+
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+	struct btrfs_pending_snapshot *snapshot;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&t->pending_snapshots, &splice);
+
+	while (!list_empty(&splice)) {
+		snapshot = list_entry(splice.next,
+				      struct btrfs_pending_snapshot,
+				      list);
+
+		list_del_init(&snapshot->list);
+
+		kfree(snapshot);
+	}
+
+	return 0;
+}
+
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 delalloc_inodes);
+
+		list_del_init(&btrfs_inode->delalloc_inodes);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark)
+{
+	int ret;
+	struct page *page;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_buffer *eb;
+	u64 start = 0;
+	u64 end;
+	u64 offset;
+	unsigned long index;
+
+	while (1) {
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+					    mark);
+		if (ret)
+			break;
+
+		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+		while (start <= end) {
+			index = start >> PAGE_CACHE_SHIFT;
+			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+			page = find_get_page(btree_inode->i_mapping, index);
+			if (!page)
+				continue;
+			offset = page_offset(page);
+
+			spin_lock(&dirty_pages->buffer_lock);
+			eb = radix_tree_lookup(
+			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+					       offset >> PAGE_CACHE_SHIFT);
+			spin_unlock(&dirty_pages->buffer_lock);
+			if (eb) {
+				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+							 &eb->bflags);
+				atomic_set(&eb->refs, 1);
+			}
+			if (PageWriteback(page))
+				end_page_writeback(page);
+
+			lock_page(page);
+			if (PageDirty(page)) {
+				clear_page_dirty_for_io(page);
+				spin_lock_irq(&page->mapping->tree_lock);
+				radix_tree_tag_clear(&page->mapping->page_tree,
+							page_index(page),
+							PAGECACHE_TAG_DIRTY);
+				spin_unlock_irq(&page->mapping->tree_lock);
+			}
+
+			page->mapping->a_ops->invalidatepage(page, 0);
+			unlock_page(page);
+		}
+	}
+
+	return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents)
+{
+	struct extent_io_tree *unpin;
+	u64 start;
+	u64 end;
+	int ret;
+
+	unpin = pinned_extents;
+	while (1) {
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		/* opt_discard */
+		ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		btrfs_error_unpin_extent_range(root, start, end);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+	struct btrfs_transaction *t;
+	LIST_HEAD(list);
+
+	WARN_ON(1);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+	list_splice_init(&root->fs_info->trans_list, &list);
+	while (!list_empty(&list)) {
+		t = list_entry(list.next, struct btrfs_transaction, list);
+		if (!t)
+			break;
+
+		btrfs_destroy_ordered_operations(root);
+
+		btrfs_destroy_ordered_extents(root);
+
+		btrfs_destroy_delayed_refs(t, root);
+
+		btrfs_block_rsv_release(root,
+					&root->fs_info->trans_block_rsv,
+					t->dirty_pages.dirty_bytes);
+
+		/* FIXME: cleanup wait for commit */
+		t->in_commit = 1;
+		t->blocked = 1;
+		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+			wake_up(&root->fs_info->transaction_blocked_wait);
+
+		t->blocked = 0;
+		if (waitqueue_active(&root->fs_info->transaction_wait))
+			wake_up(&root->fs_info->transaction_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		t->commit_done = 1;
+		if (waitqueue_active(&t->commit_wait))
+			wake_up(&t->commit_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+
+		btrfs_destroy_pending_snapshots(t);
+
+		btrfs_destroy_delalloc_inodes(root);
+
+		spin_lock(&root->fs_info->new_trans_lock);
+		root->fs_info->running_transaction = NULL;
+		spin_unlock(&root->fs_info->new_trans_lock);
+
+		btrfs_destroy_marked_extents(root, &t->dirty_pages,
+					     EXTENT_DIRTY);
+
+		btrfs_destroy_pinned_extent(root,
+					    root->fs_info->pinned_extents);
+
+		t->use_count = 0;
+		list_del_init(&t->list);
+		memset(t, 0, sizeof(*t));
+		kmem_cache_free(btrfs_transaction_cachep, t);
+	}
+
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
 	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
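
Reviewer note: a condensed view of the shutdown decision wired into
close_ctree() above, assuming only the fields this patch introduces:

	if (!(fs_info->sb->s_flags & MS_RDONLY))
		btrfs_commit_super(root);	/* normal commit path */
	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		btrfs_error_commit_super(root);	/* tear down transactions,
						   then write the sb so the
						   ERROR flag reaches disk */
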
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
+int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 227e5815d838..b55269340cec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 	return btrfs_reduce_alloc_profile(root, flags);
 }

-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;

@@ -3161,8 +3161,12 @@ alloc:
 					     bytes + 2 * 1024 * 1024,
 					     alloc_target, 0);
 		btrfs_end_transaction(trans, root);
-		if (ret < 0)
-			return ret;
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				return ret;
+			else
+				goto commit_trans;
+		}

 		if (!data_sinfo) {
 			btrfs_set_inode_space_info(root, inode);
@@ -3173,6 +3177,7 @@ alloc:
 	spin_unlock(&data_sinfo->lock);

 	/* commit the current transaction and try again */
+commit_trans:
 	if (!committed && !root->fs_info->open_ioctl_trans) {
 		committed = 1;
 		trans = btrfs_join_transaction(root, 1);
@@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		return 0;
 	}

-	WARN_ON(1);
-	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-		block_rsv->size, block_rsv->reserved,
-		block_rsv->freed[0], block_rsv->freed[1]);
-
 	return -ENOSPC;
 }

@@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)

 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
 	    sinfo->bytes_may_use + sinfo->bytes_readonly +
-	    cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+	    cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
 		sinfo->bytes_reserved += cache->reserved_pinned;
 		cache->reserved_pinned = 0;
 		cache->ro = 1;
 		ret = 0;
 	}
+
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 	return ret;
@@ -8012,6 +8013,62 @@ out:
 	return ret;
 }

+/*
+ * helper to account the unused space of all the readonly block group in the
+ * list. takes mirrors into account.
+ */
+static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 free_bytes = 0;
+	int factor;
+
+	list_for_each_entry(block_group, groups_list, list) {
+		spin_lock(&block_group->lock);
+
+		if (!block_group->ro) {
+			spin_unlock(&block_group->lock);
+			continue;
+		}
+
+		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+					  BTRFS_BLOCK_GROUP_RAID10 |
+					  BTRFS_BLOCK_GROUP_DUP))
+			factor = 2;
+		else
+			factor = 1;
+
+		free_bytes += (block_group->key.offset -
+			       btrfs_block_group_used(&block_group->item)) *
+			       factor;
+
+		spin_unlock(&block_group->lock);
+	}
+
+	return free_bytes;
+}
+
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * space_info. takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+	int i;
+	u64 free_bytes = 0;
+
+	spin_lock(&sinfo->lock);
+
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		if (!list_empty(&sinfo->block_groups[i]))
+			free_bytes += __btrfs_get_ro_block_group_free_space(
+						&sinfo->block_groups[i]);
+
+	spin_unlock(&sinfo->lock);
+
+	return free_bytes;
+}
+
 int btrfs_set_block_group_rw(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache)
 {
@@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 	mutex_lock(&root->fs_info->chunk_mutex);
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 min_free = btrfs_block_group_used(&block_group->item);
-		u64 dev_offset, max_avail;
+		u64 dev_offset;

 		/*
 		 * check to make sure we can actually find a chunk with enough
@@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		 */
 		if (device->total_bytes > device->bytes_used + min_free) {
 			ret = find_free_dev_extent(NULL, device, min_free,
-						   &dev_offset, &max_avail);
+						   &dev_offset, NULL);
 			if (!ret)
 				break;
 			ret = -1;
@@ -8584,3 +8641,14 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+	return unpin_extent_range(root, start, end);
+}
+
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes)
+{
+	return btrfs_discard_extent(root, bytenr, num_bytes);
+}
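
Reviewer note: the factor in __btrfs_get_ro_block_group_free_space() converts
logical free space into raw disk space. Worked example with illustrative
numbers: a read-only RAID1 block group with key.offset = 1024 MiB and 400 MiB
used contributes (1024 - 400) * 2 = 1248 MiB, since RAID1/RAID10/DUP keep two
copies of every byte; a SINGLE block group of the same size would contribute
624 MiB.
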
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e86b9f36507..2e993cf1766e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		BUG_ON(extent_map_end(em) <= cur);
 		BUG_ON(end < cur);

-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 			this_bio_flag = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&this_bio_flag,
+						 em->compress_type);
+		}

 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
@@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 #endif

 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+	if (eb == NULL)
+		return NULL;
 	eb->start = start;
 	eb->len = len;
 	spin_lock_init(&eb->lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4183c8178f01..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

-/* flags for bio submission */
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_FLAG_SHIFT 16

 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
 	wait_queue_head_t lock_wq;
 };

+static inline void extent_set_compress_type(unsigned long *bio_flags,
+					    int compress_type)
+{
+	*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+
+static inline int extent_compress_type(unsigned long bio_flags)
+{
+	return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
+
 struct extent_map_tree;

 static inline struct extent_state *extent_state_next(struct extent_state *state)
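
Reviewer note: with EXTENT_BIO_FLAG_SHIFT == 16, a single unsigned long now
carries the ordinary bio flags in the low bits and the compression type in the
high bits. A standalone C sketch of the round trip (userspace, for
illustration only):

	#include <assert.h>

	#define EXTENT_BIO_COMPRESSED 1
	#define EXTENT_BIO_FLAG_SHIFT 16

	int main(void)
	{
		unsigned long bio_flags = EXTENT_BIO_COMPRESSED;

		/* stash the type in the high bits (2 == LZO) */
		bio_flags |= 2UL << EXTENT_BIO_FLAG_SHIFT;

		assert(bio_flags & EXTENT_BIO_COMPRESSED);	     /* low bits intact */
		assert((bio_flags >> EXTENT_BIO_FLAG_SHIFT) == 2);   /* type recovered */
		return 0;
	}
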
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include "ctree.h"
 #include "extent_map.h"


@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 		return em;
 	em->in_tree = 0;
 	em->flags = 0;
+	em->compress_type = BTRFS_COMPRESS_NONE;
 	atomic_set(&em->refs, 1);
 	return em;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-	int in_tree;
+	unsigned int in_tree:1;
+	unsigned int compress_type:4;
 };

 struct extent_map_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a9e0a4eaf3d9..c800d58f3013 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -225,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,

 		split->bdev = em->bdev;
 		split->flags = flags;
+		split->compress_type = em->compress_type;
 		ret = add_extent_mapping(em_tree, split);
 		BUG_ON(ret);
 		free_extent_map(split);
@@ -239,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		split->len = em->start + em->len - (start + len);
 		split->bdev = em->bdev;
 		split->flags = flags;
+		split->compress_type = em->compress_type;

 		if (compressed) {
 			split->block_len = em->block_len;
@@ -891,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	if (err)
 		goto out;

+	/*
+	 * If BTRFS flips readonly due to some impossible error
+	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+	 * although we have opened a file as writable, we have
+	 * to stop this write operation to ensure FS consistency.
+	 */
+	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		err = -EROFS;
+		goto out;
+	}
+
 	file_update_time(file);
 	BTRFS_I(inode)->sequence++;

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 902afbf50811..160b55b3e132 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	size_t cur_size = size;
 	size_t datasize;
 	unsigned long offset;
-	int use_compress = 0;
+	int compress_type = BTRFS_COMPRESS_NONE;

 	if (compressed_size && compressed_pages) {
-		use_compress = 1;
+		compress_type = root->fs_info->compress_type;
 		cur_size = compressed_size;
 	}

@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 159 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 ptr = btrfs_file_extent_inline_start(ei); 160 ptr = btrfs_file_extent_inline_start(ei);
161 161
162 if (use_compress) { 162 if (compress_type != BTRFS_COMPRESS_NONE) {
163 struct page *cpage; 163 struct page *cpage;
164 int i = 0; 164 int i = 0;
165 while (compressed_size > 0) { 165 while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 compressed_size -= cur_size; 176 compressed_size -= cur_size;
177 } 177 }
178 btrfs_set_file_extent_compression(leaf, ei, 178 btrfs_set_file_extent_compression(leaf, ei,
179 BTRFS_COMPRESS_ZLIB); 179 compress_type);
180 } else { 180 } else {
181 page = find_get_page(inode->i_mapping, 181 page = find_get_page(inode->i_mapping,
182 start >> PAGE_CACHE_SHIFT); 182 start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
263 u64 compressed_size; 263 u64 compressed_size;
264 struct page **pages; 264 struct page **pages;
265 unsigned long nr_pages; 265 unsigned long nr_pages;
266 int compress_type;
266 struct list_head list; 267 struct list_head list;
267}; 268};
268 269
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
280 u64 start, u64 ram_size, 281 u64 start, u64 ram_size,
281 u64 compressed_size, 282 u64 compressed_size,
282 struct page **pages, 283 struct page **pages,
283 unsigned long nr_pages) 284 unsigned long nr_pages,
285 int compress_type)
284{ 286{
285 struct async_extent *async_extent; 287 struct async_extent *async_extent;
286 288
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
290 async_extent->compressed_size = compressed_size; 292 async_extent->compressed_size = compressed_size;
291 async_extent->pages = pages; 293 async_extent->pages = pages;
292 async_extent->nr_pages = nr_pages; 294 async_extent->nr_pages = nr_pages;
295 async_extent->compress_type = compress_type;
293 list_add_tail(&async_extent->list, &cow->extents); 296 list_add_tail(&async_extent->list, &cow->extents);
294 return 0; 297 return 0;
295} 298}
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
332 unsigned long max_uncompressed = 128 * 1024; 335 unsigned long max_uncompressed = 128 * 1024;
333 int i; 336 int i;
334 int will_compress; 337 int will_compress;
338 int compress_type = root->fs_info->compress_type;
335 339
336 actual_end = min_t(u64, isize, end + 1); 340 actual_end = min_t(u64, isize, end + 1);
337again: 341again:
@@ -381,12 +385,16 @@ again:
381 WARN_ON(pages); 385 WARN_ON(pages);
382 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
383 387
384 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 388 if (BTRFS_I(inode)->force_compress)
385 total_compressed, pages, 389 compress_type = BTRFS_I(inode)->force_compress;
386 nr_pages, &nr_pages_ret, 390
387 &total_in, 391 ret = btrfs_compress_pages(compress_type,
388 &total_compressed, 392 inode->i_mapping, start,
389 max_compressed); 393 total_compressed, pages,
394 nr_pages, &nr_pages_ret,
395 &total_in,
396 &total_compressed,
397 max_compressed);
390 398
391 if (!ret) { 399 if (!ret) {
392 unsigned long offset = total_compressed & 400 unsigned long offset = total_compressed &
@@ -493,7 +501,8 @@ again:
493 * and will submit them to the elevator. 501 * and will submit them to the elevator.
494 */ 502 */
495 add_async_extent(async_cow, start, num_bytes, 503 add_async_extent(async_cow, start, num_bytes,
496 total_compressed, pages, nr_pages_ret); 504 total_compressed, pages, nr_pages_ret,
505 compress_type);
497 506
498 if (start + num_bytes < end) { 507 if (start + num_bytes < end) {
499 start += num_bytes; 508 start += num_bytes;
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
515 __set_page_dirty_nobuffers(locked_page); 524 __set_page_dirty_nobuffers(locked_page);
516 /* unlocked later on in the async handlers */ 525 /* unlocked later on in the async handlers */
517 } 526 }
518 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 527 add_async_extent(async_cow, start, end - start + 1,
528 0, NULL, 0, BTRFS_COMPRESS_NONE);
519 *num_added += 1; 529 *num_added += 1;
520 } 530 }
521 531
@@ -640,6 +650,7 @@ retry:
640 em->block_start = ins.objectid; 650 em->block_start = ins.objectid;
641 em->block_len = ins.offset; 651 em->block_len = ins.offset;
642 em->bdev = root->fs_info->fs_devices->latest_bdev; 652 em->bdev = root->fs_info->fs_devices->latest_bdev;
653 em->compress_type = async_extent->compress_type;
643 set_bit(EXTENT_FLAG_PINNED, &em->flags); 654 set_bit(EXTENT_FLAG_PINNED, &em->flags);
644 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 655 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
645 656
@@ -656,11 +667,13 @@ retry:
656 async_extent->ram_size - 1, 0); 667 async_extent->ram_size - 1, 0);
657 } 668 }
658 669
659 ret = btrfs_add_ordered_extent(inode, async_extent->start, 670 ret = btrfs_add_ordered_extent_compress(inode,
660 ins.objectid, 671 async_extent->start,
661 async_extent->ram_size, 672 ins.objectid,
662 ins.offset, 673 async_extent->ram_size,
663 BTRFS_ORDERED_COMPRESSED); 674 ins.offset,
675 BTRFS_ORDERED_COMPRESSED,
676 async_extent->compress_type);
664 BUG_ON(ret); 677 BUG_ON(ret);
665 678
666 /* 679 /*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1670 struct btrfs_ordered_extent *ordered_extent = NULL; 1683 struct btrfs_ordered_extent *ordered_extent = NULL;
1671 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1684 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1672 struct extent_state *cached_state = NULL; 1685 struct extent_state *cached_state = NULL;
1673 int compressed = 0; 1686 int compress_type = 0;
1674 int ret; 1687 int ret;
1675 bool nolock = false; 1688 bool nolock = false;
1676 1689
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1712 1725
1713 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1726 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1714 compressed = 1; 1727 compress_type = ordered_extent->compress_type;
1715 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1728 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1716 BUG_ON(compressed); 1729 BUG_ON(compress_type);
1717 ret = btrfs_mark_extent_written(trans, inode, 1730 ret = btrfs_mark_extent_written(trans, inode,
1718 ordered_extent->file_offset, 1731 ordered_extent->file_offset,
1719 ordered_extent->file_offset + 1732 ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1727 ordered_extent->disk_len, 1740 ordered_extent->disk_len,
1728 ordered_extent->len, 1741 ordered_extent->len,
1729 ordered_extent->len, 1742 ordered_extent->len,
1730 compressed, 0, 0, 1743 compress_type, 0, 0,
1731 BTRFS_FILE_EXTENT_REG); 1744 BTRFS_FILE_EXTENT_REG);
1732 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1745 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1733 ordered_extent->file_offset, 1746 ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1829 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1842 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1830 logical = em->block_start; 1843 logical = em->block_start;
1831 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1844 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1845 extent_set_compress_type(&failrec->bio_flags,
1846 em->compress_type);
1832 } 1847 }
1833 failrec->logical = logical; 1848 failrec->logical = logical;
1834 free_extent_map(em); 1849 free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3671static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3686static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3672{ 3687{
3673 struct inode *inode = dentry->d_inode; 3688 struct inode *inode = dentry->d_inode;
3689 struct btrfs_root *root = BTRFS_I(inode)->root;
3674 int err; 3690 int err;
3675 3691
3692 if (btrfs_root_readonly(root))
3693 return -EROFS;
3694
3676 err = inode_change_ok(inode, attr); 3695 err = inode_change_ok(inode, attr);
3677 if (err) 3696 if (err)
3678 return err; 3697 return err;
@@ -4928,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4928 size_t max_size; 4947 size_t max_size;
4929 unsigned long inline_size; 4948 unsigned long inline_size;
4930 unsigned long ptr; 4949 unsigned long ptr;
4950 int compress_type;
4931 4951
4932 WARN_ON(pg_offset != 0); 4952 WARN_ON(pg_offset != 0);
4953 compress_type = btrfs_file_extent_compression(leaf, item);
4933 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4954 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4934 inline_size = btrfs_file_extent_inline_item_len(leaf, 4955 inline_size = btrfs_file_extent_inline_item_len(leaf,
4935 btrfs_item_nr(leaf, path->slots[0])); 4956 btrfs_item_nr(leaf, path->slots[0]));
@@ -4939,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4939 read_extent_buffer(leaf, tmp, ptr, inline_size); 4960 read_extent_buffer(leaf, tmp, ptr, inline_size);
4940 4961
4941 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4962 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4942 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4963 ret = btrfs_decompress(compress_type, tmp, page,
4943 inline_size, max_size); 4964 extent_offset, inline_size, max_size);
4944 if (ret) { 4965 if (ret) {
4945 char *kaddr = kmap_atomic(page, KM_USER0); 4966 char *kaddr = kmap_atomic(page, KM_USER0);
4946 unsigned long copy_size = min_t(u64, 4967 unsigned long copy_size = min_t(u64,
@@ -4982,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4982 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5003 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4983 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5004 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4984 struct btrfs_trans_handle *trans = NULL; 5005 struct btrfs_trans_handle *trans = NULL;
4985 int compressed; 5006 int compress_type;
4986 5007
4987again: 5008again:
4988 read_lock(&em_tree->lock); 5009 read_lock(&em_tree->lock);
@@ -5041,7 +5062,7 @@ again:
5041 5062
5042 found_type = btrfs_file_extent_type(leaf, item); 5063 found_type = btrfs_file_extent_type(leaf, item);
5043 extent_start = found_key.offset; 5064 extent_start = found_key.offset;
5044 compressed = btrfs_file_extent_compression(leaf, item); 5065 compress_type = btrfs_file_extent_compression(leaf, item);
5045 if (found_type == BTRFS_FILE_EXTENT_REG || 5066 if (found_type == BTRFS_FILE_EXTENT_REG ||
5046 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5067 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5047 extent_end = extent_start + 5068 extent_end = extent_start +
@@ -5087,8 +5108,9 @@ again:
5087 em->block_start = EXTENT_MAP_HOLE; 5108 em->block_start = EXTENT_MAP_HOLE;
5088 goto insert; 5109 goto insert;
5089 } 5110 }
5090 if (compressed) { 5111 if (compress_type != BTRFS_COMPRESS_NONE) {
5091 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5112 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5113 em->compress_type = compress_type;
5092 em->block_start = bytenr; 5114 em->block_start = bytenr;
5093 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5115 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5094 item); 5116 item);
@@ -5122,12 +5144,14 @@ again:
5122 em->len = (copy_size + root->sectorsize - 1) & 5144 em->len = (copy_size + root->sectorsize - 1) &
5123 ~((u64)root->sectorsize - 1); 5145 ~((u64)root->sectorsize - 1);
5124 em->orig_start = EXTENT_MAP_INLINE; 5146 em->orig_start = EXTENT_MAP_INLINE;
5125 if (compressed) 5147 if (compress_type) {
5126 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5148 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5149 em->compress_type = compress_type;
5150 }
5127 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5151 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5128 if (create == 0 && !PageUptodate(page)) { 5152 if (create == 0 && !PageUptodate(page)) {
5129 if (btrfs_file_extent_compression(leaf, item) == 5153 if (btrfs_file_extent_compression(leaf, item) !=
5130 BTRFS_COMPRESS_ZLIB) { 5154 BTRFS_COMPRESS_NONE) {
5131 ret = uncompress_inline(path, inode, page, 5155 ret = uncompress_inline(path, inode, page,
5132 pg_offset, 5156 pg_offset,
5133 extent_offset, item); 5157 extent_offset, item);
@@ -6477,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6477 ei->ordered_data_close = 0; 6501 ei->ordered_data_close = 0;
6478 ei->orphan_meta_reserved = 0; 6502 ei->orphan_meta_reserved = 0;
6479 ei->dummy_inode = 0; 6503 ei->dummy_inode = 0;
6480 ei->force_compress = 0; 6504 ei->force_compress = BTRFS_COMPRESS_NONE;
6481 6505
6482 inode = &ei->vfs_inode; 6506 inode = &ei->vfs_inode;
6483 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6507 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -7105,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page)
7105 7129
7106static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) 7130static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7107{ 7131{
7132 struct btrfs_root *root = BTRFS_I(inode)->root;
7133
7134 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7135 return -EROFS;
7108 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7136 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7109 return -EACCES; 7137 return -EACCES;
7110 return generic_permission(inode, mask, flags, btrfs_check_acl); 7138 return generic_permission(inode, mask, flags, btrfs_check_acl);
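
Note the layering in btrfs_permission(): a write attempt on a read-only subvolume now fails with -EROFS before the older per-inode BTRFS_INODE_READONLY check, which keeps returning -EACCES, so userspace can tell the two conditions apart when a writable open fails. A minimal sketch of that distinction (hypothetical userspace helper, not part of the patch):

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* Rough sketch: classify why a writable open of a btrfs file failed. */
static const char *why_not_writable(const char *path)
{
	int fd = open(path, O_WRONLY);

	if (fd >= 0) {
		close(fd);
		return "writable";
	}
	if (errno == EROFS)
		return "subvolume (or whole fs) is read-only";
	if (errno == EACCES)
		return "inode is flagged read-only, or no permission";
	return "failed for another reason";
}
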
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..a506a22b522a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 unsigned int flags, oldflags; 147 unsigned int flags, oldflags;
148 int ret; 148 int ret;
149 149
150 if (btrfs_root_readonly(root))
151 return -EROFS;
152
150 if (copy_from_user(&flags, arg, sizeof(flags))) 153 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT; 154 return -EFAULT;
152 155
@@ -360,7 +363,8 @@ fail:
360} 363}
361 364
362static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 365static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
363 char *name, int namelen, u64 *async_transid) 366 char *name, int namelen, u64 *async_transid,
367 bool readonly)
364{ 368{
365 struct inode *inode; 369 struct inode *inode;
366 struct dentry *parent; 370 struct dentry *parent;
@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
378 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 382 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 pending_snapshot->dentry = dentry; 383 pending_snapshot->dentry = dentry;
380 pending_snapshot->root = root; 384 pending_snapshot->root = root;
385 pending_snapshot->readonly = readonly;
381 386
382 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 387 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
383 if (IS_ERR(trans)) { 388 if (IS_ERR(trans)) {
@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
509static noinline int btrfs_mksubvol(struct path *parent, 514static noinline int btrfs_mksubvol(struct path *parent,
510 char *name, int namelen, 515 char *name, int namelen,
511 struct btrfs_root *snap_src, 516 struct btrfs_root *snap_src,
512 u64 *async_transid) 517 u64 *async_transid, bool readonly)
513{ 518{
514 struct inode *dir = parent->dentry->d_inode; 519 struct inode *dir = parent->dentry->d_inode;
515 struct dentry *dentry; 520 struct dentry *dentry;
@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
541 546
542 if (snap_src) { 547 if (snap_src) {
543 error = create_snapshot(snap_src, dentry, 548 error = create_snapshot(snap_src, dentry,
544 name, namelen, async_transid); 549 name, namelen, async_transid, readonly);
545 } else { 550 } else {
546 error = create_subvol(BTRFS_I(dir)->root, dentry, 551 error = create_subvol(BTRFS_I(dir)->root, dentry,
547 name, namelen, async_transid); 552 name, namelen, async_transid);
@@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
638 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 643 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
639 struct btrfs_ordered_extent *ordered; 644 struct btrfs_ordered_extent *ordered;
640 struct page *page; 645 struct page *page;
646 struct btrfs_super_block *disk_super;
641 unsigned long last_index; 647 unsigned long last_index;
642 unsigned long ra_pages = root->fs_info->bdi.ra_pages; 648 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
643 unsigned long total_read = 0; 649 unsigned long total_read = 0;
650 u64 features;
644 u64 page_start; 651 u64 page_start;
645 u64 page_end; 652 u64 page_end;
646 u64 last_len = 0; 653 u64 last_len = 0;
@@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
648 u64 defrag_end = 0; 655 u64 defrag_end = 0;
649 unsigned long i; 656 unsigned long i;
650 int ret; 657 int ret;
658 int compress_type = BTRFS_COMPRESS_ZLIB;
659
660 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
661 if (range->compress_type > BTRFS_COMPRESS_TYPES)
662 return -EINVAL;
663 if (range->compress_type)
664 compress_type = range->compress_type;
665 }
651 666
652 if (inode->i_size == 0) 667 if (inode->i_size == 0)
653 return 0; 668 return 0;
@@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
683 total_read++; 698 total_read++;
684 mutex_lock(&inode->i_mutex); 699 mutex_lock(&inode->i_mutex);
685 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 700 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
686 BTRFS_I(inode)->force_compress = 1; 701 BTRFS_I(inode)->force_compress = compress_type;
687 702
688 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 703 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
689 if (ret) 704 if (ret)
@@ -781,10 +796,17 @@ loop_unlock:
781 atomic_dec(&root->fs_info->async_submit_draining); 796 atomic_dec(&root->fs_info->async_submit_draining);
782 797
783 mutex_lock(&inode->i_mutex); 798 mutex_lock(&inode->i_mutex);
784 BTRFS_I(inode)->force_compress = 0; 799 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
785 mutex_unlock(&inode->i_mutex); 800 mutex_unlock(&inode->i_mutex);
786 } 801 }
787 802
803 disk_super = &root->fs_info->super_copy;
804 features = btrfs_super_incompat_flags(disk_super);
805 if (range->compress_type == BTRFS_COMPRESS_LZO) {
806 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
807 btrfs_set_super_incompat_flags(disk_super, features);
808 }
809
788 return 0; 810 return 0;
789 811
790err_reservations: 812err_reservations:
@@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
901 char *name, 923 char *name,
902 unsigned long fd, 924 unsigned long fd,
903 int subvol, 925 int subvol,
904 u64 *transid) 926 u64 *transid,
927 bool readonly)
905{ 928{
906 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 929 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
907 struct file *src_file; 930 struct file *src_file;
@@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
919 942
920 if (subvol) { 943 if (subvol) {
921 ret = btrfs_mksubvol(&file->f_path, name, namelen, 944 ret = btrfs_mksubvol(&file->f_path, name, namelen,
922 NULL, transid); 945 NULL, transid, readonly);
923 } else { 946 } else {
924 struct inode *src_inode; 947 struct inode *src_inode;
925 src_file = fget(fd); 948 src_file = fget(fd);
@@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
938 } 961 }
939 ret = btrfs_mksubvol(&file->f_path, name, namelen, 962 ret = btrfs_mksubvol(&file->f_path, name, namelen,
940 BTRFS_I(src_inode)->root, 963 BTRFS_I(src_inode)->root,
941 transid); 964 transid, readonly);
942 fput(src_file); 965 fput(src_file);
943 } 966 }
944out: 967out:
@@ -946,58 +969,139 @@ out:
946} 969}
947 970
948static noinline int btrfs_ioctl_snap_create(struct file *file, 971static noinline int btrfs_ioctl_snap_create(struct file *file,
949 void __user *arg, int subvol, 972 void __user *arg, int subvol)
950 int v2)
951{ 973{
952 struct btrfs_ioctl_vol_args *vol_args = NULL; 974 struct btrfs_ioctl_vol_args *vol_args;
953 struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
954 char *name;
955 u64 fd;
956 int ret; 975 int ret;
957 976
958 if (v2) { 977 vol_args = memdup_user(arg, sizeof(*vol_args));
959 u64 transid = 0; 978 if (IS_ERR(vol_args))
960 u64 *ptr = NULL; 979 return PTR_ERR(vol_args);
980 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
961 981
962 vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); 982 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
963 if (IS_ERR(vol_args_v2)) 983 vol_args->fd, subvol,
964 return PTR_ERR(vol_args_v2); 984 NULL, false);
965 985
966 if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { 986 kfree(vol_args);
967 ret = -EINVAL; 987 return ret;
968 goto out; 988}
969 }
970
971 name = vol_args_v2->name;
972 fd = vol_args_v2->fd;
973 vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
974 989
975 if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) 990static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
976 ptr = &transid; 991 void __user *arg, int subvol)
992{
993 struct btrfs_ioctl_vol_args_v2 *vol_args;
994 int ret;
995 u64 transid = 0;
996 u64 *ptr = NULL;
997 bool readonly = false;
977 998
978 ret = btrfs_ioctl_snap_create_transid(file, name, fd, 999 vol_args = memdup_user(arg, sizeof(*vol_args));
979 subvol, ptr); 1000 if (IS_ERR(vol_args))
1001 return PTR_ERR(vol_args);
1002 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
980 1003
981 if (ret == 0 && ptr && 1004 if (vol_args->flags &
982 copy_to_user(arg + 1005 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
983 offsetof(struct btrfs_ioctl_vol_args_v2, 1006 ret = -EOPNOTSUPP;
984 transid), ptr, sizeof(*ptr))) 1007 goto out;
985 ret = -EFAULT;
986 } else {
987 vol_args = memdup_user(arg, sizeof(*vol_args));
988 if (IS_ERR(vol_args))
989 return PTR_ERR(vol_args);
990 name = vol_args->name;
991 fd = vol_args->fd;
992 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
993
994 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
995 subvol, NULL);
996 } 1008 }
1009
1010 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1011 ptr = &transid;
1012 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1013 readonly = true;
1014
1015 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1016 vol_args->fd, subvol,
1017 ptr, readonly);
1018
1019 if (ret == 0 && ptr &&
1020 copy_to_user(arg +
1021 offsetof(struct btrfs_ioctl_vol_args_v2,
1022 transid), ptr, sizeof(*ptr)))
1023 ret = -EFAULT;
997out: 1024out:
998 kfree(vol_args); 1025 kfree(vol_args);
999 kfree(vol_args_v2); 1026 return ret;
1027}
1000 1028
1029static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1030 void __user *arg)
1031{
1032 struct inode *inode = fdentry(file)->d_inode;
1033 struct btrfs_root *root = BTRFS_I(inode)->root;
1034 int ret = 0;
1035 u64 flags = 0;
1036
1037 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1038 return -EINVAL;
1039
1040 down_read(&root->fs_info->subvol_sem);
1041 if (btrfs_root_readonly(root))
1042 flags |= BTRFS_SUBVOL_RDONLY;
1043 up_read(&root->fs_info->subvol_sem);
1044
1045 if (copy_to_user(arg, &flags, sizeof(flags)))
1046 ret = -EFAULT;
1047
1048 return ret;
1049}
1050
1051static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1052 void __user *arg)
1053{
1054 struct inode *inode = fdentry(file)->d_inode;
1055 struct btrfs_root *root = BTRFS_I(inode)->root;
1056 struct btrfs_trans_handle *trans;
1057 u64 root_flags;
1058 u64 flags;
1059 int ret = 0;
1060
1061 if (root->fs_info->sb->s_flags & MS_RDONLY)
1062 return -EROFS;
1063
1064 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1065 return -EINVAL;
1066
1067 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT;
1069
1070 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL;
1072
1073 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP;
1075
1076 down_write(&root->fs_info->subvol_sem);
1077
1078 /* nothing to do */
1079 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1080 goto out;
1081
1082 root_flags = btrfs_root_flags(&root->root_item);
1083 if (flags & BTRFS_SUBVOL_RDONLY)
1084 btrfs_set_root_flags(&root->root_item,
1085 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1086 else
1087 btrfs_set_root_flags(&root->root_item,
1088 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1089
1090 trans = btrfs_start_transaction(root, 1);
1091 if (IS_ERR(trans)) {
1092 ret = PTR_ERR(trans);
1093 goto out_reset;
1094 }
1095
1096 ret = btrfs_update_root(trans, root,
1097 &root->root_key, &root->root_item);
1098
1099 btrfs_commit_transaction(trans, root);
1100out_reset:
1101 if (ret)
1102 btrfs_set_root_flags(&root->root_item, root_flags);
1103out:
1104 up_write(&root->fs_info->subvol_sem);
1001 return ret; 1105 return ret;
1002} 1106}
1003 1107
@@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1509 struct btrfs_ioctl_defrag_range_args *range; 1613 struct btrfs_ioctl_defrag_range_args *range;
1510 int ret; 1614 int ret;
1511 1615
1616 if (btrfs_root_readonly(root))
1617 return -EROFS;
1618
1512 ret = mnt_want_write(file->f_path.mnt); 1619 ret = mnt_want_write(file->f_path.mnt);
1513 if (ret) 1620 if (ret)
1514 return ret; 1621 return ret;
@@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1637 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 1744 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1638 return -EINVAL; 1745 return -EINVAL;
1639 1746
1747 if (btrfs_root_readonly(root))
1748 return -EROFS;
1749
1640 ret = mnt_want_write(file->f_path.mnt); 1750 ret = mnt_want_write(file->f_path.mnt);
1641 if (ret) 1751 if (ret)
1642 return ret; 1752 return ret;
@@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1958 if (file->private_data) 2068 if (file->private_data)
1959 goto out; 2069 goto out;
1960 2070
2071 ret = -EROFS;
2072 if (btrfs_root_readonly(root))
2073 goto out;
2074
1961 ret = mnt_want_write(file->f_path.mnt); 2075 ret = mnt_want_write(file->f_path.mnt);
1962 if (ret) 2076 if (ret)
1963 goto out; 2077 goto out;
@@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
2257 case FS_IOC_GETVERSION: 2371 case FS_IOC_GETVERSION:
2258 return btrfs_ioctl_getversion(file, argp); 2372 return btrfs_ioctl_getversion(file, argp);
2259 case BTRFS_IOC_SNAP_CREATE: 2373 case BTRFS_IOC_SNAP_CREATE:
2260 return btrfs_ioctl_snap_create(file, argp, 0, 0); 2374 return btrfs_ioctl_snap_create(file, argp, 0);
2261 case BTRFS_IOC_SNAP_CREATE_V2: 2375 case BTRFS_IOC_SNAP_CREATE_V2:
2262 return btrfs_ioctl_snap_create(file, argp, 0, 1); 2376 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2263 case BTRFS_IOC_SUBVOL_CREATE: 2377 case BTRFS_IOC_SUBVOL_CREATE:
2264 return btrfs_ioctl_snap_create(file, argp, 1, 0); 2378 return btrfs_ioctl_snap_create(file, argp, 1);
2265 case BTRFS_IOC_SNAP_DESTROY: 2379 case BTRFS_IOC_SNAP_DESTROY:
2266 return btrfs_ioctl_snap_destroy(file, argp); 2380 return btrfs_ioctl_snap_destroy(file, argp);
2381 case BTRFS_IOC_SUBVOL_GETFLAGS:
2382 return btrfs_ioctl_subvol_getflags(file, argp);
2383 case BTRFS_IOC_SUBVOL_SETFLAGS:
2384 return btrfs_ioctl_subvol_setflags(file, argp);
2267 case BTRFS_IOC_DEFAULT_SUBVOL: 2385 case BTRFS_IOC_DEFAULT_SUBVOL:
2268 return btrfs_ioctl_default_subvol(file, argp); 2386 return btrfs_ioctl_default_subvol(file, argp);
2269 case BTRFS_IOC_DEFRAG: 2387 case BTRFS_IOC_DEFRAG:
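
For reference, this is roughly how userspace consumes the new BTRFS_SUBVOL_RDONLY flag through the v2 ioctl; a sketch under the ABI visible in ioctl.h below, with error handling trimmed (the function and variable names are illustrative, not part of the patch):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* the btrfs header shown below */

/* Sketch: create a read-only snapshot of src_subvol under dest_dir. */
static int snapshot_readonly(const char *src_subvol, const char *dest_dir,
			     const char *name)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int src_fd = open(src_subvol, O_RDONLY);
	int dir_fd = open(dest_dir, O_RDONLY);
	int ret;

	memset(&args, 0, sizeof(args));
	args.fd = src_fd;			/* subvolume to snapshot */
	args.flags = BTRFS_SUBVOL_RDONLY;	/* the new flag */
	strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

	/* the ioctl is issued against the destination directory */
	ret = ioctl(dir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args);
	close(src_fd);
	close(dir_fd);
	return ret;
}
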
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
34 35
35#define BTRFS_SUBVOL_NAME_MAX 4039 36#define BTRFS_SUBVOL_NAME_MAX 4039
36struct btrfs_ioctl_vol_args_v2 { 37struct btrfs_ioctl_vol_args_v2 {
@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
133 */ 134 */
134 __u32 extent_thresh; 135 __u32 extent_thresh;
135 136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
136 /* spare for later */ 144 /* spare for later */
137 __u32 unused[5]; 145 __u32 unused[4];
138}; 146};
139 147
140struct btrfs_ioctl_space_info { 148struct btrfs_ioctl_space_info {
@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
193#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
194#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
195 struct btrfs_ioctl_vol_args_v2) 203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
196#endif 206#endif
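
Flipping an existing subvolume between read-only and writable goes through the two new ioctls; per the i_ino check in btrfs_ioctl_subvol_setflags() above, the file descriptor must refer to the subvolume's own root directory. A hedged sketch (helper name is illustrative):

#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"	/* the btrfs header above */

/* Sketch: set or clear BTRFS_SUBVOL_RDONLY on an open subvolume root. */
static int set_subvol_readonly(int fd, int readonly)
{
	__u64 flags;

	if (ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags))
		return -1;
	if (readonly)
		flags |= BTRFS_SUBVOL_RDONLY;
	else
		flags &= ~BTRFS_SUBVOL_RDONLY;
	return ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags);
}
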
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..cc9b450399df
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283
284 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in);
286
287 tot_in = LZO_LEN;
288 in_offset = LZO_LEN;
289 tot_len = min_t(size_t, srclen, tot_len);
290 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
291
292 tot_out = 0;
293 pg_offset = 0;
294
295 while (tot_in < tot_len) {
296 in_len = read_compress_length(data_in + in_offset);
297 in_page_bytes_left -= LZO_LEN;
298 in_offset += LZO_LEN;
299 tot_in += LZO_LEN;
300
301 tot_in += in_len;
302 working_bytes = in_len;
303
304 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset;
307 bytes = in_len;
308 goto cont;
309 }
310
311 /* copy bytes from the pages into the working buffer */
312 buf = workspace->cbuf;
313 buf_offset = 0;
314 while (working_bytes) {
315 bytes = min(working_bytes, in_page_bytes_left);
316
317 memcpy(buf + buf_offset, data_in + in_offset, bytes);
318 buf_offset += bytes;
319cont:
320 working_bytes -= bytes;
321 in_page_bytes_left -= bytes;
322 in_offset += bytes;
323
324 /* check if we need to pick another page */
325 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
326 || in_page_bytes_left == 0) {
327 tot_in += in_page_bytes_left;
328
329 if (working_bytes == 0 && tot_in >= tot_len)
330 break;
331
332 kunmap(pages_in[page_in_index]);
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1;
336 data_in = NULL;
337 goto done;
338 }
339 data_in = kmap(pages_in[page_in_index]);
340
341 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0;
343 }
344 }
345
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len);
349 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1;
352 break;
353 }
354
355 buf_start = tot_out;
356 tot_out += out_len;
357
358 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
359 tot_out, disk_start,
360 bvec, vcnt,
361 &page_out_index, &pg_offset);
362 if (ret2 == 0)
363 break;
364 }
365done:
366 if (data_in)
367 kunmap(pages_in[page_in_index]);
368 return ret;
369}
370
371static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
372 struct page *dest_page,
373 unsigned long start_byte,
374 size_t srclen, size_t destlen)
375{
376 struct workspace *workspace = list_entry(ws, struct workspace, list);
377 size_t in_len;
378 size_t out_len;
379 size_t tot_len;
380 int ret = 0;
381 char *kaddr;
382 unsigned long bytes;
383
384 BUG_ON(srclen < LZO_LEN);
385
386 tot_len = read_compress_length(data_in);
387 data_in += LZO_LEN;
388
389 in_len = read_compress_length(data_in);
390 data_in += LZO_LEN;
391
392 out_len = PAGE_CACHE_SIZE;
393 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
394 if (ret != LZO_E_OK) {
395 printk(KERN_WARNING "btrfs decompress failed!\n");
396 ret = -1;
397 goto out;
398 }
399
400 if (out_len < start_byte) {
401 ret = -1;
402 goto out;
403 }
404
405 bytes = min_t(unsigned long, destlen, out_len - start_byte);
406
407 kaddr = kmap_atomic(dest_page, KM_USER0);
408 memcpy(kaddr, workspace->buf + start_byte, bytes);
409 kunmap_atomic(kaddr, KM_USER0);
410out:
411 return ret;
412}
413
414struct btrfs_compress_op btrfs_lzo_compress = {
415 .alloc_workspace = lzo_alloc_workspace,
416 .free_workspace = lzo_free_workspace,
417 .compress_pages = lzo_compress_pages,
418 .decompress_biovec = lzo_decompress_biovec,
419 .decompress = lzo_decompress,
420};
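
The format lzo_compress_pages() emits is worth spelling out: the first 4 bytes hold the little-endian length of the whole stream (header included), and each compressed segment is preceded by its own 4-byte little-endian length. A segment's data may span pages, but a length header never does; if fewer than 4 bytes remain on a page, the remainder is zero padding and the next header starts on the following page. A rough userspace-flavoured walker over a flattened copy of such a stream (hypothetical helper, PAGE_SZ standing in for PAGE_CACHE_SIZE, little-endian host assumed):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define LZO_LEN	4
#define PAGE_SZ	4096	/* stand-in for PAGE_CACHE_SIZE */

/* read a 4-byte little-endian length, like read_compress_length() */
static uint32_t read_len(const uint8_t *p)
{
	uint32_t v;

	memcpy(&v, p, LZO_LEN);
	return v;	/* assumes a little-endian host */
}

/* invoke cb() once per compressed segment in the stream */
static void walk_lzo_stream(const uint8_t *buf,
			    void (*cb)(const uint8_t *seg, uint32_t len))
{
	size_t tot_len = read_len(buf);		/* whole-stream length */
	size_t off = LZO_LEN;

	while (off + LZO_LEN <= tot_len) {
		size_t room = PAGE_SZ - (off % PAGE_SZ);

		/* headers never straddle a page: fewer than 4 bytes of
		 * room means the rest of the page is zero padding */
		if (room < LZO_LEN)
			off += room;
		if (off + LZO_LEN > tot_len)
			break;

		uint32_t seg_len = read_len(buf + off);
		off += LZO_LEN;
		cb(buf + off, seg_len);	/* segment data may span pages */
		off += seg_len;
	}
}
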
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ae7737e352c9..2b61e1ddcd99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio) 175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 221 u64 start, u64 len, u64 disk_len, int type)
221{ 222{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 223 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0); 224 disk_len, type, 0,
225 BTRFS_COMPRESS_NONE);
224} 226}
225 227
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 228int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 229 u64 start, u64 len, u64 disk_len, int type)
228{ 230{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 231 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1); 232 disk_len, type, 1,
233 BTRFS_COMPRESS_NONE);
234}
235
236int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
237 u64 start, u64 len, u64 disk_len,
238 int type, int compress_type)
239{
240 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
241 disk_len, type, 0,
242 compress_type);
231} 243}
232 244
233/* 245/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 61dca83119dd..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
148 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
149int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
150 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
151int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
152 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
153 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 22acdaa78ce1..b2130c46fdb5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
54 54
55static const struct super_operations btrfs_super_ops; 55static const struct super_operations btrfs_super_ops;
56 56
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
58 char nbuf[16])
59{
60 char *errstr = NULL;
61
62 switch (errno) {
63 case -EIO:
64 errstr = "IO failure";
65 break;
66 case -ENOMEM:
67 errstr = "Out of memory";
68 break;
69 case -EROFS:
70 errstr = "Readonly filesystem";
71 break;
72 default:
73 if (nbuf) {
74 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
75 errstr = nbuf;
76 }
77 break;
78 }
79
80 return errstr;
81}
82
83static void __save_error_info(struct btrfs_fs_info *fs_info)
84{
85 /*
86 * today we only save the error info in RAM. Long term we'll
87 * also send it down to the disk.
88 */
89 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
90}
91
92/* NOTE:
93 * We move the write_super work to umount in order to avoid a
94 * deadlock, because umount holds all the locks.
95 */
96static void save_error_info(struct btrfs_fs_info *fs_info)
97{
98 __save_error_info(fs_info);
99}
100
101/* btrfs handles errors by forcing the filesystem read-only */
102static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
103{
104 struct super_block *sb = fs_info->sb;
105
106 if (sb->s_flags & MS_RDONLY)
107 return;
108
109 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
110 sb->s_flags |= MS_RDONLY;
111 printk(KERN_INFO "btrfs is forced readonly\n");
112 }
113}
114
115/*
116 * __btrfs_std_error decodes expected errors from the caller and
117 * invokes the approciate error response.
118 */
119void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
120 unsigned int line, int errno)
121{
122 struct super_block *sb = fs_info->sb;
123 char nbuf[16];
124 const char *errstr;
125
126 /*
127 * Special case: if the error is EROFS, and we're already
128 * under MS_RDONLY, then it is safe here.
129 */
130 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
131 return;
132
133 errstr = btrfs_decode_error(fs_info, errno, nbuf);
134 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
135 sb->s_id, function, line, errstr);
136 save_error_info(fs_info);
137
138 btrfs_handle_error(fs_info);
139}
140
57static void btrfs_put_super(struct super_block *sb) 141static void btrfs_put_super(struct super_block *sb)
58{ 142{
59 struct btrfs_root *root = btrfs_sb(sb); 143 struct btrfs_root *root = btrfs_sb(sb);
@@ -69,9 +153,9 @@ enum {
69 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 153 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 154 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, 157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
74 Opt_user_subvol_rm_allowed, 158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
75}; 159};
76 160
77static match_table_t tokens = { 161static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
86 {Opt_alloc_start, "alloc_start=%s"}, 170 {Opt_alloc_start, "alloc_start=%s"},
87 {Opt_thread_pool, "thread_pool=%d"}, 171 {Opt_thread_pool, "thread_pool=%d"},
88 {Opt_compress, "compress"}, 172 {Opt_compress, "compress"},
173 {Opt_compress_type, "compress=%s"},
89 {Opt_compress_force, "compress-force"}, 174 {Opt_compress_force, "compress-force"},
175 {Opt_compress_force_type, "compress-force=%s"},
90 {Opt_ssd, "ssd"}, 176 {Opt_ssd, "ssd"},
91 {Opt_ssd_spread, "ssd_spread"}, 177 {Opt_ssd_spread, "ssd_spread"},
92 {Opt_nossd, "nossd"}, 178 {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
112 char *p, *num, *orig; 198 char *p, *num, *orig;
113 int intarg; 199 int intarg;
114 int ret = 0; 200 int ret = 0;
201 char *compress_type;
202 bool compress_force = false;
115 203
116 if (!options) 204 if (!options)
117 return 0; 205 return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, NODATACOW); 242 btrfs_set_opt(info->mount_opt, NODATACOW);
155 btrfs_set_opt(info->mount_opt, NODATASUM); 243 btrfs_set_opt(info->mount_opt, NODATASUM);
156 break; 244 break;
157 case Opt_compress:
158 printk(KERN_INFO "btrfs: use compression\n");
159 btrfs_set_opt(info->mount_opt, COMPRESS);
160 break;
161 case Opt_compress_force: 245 case Opt_compress_force:
162 printk(KERN_INFO "btrfs: forcing compression\n"); 246 case Opt_compress_force_type:
163 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 247 compress_force = true;
248 case Opt_compress:
249 case Opt_compress_type:
250 if (token == Opt_compress ||
251 token == Opt_compress_force ||
252 strcmp(args[0].from, "zlib") == 0) {
253 compress_type = "zlib";
254 info->compress_type = BTRFS_COMPRESS_ZLIB;
255 } else if (strcmp(args[0].from, "lzo") == 0) {
256 compress_type = "lzo";
257 info->compress_type = BTRFS_COMPRESS_LZO;
258 } else {
259 ret = -EINVAL;
260 goto out;
261 }
262
164 btrfs_set_opt(info->mount_opt, COMPRESS); 263 btrfs_set_opt(info->mount_opt, COMPRESS);
264 if (compress_force) {
265 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
266 pr_info("btrfs: force %s compression\n",
267 compress_type);
268 } else
269 pr_info("btrfs: use %s compression\n",
270 compress_type);
165 break; 271 break;
166 case Opt_ssd: 272 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 273 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -753,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
753 return 0; 859 return 0;
754} 860}
755 861
862/*
863 * Helper to calculate the free space on the devices that can be used
864 * to store file data.
865 */
866static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
867{
868 struct btrfs_fs_info *fs_info = root->fs_info;
869 struct btrfs_device_info *devices_info;
870 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
871 struct btrfs_device *device;
872 u64 skip_space;
873 u64 type;
874 u64 avail_space;
875 u64 used_space;
876 u64 min_stripe_size;
877 int min_stripes = 1;
878 int i = 0, nr_devices;
879 int ret;
880
881 nr_devices = fs_info->fs_devices->rw_devices;
882 BUG_ON(!nr_devices);
883
884 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
885 GFP_NOFS);
886 if (!devices_info)
887 return -ENOMEM;
888
889 /* calc min stripe number for data space allocation */
890 type = btrfs_get_alloc_profile(root, 1);
891 if (type & BTRFS_BLOCK_GROUP_RAID0)
892 min_stripes = 2;
893 else if (type & BTRFS_BLOCK_GROUP_RAID1)
894 min_stripes = 2;
895 else if (type & BTRFS_BLOCK_GROUP_RAID10)
896 min_stripes = 4;
897
898 if (type & BTRFS_BLOCK_GROUP_DUP)
899 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
900 else
901 min_stripe_size = BTRFS_STRIPE_LEN;
902
903 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
904 if (!device->in_fs_metadata)
905 continue;
906
907 avail_space = device->total_bytes - device->bytes_used;
908
909 /* align with stripe_len */
910 do_div(avail_space, BTRFS_STRIPE_LEN);
911 avail_space *= BTRFS_STRIPE_LEN;
912
913 /*
914 * In order to avoid overwriting the superblock on the drive,
915 * btrfs starts at an offset of at least 1MB when doing chunk
916 * allocation.
917 */
918 skip_space = 1024 * 1024;
919
920 /* user can set the offset in fs_info->alloc_start. */
921 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
922 device->total_bytes)
923 skip_space = max(fs_info->alloc_start, skip_space);
924
925 /*
926 * btrfs cannot use the free space in [0, skip_space - 1];
927 * we must subtract it from the total. To implement this, we
928 * account the used space in this range first.
929 */
930 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
931 &used_space);
932 if (ret) {
933 kfree(devices_info);
934 return ret;
935 }
936
937 /* calc the free space in [0, skip_space - 1] */
938 skip_space -= used_space;
939
940 /*
941 * we can use the free space in [0, skip_space - 1], subtract
942 * it from the total.
943 */
944 if (avail_space && avail_space >= skip_space)
945 avail_space -= skip_space;
946 else
947 avail_space = 0;
948
949 if (avail_space < min_stripe_size)
950 continue;
951
952 devices_info[i].dev = device;
953 devices_info[i].max_avail = avail_space;
954
955 i++;
956 }
957
958 nr_devices = i;
959
960 btrfs_descending_sort_devices(devices_info, nr_devices);
961
962 i = nr_devices - 1;
963 avail_space = 0;
964 while (nr_devices >= min_stripes) {
965 if (devices_info[i].max_avail >= min_stripe_size) {
966 int j;
967 u64 alloc_size;
968
969 avail_space += devices_info[i].max_avail * min_stripes;
970 alloc_size = devices_info[i].max_avail;
971 for (j = i + 1 - min_stripes; j <= i; j++)
972 devices_info[j].max_avail -= alloc_size;
973 }
974 i--;
975 nr_devices--;
976 }
977
978 kfree(devices_info);
979 *free_bytes = avail_space;
980 return 0;
981}
982
756static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 983static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
757{ 984{
758 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 985 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -760,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
760 struct list_head *head = &root->fs_info->space_info; 987 struct list_head *head = &root->fs_info->space_info;
761 struct btrfs_space_info *found; 988 struct btrfs_space_info *found;
762 u64 total_used = 0; 989 u64 total_used = 0;
763 u64 total_used_data = 0; 990 u64 total_free_data = 0;
764 int bits = dentry->d_sb->s_blocksize_bits; 991 int bits = dentry->d_sb->s_blocksize_bits;
765 __be32 *fsid = (__be32 *)root->fs_info->fsid; 992 __be32 *fsid = (__be32 *)root->fs_info->fsid;
993 int ret;
766 994
995 /* holding chunk_mutex to avoid allocating new chunks */
996 mutex_lock(&root->fs_info->chunk_mutex);
767 rcu_read_lock(); 997 rcu_read_lock();
768 list_for_each_entry_rcu(found, head, list) { 998 list_for_each_entry_rcu(found, head, list) {
769 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | 999 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
770 BTRFS_BLOCK_GROUP_SYSTEM)) 1000 total_free_data += found->disk_total - found->disk_used;
771 total_used_data += found->disk_total; 1001 total_free_data -=
772 else 1002 btrfs_account_ro_block_groups_free_space(found);
773 total_used_data += found->disk_used; 1003 }
1004
774 total_used += found->disk_used; 1005 total_used += found->disk_used;
775 } 1006 }
776 rcu_read_unlock(); 1007 rcu_read_unlock();
@@ -778,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
778 buf->f_namelen = BTRFS_NAME_LEN; 1009 buf->f_namelen = BTRFS_NAME_LEN;
779 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1010 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
780 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1011 buf->f_bfree = buf->f_blocks - (total_used >> bits);
781 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
782 buf->f_bsize = dentry->d_sb->s_blocksize; 1012 buf->f_bsize = dentry->d_sb->s_blocksize;
783 buf->f_type = BTRFS_SUPER_MAGIC; 1013 buf->f_type = BTRFS_SUPER_MAGIC;
1014 buf->f_bavail = total_free_data;
1015 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1016 if (ret) {
1017 mutex_unlock(&root->fs_info->chunk_mutex);
1018 return ret;
1019 }
1020 buf->f_bavail += total_free_data;
1021 buf->f_bavail = buf->f_bavail >> bits;
1022 mutex_unlock(&root->fs_info->chunk_mutex);
784 1023
785 /* We treat it as constant endianness (it doesn't matter _which_) 1024 /* We treat it as constant endianness (it doesn't matter _which_)
786 because we want the fsid to come out the same whether mounted 1025 because we want the fsid to come out the same whether mounted
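The new f_bavail is assembled in two parts: free bytes inside already-allocated DATA chunks (the rcu loop above), plus the estimate of still-allocatable space from btrfs_calc_avail_data_space(). A minimal sketch of the first pass in isolation, where ro_free stands in for btrfs_account_ro_block_groups_free_space() and the types are simplified:

    #include <stdint.h>

    struct space_info { uint64_t flags, disk_total, disk_used, ro_free; };
    #define BLOCK_GROUP_DATA 0x1ULL   /* stand-in for BTRFS_BLOCK_GROUP_DATA */

    /* free bytes inside allocated DATA chunks, minus free space that is
     * trapped in read-only block groups and so not really available */
    static uint64_t free_data_in_chunks(const struct space_info *infos, int n)
    {
            uint64_t free_data = 0;
            int i;

            for (i = 0; i < n; i++) {
                    if (!(infos[i].flags & BLOCK_GROUP_DATA))
                            continue;
                    free_data += infos[i].disk_total - infos[i].disk_used;
                    free_data -= infos[i].ro_free;
            }
            return free_data;
    }

The sum is then shifted right by s_blocksize_bits to convert bytes to blocks, as the hunk shows.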
@@ -897,10 +1136,14 @@ static int __init init_btrfs_fs(void)
897 if (err) 1136 if (err)
898 return err; 1137 return err;
899 1138
900 err = btrfs_init_cachep(); 1139 err = btrfs_init_compress();
901 if (err) 1140 if (err)
902 goto free_sysfs; 1141 goto free_sysfs;
903 1142
1143 err = btrfs_init_cachep();
1144 if (err)
1145 goto free_compress;
1146
904 err = extent_io_init(); 1147 err = extent_io_init();
905 if (err) 1148 if (err)
906 goto free_cachep; 1149 goto free_cachep;
@@ -928,6 +1171,8 @@ free_extent_io:
928 extent_io_exit(); 1171 extent_io_exit();
929free_cachep: 1172free_cachep:
930 btrfs_destroy_cachep(); 1173 btrfs_destroy_cachep();
1174free_compress:
1175 btrfs_exit_compress();
931free_sysfs: 1176free_sysfs:
932 btrfs_exit_sysfs(); 1177 btrfs_exit_sysfs();
933 return err; 1178 return err;
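The hunk threads btrfs_init_compress() into the existing error-unwind ladder: each init step gains a label that undoes every earlier step, so a failure at step N unwinds N-1..1 in reverse order. A minimal sketch of the pattern; init_a, init_b and exit_a are stand-ins, not kernel functions:

    int init_a(void);
    int init_b(void);
    void exit_a(void);

    static int init_sketch(void)
    {
            int err;

            err = init_a();
            if (err)
                    return err;            /* nothing to unwind yet */

            err = init_b();
            if (err)
                    goto free_a;           /* undo step 1 only */

            return 0;

    free_a:
            exit_a();
            return err;
    }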
@@ -942,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
942 unregister_filesystem(&btrfs_fs_type); 1187 unregister_filesystem(&btrfs_fs_type);
943 btrfs_exit_sysfs(); 1188 btrfs_exit_sysfs();
944 btrfs_cleanup_fs_uuids(); 1189 btrfs_cleanup_fs_uuids();
945 btrfs_zlib_exit(); 1190 btrfs_exit_compress();
946} 1191}
947 1192
948module_init(init_btrfs_fs) 1193module_init(init_btrfs_fs)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f50e931fc217..bae5c7b8bbe2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
183 int ret; 183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
184again: 187again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 189 if (!h)
@@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
910 u64 to_reserve = 0; 913 u64 to_reserve = 0;
911 u64 index = 0; 914 u64 index = 0;
912 u64 objectid; 915 u64 objectid;
916 u64 root_flags;
913 917
914 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 918 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
915 if (!new_root_item) { 919 if (!new_root_item) {
@@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
968 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
969 973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
980
970 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
971 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
972 btrfs_set_lock_blocking(old); 983 btrfs_set_lock_blocking(old);
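The root_flags lines above are a plain read-modify-write of the on-disk root item flags: set or clear exactly one bit without disturbing the others. In isolation, with a stand-in for BTRFS_ROOT_SUBVOL_RDONLY:

    #include <stdint.h>

    #define SUBVOL_RDONLY (1ULL << 0)   /* stand-in, not the kernel define */

    static uint64_t apply_readonly(uint64_t root_flags, int readonly)
    {
            if (readonly)
                    root_flags |= SUBVOL_RDONLY;
            else
                    root_flags &= ~SUBVOL_RDONLY;
            return root_flags;
    }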
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 62 struct btrfs_block_rsv block_rsv;
 63 /* extra metadata reservation for relocation */ 63
64 int error; 64 int error;
65 bool readonly;
65 struct list_head list; 66 struct list_head list;
66}; 67};
67 68
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1718e1a5c320..d158530233b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -600,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
600 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
601 602
602 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
603 if (!bh) 604 if (!bh) {
605 ret = -EINVAL;
604 goto error_close; 606 goto error_close;
607 }
605 608
606 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
607 devid = btrfs_stack_device_id(&disk_super->dev_item); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -703,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
703 goto error_close; 706 goto error_close;
704 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
705 if (!bh) { 708 if (!bh) {
706 ret = -EIO; 709 ret = -EINVAL;
707 goto error_close; 710 goto error_close;
708 } 711 }
709 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -729,59 +732,167 @@ error:
729 return ret; 732 return ret;
730} 733}
731 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
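The five-way if/else chain inside the loop above is overlap accounting: each device extent is a half-open range [key.offset, extent_end) and the query range [start, end] is inclusive. A compact min/max form that computes the same per-extent contribution:

    #include <stdint.h>

    /* intersection length of half-open [ext_start, ext_end) with the
     * inclusive range [start, end]; covers all five cases above */
    static uint64_t extent_overlap(uint64_t ext_start, uint64_t ext_end,
                                   uint64_t start, uint64_t end)
    {
            uint64_t lo = ext_start > start ? ext_start : start;
            uint64_t hi = ext_end < end + 1 ? ext_end : end + 1;

            return hi > lo ? hi - lo : 0;
    }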
732/* 819/*
820 * find_free_dev_extent - find free space in the specified device
 821 * @trans: the transaction handle
 822 * @device: the device on which we search for free space
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
 825 * @len: the size of the free space that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
733 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
734 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
735 * of extents 830 * of extents
831 *
 832 * @start is used to store the start of the free space if we find it. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
736 */ 839 */
737int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
738 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
739 u64 *start, u64 *max_avail) 842 u64 *start, u64 *len)
740{ 843{
741 struct btrfs_key key; 844 struct btrfs_key key;
742 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
743 struct btrfs_dev_extent *dev_extent = NULL; 846 struct btrfs_dev_extent *dev_extent;
744 struct btrfs_path *path; 847 struct btrfs_path *path;
745 u64 hole_size = 0; 848 u64 hole_size;
746 u64 last_byte = 0; 849 u64 max_hole_start;
747 u64 search_start = 0; 850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
748 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
749 int ret; 854 int ret;
750 int slot = 0; 855 int slot;
751 int start_found;
752 struct extent_buffer *l; 856 struct extent_buffer *l;
753 857
754 path = btrfs_alloc_path();
755 if (!path)
756 return -ENOMEM;
757 path->reada = 2;
758 start_found = 0;
759
760 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
761 859
762 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
763 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
764 */ 862 */
765 search_start = max((u64)1024 * 1024, search_start); 863 search_start = 1024 * 1024;
766 864
767 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 865 if (root->fs_info->alloc_start + num_bytes <= search_end)
768 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
769 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
770 key.objectid = device->devid; 883 key.objectid = device->devid;
771 key.offset = search_start; 884 key.offset = search_start;
772 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
773 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
774 if (ret < 0) 888 if (ret < 0)
775 goto error; 889 goto out;
776 if (ret > 0) { 890 if (ret > 0) {
777 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
778 if (ret < 0) 892 if (ret < 0)
779 goto error; 893 goto out;
780 if (ret > 0)
781 start_found = 1;
782 } 894 }
783 l = path->nodes[0]; 895
784 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
785 while (1) { 896 while (1) {
786 l = path->nodes[0]; 897 l = path->nodes[0];
787 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -790,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
790 if (ret == 0) 901 if (ret == 0)
791 continue; 902 continue;
792 if (ret < 0) 903 if (ret < 0)
793 goto error; 904 goto out;
794no_more_items: 905
795 if (!start_found) { 906 break;
796 if (search_start >= search_end) {
797 ret = -ENOSPC;
798 goto error;
799 }
800 *start = search_start;
801 start_found = 1;
802 goto check_pending;
803 }
804 *start = last_byte > search_start ?
805 last_byte : search_start;
806 if (search_end <= *start) {
807 ret = -ENOSPC;
808 goto error;
809 }
810 goto check_pending;
811 } 907 }
812 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
813 909
@@ -815,48 +911,62 @@ no_more_items:
815 goto next; 911 goto next;
816 912
817 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
818 goto no_more_items; 914 break;
819 915
820 if (key.offset >= search_start && key.offset > last_byte && 916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
821 start_found) { 917 goto next;
822 if (last_byte < search_start)
823 last_byte = search_start;
824 hole_size = key.offset - last_byte;
825 918
826 if (hole_size > *max_avail) 919 if (key.offset > search_start) {
827 *max_avail = hole_size; 920 hole_size = key.offset - search_start;
828 921
829 if (key.offset > last_byte && 922 if (hole_size > max_hole_size) {
830 hole_size >= num_bytes) { 923 max_hole_start = search_start;
831 *start = last_byte; 924 max_hole_size = hole_size;
832 goto check_pending; 925 }
926
927 /*
 928 * If this free space is greater than what we need,
929 * it must be the max free space that we have found
930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
833 } 939 }
834 } 940 }
835 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
836 goto next;
837 941
838 start_found = 1;
839 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
840 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
841next: 947next:
842 path->slots[0]++; 948 path->slots[0]++;
843 cond_resched(); 949 cond_resched();
844 } 950 }
845check_pending:
846 /* we have to make sure we didn't find an extent that has already
847 * been allocated by the map tree or the original allocation
848 */
849 BUG_ON(*start < search_start);
850 951
 851 if (*start + num_bytes > search_end) { 952 hole_size = search_end - search_start;
852 ret = -ENOSPC; 953 if (hole_size > max_hole_size) {
853 goto error; 954 max_hole_start = search_start;
955 max_hole_size = hole_size;
854 } 956 }
855 /* check for pending inserts here */
856 ret = 0;
857 957
858error: 958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
859 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
860 return ret; 970 return ret;
861} 971}
862 972
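The rewrite turns find_free_dev_extent() into a single pass that tracks the largest hole seen so far and returns early once a hole is big enough. A userspace model, assuming (as the tree walk guarantees) the dev extents arrive sorted by offset and non-overlapping:

    #include <stdint.h>

    struct extent { uint64_t start, len; };

    /* returns 0 and sets *start when a hole >= num_bytes exists between
     * search_start and search_end; otherwise returns -1 (ENOSPC).  In
     * both cases *start/*len report the best hole found, as above. */
    static int find_hole(const struct extent *ext, int n, uint64_t search_start,
                         uint64_t search_end, uint64_t num_bytes,
                         uint64_t *start, uint64_t *len)
    {
            uint64_t max_hole_start = search_start, max_hole_size = 0;
            uint64_t hole;
            int i;

            for (i = 0; i < n; i++) {
                    if (ext[i].start > search_start) {
                            hole = ext[i].start - search_start;
                            if (hole > max_hole_size) {
                                    max_hole_start = search_start;
                                    max_hole_size = hole;
                            }
                            if (hole >= num_bytes)
                                    goto out;       /* big enough, stop early */
                    }
                    if (ext[i].start + ext[i].len > search_start)
                            search_start = ext[i].start + ext[i].len;
            }
            /* trailing hole up to the end of the device */
            hole = search_end > search_start ? search_end - search_start : 0;
            if (hole > max_hole_size) {
                    max_hole_start = search_start;
                    max_hole_size = hole;
            }
    out:
            *start = max_hole_start;
            *len = max_hole_size;
            return max_hole_size >= num_bytes ? 0 : -1;
    }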
@@ -1196,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1196 set_blocksize(bdev, 4096); 1306 set_blocksize(bdev, 4096);
1197 bh = btrfs_read_dev_super(bdev); 1307 bh = btrfs_read_dev_super(bdev);
1198 if (!bh) { 1308 if (!bh) {
1199 ret = -EIO; 1309 ret = -EINVAL;
1200 goto error_close; 1310 goto error_close;
1201 } 1311 }
1202 disk_super = (struct btrfs_super_block *)bh->b_data; 1312 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1916,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1916 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2026 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1917 return -EROFS; 2027 return -EROFS;
1918 2028
2029 if (!capable(CAP_SYS_ADMIN))
2030 return -EPERM;
2031
1919 mutex_lock(&dev_root->fs_info->volume_mutex); 2032 mutex_lock(&dev_root->fs_info->volume_mutex);
1920 dev_root = dev_root->fs_info->dev_root; 2033 dev_root = dev_root->fs_info->dev_root;
1921 2034
@@ -2154,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2154 return calc_size * num_stripes; 2267 return calc_size * num_stripes;
2155} 2268}
2156 2269
 2157static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2270/* Used to sort the devices by max_avail (descending sort) */
2158 struct btrfs_root *extent_root, 2271int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2159 struct map_lookup **map_ret,
2160 u64 *num_bytes, u64 *stripe_size,
2161 u64 start, u64 type)
2162{ 2272{
2163 struct btrfs_fs_info *info = extent_root->fs_info; 2273 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2164 struct btrfs_device *device = NULL; 2274 ((struct btrfs_device_info *)dev_info2)->max_avail)
2165 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2275 return -1;
2166 struct list_head *cur; 2276 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2167 struct map_lookup *map = NULL; 2277 ((struct btrfs_device_info *)dev_info2)->max_avail)
2168 struct extent_map_tree *em_tree; 2278 return 1;
2169 struct extent_map *em; 2279 else
2170 struct list_head private_devs; 2280 return 0;
2171 int min_stripe_size = 1 * 1024 * 1024; 2281}
2172 u64 calc_size = 1024 * 1024 * 1024;
2173 u64 max_chunk_size = calc_size;
2174 u64 min_free;
2175 u64 avail;
2176 u64 max_avail = 0;
2177 u64 dev_offset;
2178 int num_stripes = 1;
2179 int min_stripes = 1;
2180 int sub_stripes = 0;
2181 int looped = 0;
2182 int ret;
2183 int index;
2184 int stripe_len = 64 * 1024;
2185 2282
2186 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2283static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2187 (type & BTRFS_BLOCK_GROUP_DUP)) { 2284 int *num_stripes, int *min_stripes,
2188 WARN_ON(1); 2285 int *sub_stripes)
2189 type &= ~BTRFS_BLOCK_GROUP_DUP; 2286{
2190 } 2287 *num_stripes = 1;
2191 if (list_empty(&fs_devices->alloc_list)) 2288 *min_stripes = 1;
2192 return -ENOSPC; 2289 *sub_stripes = 0;
2193 2290
2194 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2291 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2195 num_stripes = fs_devices->rw_devices; 2292 *num_stripes = fs_devices->rw_devices;
2196 min_stripes = 2; 2293 *min_stripes = 2;
2197 } 2294 }
2198 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2295 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2199 num_stripes = 2; 2296 *num_stripes = 2;
2200 min_stripes = 2; 2297 *min_stripes = 2;
2201 } 2298 }
2202 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2299 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2203 if (fs_devices->rw_devices < 2) 2300 if (fs_devices->rw_devices < 2)
2204 return -ENOSPC; 2301 return -ENOSPC;
2205 num_stripes = 2; 2302 *num_stripes = 2;
2206 min_stripes = 2; 2303 *min_stripes = 2;
2207 } 2304 }
2208 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2305 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2209 num_stripes = fs_devices->rw_devices; 2306 *num_stripes = fs_devices->rw_devices;
2210 if (num_stripes < 4) 2307 if (*num_stripes < 4)
2211 return -ENOSPC; 2308 return -ENOSPC;
2212 num_stripes &= ~(u32)1; 2309 *num_stripes &= ~(u32)1;
2213 sub_stripes = 2; 2310 *sub_stripes = 2;
2214 min_stripes = 4; 2311 *min_stripes = 4;
2215 } 2312 }
2216 2313
2314 return 0;
2315}
2316
2317static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2318 u64 proposed_size, u64 type,
2319 int num_stripes, int small_stripe)
2320{
2321 int min_stripe_size = 1 * 1024 * 1024;
2322 u64 calc_size = proposed_size;
2323 u64 max_chunk_size = calc_size;
2324 int ncopies = 1;
2325
2326 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2327 BTRFS_BLOCK_GROUP_DUP |
2328 BTRFS_BLOCK_GROUP_RAID10))
2329 ncopies = 2;
2330
2217 if (type & BTRFS_BLOCK_GROUP_DATA) { 2331 if (type & BTRFS_BLOCK_GROUP_DATA) {
2218 max_chunk_size = 10 * calc_size; 2332 max_chunk_size = 10 * calc_size;
2219 min_stripe_size = 64 * 1024 * 1024; 2333 min_stripe_size = 64 * 1024 * 1024;
@@ -2230,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2230 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2344 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2231 max_chunk_size); 2345 max_chunk_size);
2232 2346
2233again: 2347 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2234 max_avail = 0; 2348 calc_size = max_chunk_size * ncopies;
2235 if (!map || map->num_stripes != num_stripes) {
2236 kfree(map);
2237 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2238 if (!map)
2239 return -ENOMEM;
2240 map->num_stripes = num_stripes;
2241 }
2242
2243 if (calc_size * num_stripes > max_chunk_size) {
2244 calc_size = max_chunk_size;
2245 do_div(calc_size, num_stripes); 2349 do_div(calc_size, num_stripes);
2246 do_div(calc_size, stripe_len); 2350 do_div(calc_size, BTRFS_STRIPE_LEN);
2247 calc_size *= stripe_len; 2351 calc_size *= BTRFS_STRIPE_LEN;
2248 } 2352 }
2249 2353
2250 /* we don't want tiny stripes */ 2354 /* we don't want tiny stripes */
2251 if (!looped) 2355 if (!small_stripe)
2252 calc_size = max_t(u64, min_stripe_size, calc_size); 2356 calc_size = max_t(u64, min_stripe_size, calc_size);
2253 2357
2254 /* 2358 /*
2255 * we're about to do_div by the stripe_len so lets make sure 2359 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2256 * we end up with something bigger than a stripe 2360 * we end up with something bigger than a stripe
2257 */ 2361 */
2258 calc_size = max_t(u64, calc_size, stripe_len * 4); 2362 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2363
2364 do_div(calc_size, BTRFS_STRIPE_LEN);
2365 calc_size *= BTRFS_STRIPE_LEN;
2366
2367 return calc_size;
2368}
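Kernel do_div(x, n) divides the u64 in place and returns the remainder, so the "do_div(calc_size, BTRFS_STRIPE_LEN); calc_size *= BTRFS_STRIPE_LEN;" pairs above round calc_size down to a stripe multiple. A userspace reading of the new helper, omitting the min_stripe_size floor for brevity:

    #include <stdint.h>

    #define STRIPE_LEN (64 * 1024)   /* BTRFS_STRIPE_LEN */

    /* clamp so all ncopies of the chunk stay under max_chunk_size, then
     * round the per-device stripe down to a STRIPE_LEN multiple */
    static uint64_t calc_stripe_size(uint64_t calc_size, uint64_t max_chunk_size,
                                     int num_stripes, int ncopies)
    {
            if (calc_size * num_stripes > max_chunk_size * ncopies)
                    calc_size = max_chunk_size * ncopies / num_stripes;

            return calc_size / STRIPE_LEN * STRIPE_LEN;
    }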
2369
2370static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2371 int num_stripes)
2372{
2373 struct map_lookup *new;
2374 size_t len = map_lookup_size(num_stripes);
2375
2376 BUG_ON(map->num_stripes < num_stripes);
2377
2378 if (map->num_stripes == num_stripes)
2379 return map;
2380
2381 new = kmalloc(len, GFP_NOFS);
2382 if (!new) {
2383 /* just change map->num_stripes */
2384 map->num_stripes = num_stripes;
2385 return map;
2386 }
2387
2388 memcpy(new, map, len);
2389 new->num_stripes = num_stripes;
2390 kfree(map);
2391 return new;
2392}
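Worth noting the fallback in __shrink_map_lookup_stripes(): if the smaller allocation fails, it keeps the oversized buffer and only lowers num_stripes, which wastes memory but stays correct. The same pattern on a generic flexible-array struct, as a sketch:

    #include <stdlib.h>
    #include <string.h>

    struct vec { int n; long elem[]; };   /* flexible array, like map_lookup */

    /* shrink to n elements; caller must ensure n <= v->n (the kernel
     * asserts this with BUG_ON) */
    static struct vec *vec_shrink(struct vec *v, int n)
    {
            size_t len = sizeof(*v) + n * sizeof(v->elem[0]);
            struct vec *new;

            if (v->n == n)
                    return v;
            new = malloc(len);
            if (!new) {
                    v->n = n;   /* graceful fallback: waste memory, stay correct */
                    return v;
            }
            memcpy(new, v, len);
            new->n = n;
            free(v);
            return new;
    }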
2393
2394/*
 2395 * helper to allocate device space from btrfs_device_info, in which we have
 2396 * stored the max free space information of every device. It is used when we
 2397 * cannot allocate chunks of the default size.
 2398 *
 2399 * With this helper, we can allocate a new chunk as large as possible.
2400 */
2401static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2402 struct btrfs_fs_devices *fs_devices,
2403 struct btrfs_device_info *devices,
2404 int nr_device, u64 type,
2405 struct map_lookup **map_lookup,
2406 int min_stripes, u64 *stripe_size)
2407{
2408 int i, index, sort_again = 0;
2409 int min_devices = min_stripes;
2410 u64 max_avail, min_free;
2411 struct map_lookup *map = *map_lookup;
2412 int ret;
2413
2414 if (nr_device < min_stripes)
2415 return -ENOSPC;
2416
2417 btrfs_descending_sort_devices(devices, nr_device);
2418
2419 max_avail = devices[0].max_avail;
2420 if (!max_avail)
2421 return -ENOSPC;
2422
2423 for (i = 0; i < nr_device; i++) {
2424 /*
 2425 * if dev_offset == 0, the free space of this device is less than
 2426 * what we need, and we haven't yet searched for the max avail
 2427 * extent on this device, so do that now.
2428 */
2429 if (!devices[i].dev_offset) {
2430 ret = find_free_dev_extent(trans, devices[i].dev,
2431 max_avail,
2432 &devices[i].dev_offset,
2433 &devices[i].max_avail);
2434 if (ret != 0 && ret != -ENOSPC)
2435 return ret;
2436 sort_again = 1;
2437 }
2438 }
2439
 2440 /* we updated the max avail free extent of each device, so sort again */
2441 if (sort_again)
2442 btrfs_descending_sort_devices(devices, nr_device);
2443
2444 if (type & BTRFS_BLOCK_GROUP_DUP)
2445 min_devices = 1;
2446
2447 if (!devices[min_devices - 1].max_avail)
2448 return -ENOSPC;
2449
2450 max_avail = devices[min_devices - 1].max_avail;
2451 if (type & BTRFS_BLOCK_GROUP_DUP)
2452 do_div(max_avail, 2);
2453
2454 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2455 min_stripes, 1);
2456 if (type & BTRFS_BLOCK_GROUP_DUP)
2457 min_free = max_avail * 2;
2458 else
2459 min_free = max_avail;
2460
2461 if (min_free > devices[min_devices - 1].max_avail)
2462 return -ENOSPC;
2463
2464 map = __shrink_map_lookup_stripes(map, min_stripes);
2465 *stripe_size = max_avail;
2466
2467 index = 0;
2468 for (i = 0; i < min_stripes; i++) {
2469 map->stripes[i].dev = devices[index].dev;
2470 map->stripes[i].physical = devices[index].dev_offset;
2471 if (type & BTRFS_BLOCK_GROUP_DUP) {
2472 i++;
2473 map->stripes[i].dev = devices[index].dev;
2474 map->stripes[i].physical = devices[index].dev_offset +
2475 max_avail;
2476 }
2477 index++;
2478 }
2479 *map_lookup = map;
2480
2481 return 0;
2482}
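The kernel of the tiny-space fallback above: after the descending sort, entry min_devices-1 bounds the largest stripe that min_devices devices can all hold, and DUP halves it because both copies land on one device. A deliberately simplified sketch (the real helper also re-checks free extents and rounds via __btrfs_calc_stripe_size()):

    #include <stdint.h>

    static uint64_t tiny_stripe_size(const uint64_t *sorted_max_avail,
                                     int min_devices, int is_dup)
    {
            uint64_t avail = sorted_max_avail[min_devices - 1];

            return is_dup ? avail / 2 : avail;
    }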
2259 2483
2260 do_div(calc_size, stripe_len); 2484static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2261 calc_size *= stripe_len; 2485 struct btrfs_root *extent_root,
2486 struct map_lookup **map_ret,
2487 u64 *num_bytes, u64 *stripe_size,
2488 u64 start, u64 type)
2489{
2490 struct btrfs_fs_info *info = extent_root->fs_info;
2491 struct btrfs_device *device = NULL;
2492 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2493 struct list_head *cur;
2494 struct map_lookup *map;
2495 struct extent_map_tree *em_tree;
2496 struct extent_map *em;
2497 struct btrfs_device_info *devices_info;
2498 struct list_head private_devs;
2499 u64 calc_size = 1024 * 1024 * 1024;
2500 u64 min_free;
2501 u64 avail;
2502 u64 dev_offset;
2503 int num_stripes;
2504 int min_stripes;
2505 int sub_stripes;
2506 int min_devices; /* the min number of devices we need */
2507 int i;
2508 int ret;
2509 int index;
2510
2511 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2512 (type & BTRFS_BLOCK_GROUP_DUP)) {
2513 WARN_ON(1);
2514 type &= ~BTRFS_BLOCK_GROUP_DUP;
2515 }
2516 if (list_empty(&fs_devices->alloc_list))
2517 return -ENOSPC;
2518
2519 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2520 &min_stripes, &sub_stripes);
2521 if (ret)
2522 return ret;
2523
2524 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2525 GFP_NOFS);
2526 if (!devices_info)
2527 return -ENOMEM;
2528
2529 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2530 if (!map) {
2531 ret = -ENOMEM;
2532 goto error;
2533 }
2534 map->num_stripes = num_stripes;
2262 2535
2263 cur = fs_devices->alloc_list.next; 2536 cur = fs_devices->alloc_list.next;
2264 index = 0; 2537 index = 0;
2538 i = 0;
2265 2539
2266 if (type & BTRFS_BLOCK_GROUP_DUP) 2540 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2541 num_stripes, 0);
2542
2543 if (type & BTRFS_BLOCK_GROUP_DUP) {
2267 min_free = calc_size * 2; 2544 min_free = calc_size * 2;
2268 else 2545 min_devices = 1;
2546 } else {
2269 min_free = calc_size; 2547 min_free = calc_size;
2270 2548 min_devices = min_stripes;
2271 /* 2549 }
2272 * we add 1MB because we never use the first 1MB of the device, unless
2273 * we've looped, then we are likely allocating the maximum amount of
2274 * space left already
2275 */
2276 if (!looped)
2277 min_free += 1024 * 1024;
2278 2550
2279 INIT_LIST_HEAD(&private_devs); 2551 INIT_LIST_HEAD(&private_devs);
2280 while (index < num_stripes) { 2552 while (index < num_stripes) {
@@ -2287,27 +2559,39 @@ again:
2287 cur = cur->next; 2559 cur = cur->next;
2288 2560
2289 if (device->in_fs_metadata && avail >= min_free) { 2561 if (device->in_fs_metadata && avail >= min_free) {
2290 ret = find_free_dev_extent(trans, device, 2562 ret = find_free_dev_extent(trans, device, min_free,
2291 min_free, &dev_offset, 2563 &devices_info[i].dev_offset,
2292 &max_avail); 2564 &devices_info[i].max_avail);
2293 if (ret == 0) { 2565 if (ret == 0) {
2294 list_move_tail(&device->dev_alloc_list, 2566 list_move_tail(&device->dev_alloc_list,
2295 &private_devs); 2567 &private_devs);
2296 map->stripes[index].dev = device; 2568 map->stripes[index].dev = device;
2297 map->stripes[index].physical = dev_offset; 2569 map->stripes[index].physical =
2570 devices_info[i].dev_offset;
2298 index++; 2571 index++;
2299 if (type & BTRFS_BLOCK_GROUP_DUP) { 2572 if (type & BTRFS_BLOCK_GROUP_DUP) {
2300 map->stripes[index].dev = device; 2573 map->stripes[index].dev = device;
2301 map->stripes[index].physical = 2574 map->stripes[index].physical =
2302 dev_offset + calc_size; 2575 devices_info[i].dev_offset +
2576 calc_size;
2303 index++; 2577 index++;
2304 } 2578 }
2305 } 2579 } else if (ret != -ENOSPC)
2306 } else if (device->in_fs_metadata && avail > max_avail) 2580 goto error;
2307 max_avail = avail; 2581
2582 devices_info[i].dev = device;
2583 i++;
2584 } else if (device->in_fs_metadata &&
2585 avail >= BTRFS_STRIPE_LEN) {
2586 devices_info[i].dev = device;
2587 devices_info[i].max_avail = avail;
2588 i++;
2589 }
2590
2308 if (cur == &fs_devices->alloc_list) 2591 if (cur == &fs_devices->alloc_list)
2309 break; 2592 break;
2310 } 2593 }
2594
2311 list_splice(&private_devs, &fs_devices->alloc_list); 2595 list_splice(&private_devs, &fs_devices->alloc_list);
2312 if (index < num_stripes) { 2596 if (index < num_stripes) {
2313 if (index >= min_stripes) { 2597 if (index >= min_stripes) {
@@ -2316,34 +2600,36 @@ again:
2316 num_stripes /= sub_stripes; 2600 num_stripes /= sub_stripes;
2317 num_stripes *= sub_stripes; 2601 num_stripes *= sub_stripes;
2318 } 2602 }
2319 looped = 1; 2603
2320 goto again; 2604 map = __shrink_map_lookup_stripes(map, num_stripes);
2321 } 2605 } else if (i >= min_devices) {
2322 if (!looped && max_avail > 0) { 2606 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2323 looped = 1; 2607 devices_info, i, type,
2324 calc_size = max_avail; 2608 &map, min_stripes,
2325 goto again; 2609 &calc_size);
2610 if (ret)
2611 goto error;
2612 } else {
2613 ret = -ENOSPC;
2614 goto error;
2326 } 2615 }
2327 kfree(map);
2328 return -ENOSPC;
2329 } 2616 }
2330 map->sector_size = extent_root->sectorsize; 2617 map->sector_size = extent_root->sectorsize;
2331 map->stripe_len = stripe_len; 2618 map->stripe_len = BTRFS_STRIPE_LEN;
2332 map->io_align = stripe_len; 2619 map->io_align = BTRFS_STRIPE_LEN;
2333 map->io_width = stripe_len; 2620 map->io_width = BTRFS_STRIPE_LEN;
2334 map->type = type; 2621 map->type = type;
2335 map->num_stripes = num_stripes;
2336 map->sub_stripes = sub_stripes; 2622 map->sub_stripes = sub_stripes;
2337 2623
2338 *map_ret = map; 2624 *map_ret = map;
2339 *stripe_size = calc_size; 2625 *stripe_size = calc_size;
2340 *num_bytes = chunk_bytes_by_type(type, calc_size, 2626 *num_bytes = chunk_bytes_by_type(type, calc_size,
2341 num_stripes, sub_stripes); 2627 map->num_stripes, sub_stripes);
2342 2628
2343 em = alloc_extent_map(GFP_NOFS); 2629 em = alloc_extent_map(GFP_NOFS);
2344 if (!em) { 2630 if (!em) {
2345 kfree(map); 2631 ret = -ENOMEM;
2346 return -ENOMEM; 2632 goto error;
2347 } 2633 }
2348 em->bdev = (struct block_device *)map; 2634 em->bdev = (struct block_device *)map;
2349 em->start = start; 2635 em->start = start;
@@ -2376,7 +2662,13 @@ again:
2376 index++; 2662 index++;
2377 } 2663 }
2378 2664
2665 kfree(devices_info);
2379 return 0; 2666 return 0;
2667
2668error:
2669 kfree(map);
2670 kfree(devices_info);
2671 return ret;
2380} 2672}
2381 2673
2382static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2674static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1be781079450..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -136,6 +139,30 @@ struct btrfs_multi_bio {
136 struct btrfs_bio_stripe stripes[]; 139 struct btrfs_bio_stripe stripes[];
137}; 140};
138 141
142struct btrfs_device_info {
143 struct btrfs_device *dev;
144 u64 dev_offset;
145 u64 max_avail;
146};
147
148/* Used to sort the devices by max_avail (descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150
151/*
 152 * sort the devices by max_avail, in which the max free extent size of each
 153 * device is stored (descending sort).
154 */
155static inline void btrfs_descending_sort_devices(
156 struct btrfs_device_info *devices,
157 size_t nr_devices)
158{
159 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
160 btrfs_cmp_device_free_bytes, NULL);
161}
162
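A userspace analogue of the comparator, for reference: returning negative when a should sort before b yields descending order by max_avail. The kernel sort() used above takes the same comparator signature plus a swap callback slot (NULL here).

    #include <stdlib.h>
    #include <stdint.h>

    struct dev_info { uint64_t max_avail; };

    static int cmp_desc(const void *a, const void *b)
    {
            uint64_t x = ((const struct dev_info *)a)->max_avail;
            uint64_t y = ((const struct dev_info *)b)->max_avail;

            return (x > y) ? -1 : (x < y) ? 1 : 0;
    }

    /* usage: qsort(devices, nr, sizeof(struct dev_info), cmp_desc); */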
163int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
164 u64 end, u64 *length);
165
139#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 166#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
140 (sizeof(struct btrfs_bio_stripe) * (n))) 167 (sizeof(struct btrfs_bio_stripe) * (n)))
141 168
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
317 size_t size, int flags) 317 size_t size, int flags)
318{ 318{
319 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
320
321 /*
322 * The permission on security.* and system.* is not checked
323 * in permission().
324 */
325 if (btrfs_root_readonly(root))
326 return -EROFS;
327
319 /* 328 /*
320 * If this is a request for a synthetic attribute in the system.* 329 * If this is a request for a synthetic attribute in the system.*
321 * namespace use the generic infrastructure to resolve a handler 330 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
336 345
337int btrfs_removexattr(struct dentry *dentry, const char *name) 346int btrfs_removexattr(struct dentry *dentry, const char *name)
338{ 347{
348 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
349
350 /*
351 * The permission on security.* and system.* is not checked
352 * in permission().
353 */
354 if (btrfs_root_readonly(root))
355 return -EROFS;
356
339 /* 357 /*
340 * If this is a request for a synthetic attribute in the system.* 358 * If this is a request for a synthetic attribute in the system.*
341 * namespace use the generic infrastructure to resolve a handler 359 * namespace use the generic infrastructure to resolve a handler
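Both entry points get the same guard because setxattr/removexattr on security.* and system.* bypass the usual permission() path. A sketch of the predicate, assuming btrfs_root_readonly() (defined in ctree.h, not shown in this diff) simply tests the subvolume read-only bit in the root item flags:

    #include <stdint.h>

    #define SUBVOL_RDONLY (1ULL << 0)   /* stand-in for BTRFS_ROOT_SUBVOL_RDONLY */

    static int root_readonly_sketch(uint64_t root_item_flags)
    {
            return (root_item_flags & SUBVOL_RDONLY) != 0;
    }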
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock); 43{
53static unsigned long num_workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56 45
57/* 46 vfree(workspace->def_strm.workspace);
58 * this finds an available zlib workspace or allocates a new one 47 vfree(workspace->inf_strm.workspace);
59 * NULL or an ERR_PTR is returned if things go bad. 48 kfree(workspace->buf);
60 */ 49 kfree(workspace);
61static struct workspace *find_zlib_workspace(void) 50}
51
52static struct list_head *zlib_alloc_workspace(void)
62{ 53{
63 struct workspace *workspace; 54 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76 55
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) { 57 if (!workspace)
90 ret = -ENOMEM; 58 return ERR_PTR(-ENOMEM);
91 goto fail;
92 }
93 59
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) { 63 if (!workspace->def_strm.workspace ||
106 ret = -ENOMEM; 64 !workspace->inf_strm.workspace || !workspace->buf)
107 goto fail_kmalloc; 65 goto fail;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142 66
143 atomic_dec(&alloc_workspace); 67 INIT_LIST_HEAD(&workspace->list);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148 68
149/* 69 return &workspace->list;
150 * cleanup function for module exit 70fail:
151 */ 71 zlib_free_workspace(&workspace->list);
152static void free_workspaces(void) 72 return ERR_PTR(-ENOMEM);
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165} 73}
166 74
167/* 75static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 76 struct address_space *mapping,
169 * 77 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 78 struct page **pages,
171 * in 'pages' 79 unsigned long nr_dest_pages,
172 * 80 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 81 unsigned long *total_in,
174 * may be pages allocated even if we return an error 82 unsigned long *total_out,
175 * 83 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller than len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 84{
85 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 86 int ret;
196 struct workspace *workspace;
197 char *data_in; 87 char *data_in;
198 char *cpage_out; 88 char *cpage_out;
199 int nr_pages = 0; 89 int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
205 *total_out = 0; 95 *total_out = 0;
206 *total_in = 0; 96 *total_in = 0;
207 97
208 workspace = find_zlib_workspace();
209 if (IS_ERR(workspace))
210 return -1;
211
212 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 98 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
213 printk(KERN_WARNING "deflateInit failed\n"); 99 printk(KERN_WARNING "deflateInit failed\n");
214 ret = -1; 100 ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
222 data_in = kmap(in_page); 108 data_in = kmap(in_page);
223 109
224 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 110 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
111 if (out_page == NULL) {
112 ret = -1;
113 goto out;
114 }
225 cpage_out = kmap(out_page); 115 cpage_out = kmap(out_page);
226 pages[0] = out_page; 116 pages[0] = out_page;
227 nr_pages = 1; 117 nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
260 goto out; 150 goto out;
261 } 151 }
262 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 152 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
153 if (out_page == NULL) {
154 ret = -1;
155 goto out;
156 }
263 cpage_out = kmap(out_page); 157 cpage_out = kmap(out_page);
264 pages[nr_pages] = out_page; 158 pages[nr_pages] = out_page;
265 nr_pages++; 159 nr_pages++;
@@ -314,55 +208,26 @@ out:
314 kunmap(in_page); 208 kunmap(in_page);
315 page_cache_release(in_page); 209 page_cache_release(in_page);
316 } 210 }
317 free_workspace(workspace);
318 return ret; 211 return ret;
319} 212}
320 213
321/* 214static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
322 * pages_in is an array of pages with compressed data. 215 u64 disk_start,
323 * 216 struct bio_vec *bvec,
324 * disk_start is the starting logical offset of this array in the file 217 int vcnt,
325 * 218 size_t srclen)
326 * bvec is a bio_vec of pages from the file that we want to decompress into
327 *
328 * vcnt is the count of pages in the biovec
329 *
330 * srclen is the number of bytes in pages_in
331 *
332 * The basic idea is that we have a bio that was created by readpages.
333 * The pages in the bio are for the uncompressed data, and they may not
334 * be contiguous. They all correspond to the range of bytes covered by
335 * the compressed extent.
336 */
337int btrfs_zlib_decompress_biovec(struct page **pages_in,
338 u64 disk_start,
339 struct bio_vec *bvec,
340 int vcnt,
341 size_t srclen)
342{ 219{
343 int ret = 0; 220 struct workspace *workspace = list_entry(ws, struct workspace, list);
221 int ret = 0, ret2;
344 int wbits = MAX_WBITS; 222 int wbits = MAX_WBITS;
345 struct workspace *workspace;
346 char *data_in; 223 char *data_in;
347 size_t total_out = 0; 224 size_t total_out = 0;
348 unsigned long page_bytes_left;
349 unsigned long page_in_index = 0; 225 unsigned long page_in_index = 0;
350 unsigned long page_out_index = 0; 226 unsigned long page_out_index = 0;
351 struct page *page_out;
352 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 227 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
353 PAGE_CACHE_SIZE; 228 PAGE_CACHE_SIZE;
354 unsigned long buf_start; 229 unsigned long buf_start;
355 unsigned long buf_offset;
356 unsigned long bytes;
357 unsigned long working_bytes;
358 unsigned long pg_offset; 230 unsigned long pg_offset;
359 unsigned long start_byte;
360 unsigned long current_buf_start;
361 char *kaddr;
362
363 workspace = find_zlib_workspace();
364 if (IS_ERR(workspace))
365 return -ENOMEM;
366 231
367 data_in = kmap(pages_in[page_in_index]); 232 data_in = kmap(pages_in[page_in_index]);
368 workspace->inf_strm.next_in = data_in; 233 workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
372 workspace->inf_strm.total_out = 0; 237 workspace->inf_strm.total_out = 0;
373 workspace->inf_strm.next_out = workspace->buf; 238 workspace->inf_strm.next_out = workspace->buf;
374 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 239 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
375 page_out = bvec[page_out_index].bv_page;
376 page_bytes_left = PAGE_CACHE_SIZE;
377 pg_offset = 0; 240 pg_offset = 0;
378 241
379 /* If it's deflate, and it's got no preset dictionary, then 242 /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
389 252
390 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 253 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
391 printk(KERN_WARNING "inflateInit failed\n"); 254 printk(KERN_WARNING "inflateInit failed\n");
392 ret = -1; 255 return -1;
393 goto out;
394 } 256 }
395 while (workspace->inf_strm.total_in < srclen) { 257 while (workspace->inf_strm.total_in < srclen) {
396 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 258 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
397 if (ret != Z_OK && ret != Z_STREAM_END) 259 if (ret != Z_OK && ret != Z_STREAM_END)
398 break; 260 break;
399 /*
400 * buf start is the byte offset we're of the start of
401 * our workspace buffer
402 */
403 buf_start = total_out;
404 261
405 /* total_out is the last byte of the workspace buffer */ 262 buf_start = total_out;
406 total_out = workspace->inf_strm.total_out; 263 total_out = workspace->inf_strm.total_out;
407 264
408 working_bytes = total_out - buf_start; 265 /* we didn't make progress in this inflate call, we're done */
409 266 if (buf_start == total_out)
410 /*
411 * start byte is the first byte of the page we're currently
412 * copying into relative to the start of the compressed data.
413 */
414 start_byte = page_offset(page_out) - disk_start;
415
416 if (working_bytes == 0) {
417 /* we didn't make progress in this inflate
418 * call, we're done
419 */
420 if (ret != Z_STREAM_END)
421 ret = -1;
422 break; 267 break;
423 }
424 268
425 /* we haven't yet hit data corresponding to this page */ 269 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
426 if (total_out <= start_byte) 270 total_out, disk_start,
427 goto next; 271 bvec, vcnt,
428 272 &page_out_index, &pg_offset);
429 /* 273 if (ret2 == 0) {
430 * the start of the data we care about is offset into 274 ret = 0;
431 * the middle of our working buffer 275 goto done;
432 */
433 if (total_out > start_byte && buf_start < start_byte) {
434 buf_offset = start_byte - buf_start;
435 working_bytes -= buf_offset;
436 } else {
437 buf_offset = 0;
438 }
439 current_buf_start = buf_start;
440
441 /* copy bytes from the working buffer into the pages */
442 while (working_bytes > 0) {
443 bytes = min(PAGE_CACHE_SIZE - pg_offset,
444 PAGE_CACHE_SIZE - buf_offset);
445 bytes = min(bytes, working_bytes);
446 kaddr = kmap_atomic(page_out, KM_USER0);
447 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
448 bytes);
449 kunmap_atomic(kaddr, KM_USER0);
450 flush_dcache_page(page_out);
451
452 pg_offset += bytes;
453 page_bytes_left -= bytes;
454 buf_offset += bytes;
455 working_bytes -= bytes;
456 current_buf_start += bytes;
457
458 /* check if we need to pick another page */
459 if (page_bytes_left == 0) {
460 page_out_index++;
461 if (page_out_index >= vcnt) {
462 ret = 0;
463 goto done;
464 }
465
466 page_out = bvec[page_out_index].bv_page;
467 pg_offset = 0;
468 page_bytes_left = PAGE_CACHE_SIZE;
469 start_byte = page_offset(page_out) - disk_start;
470
471 /*
472 * make sure our new page is covered by this
473 * working buffer
474 */
475 if (total_out <= start_byte)
476 goto next;
477
478 /* the next page in the biovec might not
479 * be adjacent to the last page, but it
480 * might still be found inside this working
481 * buffer. bump our offset pointer
482 */
483 if (total_out > start_byte &&
484 current_buf_start < start_byte) {
485 buf_offset = start_byte - buf_start;
486 working_bytes = total_out - start_byte;
487 current_buf_start = buf_start +
488 buf_offset;
489 }
490 }
491 } 276 }
492next: 277
493 workspace->inf_strm.next_out = workspace->buf; 278 workspace->inf_strm.next_out = workspace->buf;
494 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 279 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
495 280
@@ -516,35 +301,21 @@ done:
516 zlib_inflateEnd(&workspace->inf_strm); 301 zlib_inflateEnd(&workspace->inf_strm);
517 if (data_in) 302 if (data_in)
518 kunmap(pages_in[page_in_index]); 303 kunmap(pages_in[page_in_index]);
519out:
520 free_workspace(workspace);
521 return ret; 304 return ret;
522} 305}
523 306
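The ~80 lines of page-copy bookkeeping deleted above now live in btrfs_decompress_buf2page(), a helper added to compression.c in this series so zlib and lzo can share it. A simplified userspace model of its contract as used at the call site (the real helper additionally maps pages and handles bvec pages whose file offsets are not contiguous, via disk_start):

    #include <string.h>
    #include <stddef.h>

    #define PG_SIZE 4096

    /* scatter the working buffer, holding uncompressed bytes
     * [buf_start, total_out) of the extent, into fixed-size destination
     * pages; returns 0 once the last page is full, 1 if more output is
     * expected */
    static int buf2pages(const char *buf, size_t buf_start, size_t total_out,
                         char **pages, int nr_pages,
                         size_t *page_index, size_t *pg_offset)
    {
            size_t cur = buf_start;

            while (cur < total_out) {
                    size_t n = total_out - cur;

                    if (n > PG_SIZE - *pg_offset)
                            n = PG_SIZE - *pg_offset;
                    memcpy(pages[*page_index] + *pg_offset,
                           buf + (cur - buf_start), n);
                    cur += n;
                    *pg_offset += n;
                    if (*pg_offset == PG_SIZE) {
                            *pg_offset = 0;
                            if (++(*page_index) >= (size_t)nr_pages)
                                    return 0;   /* all destination pages full */
                    }
            }
            return 1;
    }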
524/* 307static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
525 * a less complex decompression routine. Our compressed data fits in a 308 struct page *dest_page,
526 * single page, and we want to read a single page out of it. 309 unsigned long start_byte,
527 * start_byte tells us the offset into the compressed data we're interested in 310 size_t srclen, size_t destlen)
528 */
529int btrfs_zlib_decompress(unsigned char *data_in,
530 struct page *dest_page,
531 unsigned long start_byte,
532 size_t srclen, size_t destlen)
533{ 311{
312 struct workspace *workspace = list_entry(ws, struct workspace, list);
534 int ret = 0; 313 int ret = 0;
535 int wbits = MAX_WBITS; 314 int wbits = MAX_WBITS;
536 struct workspace *workspace;
537 unsigned long bytes_left = destlen; 315 unsigned long bytes_left = destlen;
538 unsigned long total_out = 0; 316 unsigned long total_out = 0;
539 char *kaddr; 317 char *kaddr;
540 318
541 if (destlen > PAGE_CACHE_SIZE)
542 return -ENOMEM;
543
544 workspace = find_zlib_workspace();
545 if (IS_ERR(workspace))
546 return -ENOMEM;
547
548 workspace->inf_strm.next_in = data_in; 319 workspace->inf_strm.next_in = data_in;
549 workspace->inf_strm.avail_in = srclen; 320 workspace->inf_strm.avail_in = srclen;
550 workspace->inf_strm.total_in = 0; 321 workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
565 336
566 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 337 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
567 printk(KERN_WARNING "inflateInit failed\n"); 338 printk(KERN_WARNING "inflateInit failed\n");
568 ret = -1; 339 return -1;
569 goto out;
570 } 340 }
571 341
572 while (bytes_left > 0) { 342 while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
616 ret = 0; 386 ret = 0;
617 387
618 zlib_inflateEnd(&workspace->inf_strm); 388 zlib_inflateEnd(&workspace->inf_strm);
619out:
620 free_workspace(workspace);
621 return ret; 389 return ret;
622} 390}
623 391
624void btrfs_zlib_exit(void) 392struct btrfs_compress_op btrfs_zlib_compress = {
625{ 393 .alloc_workspace = zlib_alloc_workspace,
626 free_workspaces(); 394 .free_workspace = zlib_free_workspace,
627} 395 .compress_pages = zlib_compress_pages,
396 .decompress_biovec = zlib_decompress_biovec,
397 .decompress = zlib_decompress,
398};
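With this ops table, zlib.c no longer exports btrfs_zlib_* entry points; everything is reached through struct btrfs_compress_op. A hedged sketch of the dispatch side (the real array lives in compression.c elsewhere in this series; the array name and the type-minus-one indexing here are assumptions):

    #include "compression.h"   /* struct btrfs_compress_op */

    /* ops tables exported by zlib.c and lzo.c in this series */
    extern struct btrfs_compress_op btrfs_zlib_compress;
    extern struct btrfs_compress_op btrfs_lzo_compress;

    /* hypothetical dispatch table, indexed by compress type - 1 */
    static struct btrfs_compress_op *compress_ops[] = {
            &btrfs_zlib_compress,
            &btrfs_lzo_compress,
    };

    /* usage sketch:
     *   ws  = compress_ops[type - 1]->alloc_workspace();
     *   ret = compress_ops[type - 1]->compress_pages(ws, mapping, start,
     *                  len, pages, nr_dest_pages, &out_pages,
     *                  &total_in, &total_out, max_out);
     */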