aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
commitc8b978188c9a0fd3d535c13debd19d522b726f1f (patch)
tree873628723fb82fe2a7c77adc65fa93eca1d61c0c
parent26ce34a9c47334ff7984769e4661b2f1883594ff (diff)
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption nor the 'other' field is currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software-only limit; the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software-only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. 
This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically single threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/compression.c454
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/ctree.h99
-rw-r--r--fs/btrfs/disk-io.c18
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c27
-rw-r--r--fs/btrfs/extent_io.c411
-rw-r--r--fs/btrfs/extent_io.h17
-rw-r--r--fs/btrfs/extent_map.c9
-rw-r--r--fs/btrfs/extent_map.h6
-rw-r--r--fs/btrfs/file-item.c75
-rw-r--r--fs/btrfs/file.c263
-rw-r--r--fs/btrfs/inode.c584
-rw-r--r--fs/btrfs/ordered-data.c9
-rw-r--r--fs/btrfs/ordered-data.h10
-rw-r--r--fs/btrfs/print-tree.c7
-rw-r--r--fs/btrfs/super.c10
-rw-r--r--fs/btrfs/tree-log.c3
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/zlib.c637
22 files changed, 2315 insertions, 379 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 18f5a85b47c..31cce5d88b1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -501,6 +501,8 @@ config BTRFS_FS
501 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" 501 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
502 depends on EXPERIMENTAL 502 depends on EXPERIMENTAL
503 select LIBCRC32C 503 select LIBCRC32C
504 select ZLIB_INFLATE
505 select ZLIB_DEFLATE
504 help 506 help
505 Btrfs is a new filesystem with extents, writable snapshotting, 507 Btrfs is a new filesystem with extents, writable snapshotting,
506 support for multiple devices and many more features. 508 support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142..d2cf5a54a4b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
7 transaction.o inode.o file.o tree-defrag.o \ 7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o 10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o
11else 12else
12 13
13# Normal Makefile 14# Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..c5470367ca5
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include "ctree.h"
37#include "disk-io.h"
38#include "transaction.h"
39#include "btrfs_inode.h"
40#include "volumes.h"
41#include "ordered-data.h"
42#include "compat.h"
43#include "compression.h"
44#include "extent_io.h"
45#include "extent_map.h"
46
47struct compressed_bio {
48 /* number of bios pending for this compressed extent */
49 atomic_t pending_bios;
50
51 /* the pages with the compressed data on them */
52 struct page **compressed_pages;
53
54 /* inode that owns this data */
55 struct inode *inode;
56
57 /* starting offset in the inode for our pages */
58 u64 start;
59
60 /* number of bytes in the inode we're working on */
61 unsigned long len;
62
63 /* number of bytes on disk */
64 unsigned long compressed_len;
65
66 /* number of compressed pages in the array */
67 unsigned long nr_pages;
68
69 /* IO errors */
70 int errors;
71
72 /* for reads, this is the bio we are copying the data into */
73 struct bio *orig_bio;
74};
75
76static struct bio *compressed_bio_alloc(struct block_device *bdev,
77 u64 first_byte, gfp_t gfp_flags)
78{
79 struct bio *bio;
80 int nr_vecs;
81
82 nr_vecs = bio_get_nr_vecs(bdev);
83 bio = bio_alloc(gfp_flags, nr_vecs);
84
85 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
86 while (!bio && (nr_vecs /= 2))
87 bio = bio_alloc(gfp_flags, nr_vecs);
88 }
89
90 if (bio) {
91 bio->bi_size = 0;
92 bio->bi_bdev = bdev;
93 bio->bi_sector = first_byte >> 9;
94 }
95 return bio;
96}
97
98/* when we finish reading compressed pages from the disk, we
99 * decompress them and then run the bio end_io routines on the
100 * decompressed pages (in the inode address space).
101 *
102 * This allows the checksumming and other IO error handling routines
103 * to work normally
104 *
105 * The compressed pages are freed here, and it must be run
106 * in process context
107 */
108static void end_compressed_bio_read(struct bio *bio, int err)
109{
110 struct extent_io_tree *tree;
111 struct compressed_bio *cb = bio->bi_private;
112 struct inode *inode;
113 struct page *page;
114 unsigned long index;
115 int ret;
116
117 if (err)
118 cb->errors = 1;
119
120 /* if there are more bios still pending for this compressed
121 * extent, just exit
122 */
123 if (!atomic_dec_and_test(&cb->pending_bios))
124 goto out;
125
126 /* ok, we're the last bio for this extent, lets start
127 * the decompression.
128 */
129 inode = cb->inode;
130 tree = &BTRFS_I(inode)->io_tree;
131 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
132 cb->start,
133 cb->orig_bio->bi_io_vec,
134 cb->orig_bio->bi_vcnt,
135 cb->compressed_len);
136 if (ret)
137 cb->errors = 1;
138
139 /* release the compressed pages */
140 index = 0;
141 for (index = 0; index < cb->nr_pages; index++) {
142 page = cb->compressed_pages[index];
143 page->mapping = NULL;
144 page_cache_release(page);
145 }
146
147 /* do io completion on the original bio */
148 if (cb->errors)
149 bio_io_error(cb->orig_bio);
150 else
151 bio_endio(cb->orig_bio, 0);
152
153 /* finally free the cb struct */
154 kfree(cb->compressed_pages);
155 kfree(cb);
156out:
157 bio_put(bio);
158}
159
160/*
161 * Clear the writeback bits on all of the file
162 * pages for a compressed write
163 */
164static noinline int end_compressed_writeback(struct inode *inode, u64 start,
165 unsigned long ram_size)
166{
167 unsigned long index = start >> PAGE_CACHE_SHIFT;
168 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
169 struct page *pages[16];
170 unsigned long nr_pages = end_index - index + 1;
171 int i;
172 int ret;
173
174 while(nr_pages > 0) {
175 ret = find_get_pages_contig(inode->i_mapping, index,
176 min(nr_pages, ARRAY_SIZE(pages)), pages);
177 if (ret == 0) {
178 nr_pages -= 1;
179 index += 1;
180 continue;
181 }
182 for (i = 0; i < ret; i++) {
183 end_page_writeback(pages[i]);
184 page_cache_release(pages[i]);
185 }
186 nr_pages -= ret;
187 index += ret;
188 }
189 /* the inode may be gone now */
190 return 0;
191}
192
193/*
194 * do the cleanup once all the compressed pages hit the disk.
195 * This will clear writeback on the file pages and free the compressed
196 * pages.
197 *
198 * This also calls the writeback end hooks for the file pages so that
199 * metadata and checksums can be updated in the file.
200 */
201static void end_compressed_bio_write(struct bio *bio, int err)
202{
203 struct extent_io_tree *tree;
204 struct compressed_bio *cb = bio->bi_private;
205 struct inode *inode;
206 struct page *page;
207 unsigned long index;
208
209 if (err)
210 cb->errors = 1;
211
212 /* if there are more bios still pending for this compressed
213 * extent, just exit
214 */
215 if (!atomic_dec_and_test(&cb->pending_bios))
216 goto out;
217
218 /* ok, we're the last bio for this extent, step one is to
219 * call back into the FS and do all the end_io operations
220 */
221 inode = cb->inode;
222 tree = &BTRFS_I(inode)->io_tree;
223 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
224 cb->start,
225 cb->start + cb->len - 1,
226 NULL, 1);
227
228 end_compressed_writeback(inode, cb->start, cb->len);
229 /* note, our inode could be gone now */
230
231 /*
232 * release the compressed pages, these came from alloc_page and
233 * are not attached to the inode at all
234 */
235 index = 0;
236 for (index = 0; index < cb->nr_pages; index++) {
237 page = cb->compressed_pages[index];
238 page->mapping = NULL;
239 page_cache_release(page);
240 }
241
242 /* finally free the cb struct */
243 kfree(cb->compressed_pages);
244 kfree(cb);
245out:
246 bio_put(bio);
247}
248
249/*
250 * worker function to build and submit bios for previously compressed pages.
251 * The corresponding pages in the inode should be marked for writeback
252 * and the compressed pages should have a reference on them for dropping
253 * when the IO is complete.
254 *
255 * This also checksums the file bytes and gets things ready for
256 * the end io hooks.
257 */
258int btrfs_submit_compressed_write(struct inode *inode, u64 start,
259 unsigned long len, u64 disk_start,
260 unsigned long compressed_len,
261 struct page **compressed_pages,
262 unsigned long nr_pages)
263{
264 struct bio *bio = NULL;
265 struct btrfs_root *root = BTRFS_I(inode)->root;
266 struct compressed_bio *cb;
267 unsigned long bytes_left;
268 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
269 int page_index = 0;
270 struct page *page;
271 u64 first_byte = disk_start;
272 struct block_device *bdev;
273 int ret;
274
275 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
276 cb = kmalloc(sizeof(*cb), GFP_NOFS);
277 atomic_set(&cb->pending_bios, 0);
278 cb->errors = 0;
279 cb->inode = inode;
280 cb->start = start;
281 cb->len = len;
282 cb->compressed_pages = compressed_pages;
283 cb->compressed_len = compressed_len;
284 cb->orig_bio = NULL;
285 cb->nr_pages = nr_pages;
286
287 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
288
289 ret = btrfs_csum_file_bytes(root, inode, start, len);
290 BUG_ON(ret);
291
292 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
293 bio->bi_private = cb;
294 bio->bi_end_io = end_compressed_bio_write;
295 atomic_inc(&cb->pending_bios);
296
297 /* create and submit bios for the compressed pages */
298 bytes_left = compressed_len;
299 while(bytes_left > 0) {
300 page = compressed_pages[page_index];
301 page->mapping = inode->i_mapping;
302 if (bio->bi_size)
303 ret = io_tree->ops->merge_bio_hook(page, 0,
304 PAGE_CACHE_SIZE,
305 bio, 0);
306 else
307 ret = 0;
308
309 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
310 PAGE_CACHE_SIZE) {
311 bio_get(bio);
312
313 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
314 BUG_ON(ret);
315
316 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
317 BUG_ON(ret);
318
319 bio_put(bio);
320
321 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
322 atomic_inc(&cb->pending_bios);
323 bio->bi_private = cb;
324 bio->bi_end_io = end_compressed_bio_write;
325 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
326 }
327 page_index++;
328 bytes_left -= PAGE_CACHE_SIZE;
329 first_byte += PAGE_CACHE_SIZE;
330 }
331 bio_get(bio);
332
333 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
334 BUG_ON(ret);
335
336 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
337 BUG_ON(ret);
338
339 bio_put(bio);
340 return 0;
341}
342
343/*
344 * for a compressed read, the bio we get passed has all the inode pages
345 * in it. We don't actually do IO on those pages but allocate new ones
346 * to hold the compressed pages on disk.
347 *
348 * bio->bi_sector points to the compressed extent on disk
349 * bio->bi_io_vec points to all of the inode pages
350 * bio->bi_vcnt is a count of pages
351 *
352 * After the compressed pages are read, we copy the bytes into the
353 * bio we were passed and then call the bio end_io calls
354 */
355int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
356 int mirror_num, unsigned long bio_flags)
357{
358 struct extent_io_tree *tree;
359 struct extent_map_tree *em_tree;
360 struct compressed_bio *cb;
361 struct btrfs_root *root = BTRFS_I(inode)->root;
362 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
363 unsigned long compressed_len;
364 unsigned long nr_pages;
365 unsigned long page_index;
366 struct page *page;
367 struct block_device *bdev;
368 struct bio *comp_bio;
369 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
370 struct extent_map *em;
371 int ret;
372
373 tree = &BTRFS_I(inode)->io_tree;
374 em_tree = &BTRFS_I(inode)->extent_tree;
375
376 /* we need the actual starting offset of this extent in the file */
377 spin_lock(&em_tree->lock);
378 em = lookup_extent_mapping(em_tree,
379 page_offset(bio->bi_io_vec->bv_page),
380 PAGE_CACHE_SIZE);
381 spin_unlock(&em_tree->lock);
382
383 cb = kmalloc(sizeof(*cb), GFP_NOFS);
384 atomic_set(&cb->pending_bios, 0);
385 cb->errors = 0;
386 cb->inode = inode;
387
388 cb->start = em->start;
389 compressed_len = em->block_len;
390 free_extent_map(em);
391
392 cb->len = uncompressed_len;
393 cb->compressed_len = compressed_len;
394 cb->orig_bio = bio;
395
396 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
397 PAGE_CACHE_SIZE;
398 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
399 GFP_NOFS);
400 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
401
402 for (page_index = 0; page_index < nr_pages; page_index++) {
403 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
404 __GFP_HIGHMEM);
405 }
406 cb->nr_pages = nr_pages;
407
408 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
409 comp_bio->bi_private = cb;
410 comp_bio->bi_end_io = end_compressed_bio_read;
411 atomic_inc(&cb->pending_bios);
412
413 for (page_index = 0; page_index < nr_pages; page_index++) {
414 page = cb->compressed_pages[page_index];
415 page->mapping = inode->i_mapping;
416 if (comp_bio->bi_size)
417 ret = tree->ops->merge_bio_hook(page, 0,
418 PAGE_CACHE_SIZE,
419 comp_bio, 0);
420 else
421 ret = 0;
422
423 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
424 PAGE_CACHE_SIZE) {
425 bio_get(comp_bio);
426
427 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
431 BUG_ON(ret);
432
433 bio_put(comp_bio);
434
435 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
436 GFP_NOFS);
437 atomic_inc(&cb->pending_bios);
438 bio->bi_private = cb;
439 bio->bi_end_io = end_compressed_bio_write;
440 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
441 }
442 cur_disk_byte += PAGE_CACHE_SIZE;
443 }
444 bio_get(comp_bio);
445
446 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
447 BUG_ON(ret);
448
449 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
450 BUG_ON(ret);
451
452 bio_put(comp_bio);
453 return 0;
454}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 00000000000..421f5b4aa71
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
47#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47..793d8fdda24 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
400 __le32 nsec; 400 __le32 nsec;
401} __attribute__ ((__packed__)); 401} __attribute__ ((__packed__));
402 402
403/* 403typedef enum {
404 * there is no padding here on purpose. If you want to extent the inode, 404 BTRFS_COMPRESS_NONE = 0,
405 * make a new item type 405 BTRFS_COMPRESS_ZLIB = 1,
406 */ 406 BTRFS_COMPRESS_LAST = 2,
407} btrfs_compression_type;
408
409/* we don't understand any encryption methods right now */
410typedef enum {
411 BTRFS_ENCRYPTION_NONE = 0,
412 BTRFS_ENCRYPTION_LAST = 1,
413} btrfs_encryption_type;
414
407struct btrfs_inode_item { 415struct btrfs_inode_item {
408 /* nfs style generation number */ 416 /* nfs style generation number */
409 __le64 generation; 417 __le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
419 __le64 rdev; 427 __le64 rdev;
420 __le16 flags; 428 __le16 flags;
421 __le16 compat_flags; 429 __le16 compat_flags;
430
422 struct btrfs_timespec atime; 431 struct btrfs_timespec atime;
423 struct btrfs_timespec ctime; 432 struct btrfs_timespec ctime;
424 struct btrfs_timespec mtime; 433 struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
454#define BTRFS_FILE_EXTENT_INLINE 1 463#define BTRFS_FILE_EXTENT_INLINE 1
455 464
456struct btrfs_file_extent_item { 465struct btrfs_file_extent_item {
466 /*
467 * transaction id that created this extent
468 */
457 __le64 generation; 469 __le64 generation;
470 /*
471 * max number of bytes to hold this extent in ram
472 * when we split a compressed extent we can't know how big
473 * each of the resulting pieces will be. So, this is
474 * an upper limit on the size of the extent in ram instead of
475 * an exact limit.
476 */
477 __le64 ram_bytes;
478
479 /*
480 * 32 bits for the various ways we might encode the data,
481 * including compression and encryption. If any of these
482 * are set to something a given disk format doesn't understand
483 * it is treated like an incompat flag for reading and writing,
484 * but not for stat.
485 */
486 u8 compression;
487 u8 encryption;
488 __le16 other_encoding; /* spare for later use */
489
490 /* are we inline data or a real extent? */
458 u8 type; 491 u8 type;
492
459 /* 493 /*
460 * disk space consumed by the extent, checksum blocks are included 494 * disk space consumed by the extent, checksum blocks are included
461 * in these numbers 495 * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
471 */ 505 */
472 __le64 offset; 506 __le64 offset;
473 /* 507 /*
474 * the logical number of file blocks (no csums included) 508 * the logical number of file blocks (no csums included). This
509 * always reflects the size uncompressed and without encoding.
475 */ 510 */
476 __le64 num_bytes; 511 __le64 num_bytes;
512
477} __attribute__ ((__packed__)); 513} __attribute__ ((__packed__));
478 514
479struct btrfs_csum_item { 515struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
814#define BTRFS_MOUNT_NOBARRIER (1 << 2) 850#define BTRFS_MOUNT_NOBARRIER (1 << 2)
815#define BTRFS_MOUNT_SSD (1 << 3) 851#define BTRFS_MOUNT_SSD (1 << 3)
816#define BTRFS_MOUNT_DEGRADED (1 << 4) 852#define BTRFS_MOUNT_DEGRADED (1 << 4)
853#define BTRFS_MOUNT_COMPRESS (1 << 5)
817 854
818#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 855#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
819#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 856#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
825#define BTRFS_INODE_NODATASUM (1 << 0) 862#define BTRFS_INODE_NODATASUM (1 << 0)
826#define BTRFS_INODE_NODATACOW (1 << 1) 863#define BTRFS_INODE_NODATACOW (1 << 1)
827#define BTRFS_INODE_READONLY (1 << 2) 864#define BTRFS_INODE_READONLY (1 << 2)
865#define BTRFS_INODE_NOCOMPRESS (1 << 3)
828#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ 866#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
829 ~BTRFS_INODE_##flag) 867 ~BTRFS_INODE_##flag)
830#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ 868#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1424 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; 1462 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1425} 1463}
1426 1464
1427static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1428 struct btrfs_item *e)
1429{
1430 unsigned long offset;
1431 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1432 return btrfs_item_size(eb, e) - offset;
1433}
1434
1435BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, 1465BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1436 disk_bytenr, 64); 1466 disk_bytenr, 64);
1437BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, 1467BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1442 offset, 64); 1472 offset, 64);
1443BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, 1473BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1444 num_bytes, 64); 1474 num_bytes, 64);
1475BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1476 ram_bytes, 64);
1477BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1478 compression, 8);
1479BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1480 encryption, 8);
1481BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1482 other_encoding, 16);
1483
1484/* this returns the number of file bytes represented by the inline item.
1485 * If an item is compressed, this is the uncompressed size
1486 */
1487static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1488 struct btrfs_file_extent_item *e)
1489{
1490 return btrfs_file_extent_ram_bytes(eb, e);
1491}
1492
1493/*
1494 * this returns the number of bytes used by the item on disk, minus the
1495 * size of any extent headers. If a file is compressed on disk, this is
1496 * the compressed size
1497 */
1498static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1499 struct btrfs_item *e)
1500{
1501 unsigned long offset;
1502 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1503 return btrfs_item_size(eb, e) - offset;
1504}
1445 1505
1446static inline struct btrfs_root *btrfs_sb(struct super_block *sb) 1506static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1447{ 1507{
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1745int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 1805int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1746 struct bio *bio); 1806 struct bio *bio);
1747int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 1807int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1748 struct btrfs_root *root, 1808 struct btrfs_root *root,
1749 u64 objectid, u64 pos, u64 disk_offset, 1809 u64 objectid, u64 pos,
1750 u64 disk_num_bytes, 1810 u64 disk_offset, u64 disk_num_bytes,
1751 u64 num_bytes, u64 offset); 1811 u64 num_bytes, u64 offset, u64 ram_bytes,
1812 u8 compression, u8 encryption, u16 other_encoding);
1752int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 1813int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1753 struct btrfs_root *root, 1814 struct btrfs_root *root,
1754 struct btrfs_path *path, u64 objectid, 1815 struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1758 struct btrfs_ordered_sum *sums); 1819 struct btrfs_ordered_sum *sums);
1759int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 1820int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1760 struct bio *bio); 1821 struct bio *bio);
1822int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1823 u64 start, unsigned long len);
1761struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, 1824struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1762 struct btrfs_root *root, 1825 struct btrfs_root *root,
1763 struct btrfs_path *path, 1826 struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
1799 int namelen); 1862 int namelen);
1800 1863
1801int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1864int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1802 size_t size, struct bio *bio); 1865 size_t size, struct bio *bio, unsigned long bio_flags);
1803 1866
1804unsigned long btrfs_force_ra(struct address_space *mapping, 1867unsigned long btrfs_force_ra(struct address_space *mapping,
1805 struct file_ra_state *ra, struct file *file, 1868 struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb619..dc95f636a11 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
83 extent_submit_bio_hook_t *submit_bio_hook; 83 extent_submit_bio_hook_t *submit_bio_hook;
84 int rw; 84 int rw;
85 int mirror_num; 85 int mirror_num;
86 unsigned long bio_flags;
86 struct btrfs_work work; 87 struct btrfs_work work;
87}; 88};
88 89
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
115 } 116 }
116 em->start = 0; 117 em->start = 0;
117 em->len = (u64)-1; 118 em->len = (u64)-1;
119 em->block_len = (u64)-1;
118 em->block_start = 0; 120 em->block_start = 0;
119 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 121 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
120 122
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
469 wake_up(&fs_info->async_submit_wait); 471 wake_up(&fs_info->async_submit_wait);
470 472
471 async->submit_bio_hook(async->inode, async->rw, async->bio, 473 async->submit_bio_hook(async->inode, async->rw, async->bio,
472 async->mirror_num); 474 async->mirror_num, async->bio_flags);
473 kfree(async); 475 kfree(async);
474} 476}
475 477
476int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 478int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
477 int rw, struct bio *bio, int mirror_num, 479 int rw, struct bio *bio, int mirror_num,
480 unsigned long bio_flags,
478 extent_submit_bio_hook_t *submit_bio_hook) 481 extent_submit_bio_hook_t *submit_bio_hook)
479{ 482{
480 struct async_submit_bio *async; 483 struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
491 async->submit_bio_hook = submit_bio_hook; 494 async->submit_bio_hook = submit_bio_hook;
492 async->work.func = run_one_async_submit; 495 async->work.func = run_one_async_submit;
493 async->work.flags = 0; 496 async->work.flags = 0;
497 async->bio_flags = bio_flags;
494 498
495 while(atomic_read(&fs_info->async_submit_draining) && 499 while(atomic_read(&fs_info->async_submit_draining) &&
496 atomic_read(&fs_info->nr_async_submits)) { 500 atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
530} 534}
531 535
532static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 536static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
533 int mirror_num) 537 int mirror_num, unsigned long bio_flags)
534{ 538{
535 struct btrfs_root *root = BTRFS_I(inode)->root; 539 struct btrfs_root *root = BTRFS_I(inode)->root;
536 int ret; 540 int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
556} 560}
557 561
558static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 562static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
559 int mirror_num) 563 int mirror_num, unsigned long bio_flags)
560{ 564{
561 /* 565 /*
562 * kthread helpers are used to submit writes so that checksumming 566 * kthread helpers are used to submit writes so that checksumming
563 * can happen in parallel across all CPUs 567 * can happen in parallel across all CPUs
564 */ 568 */
565 if (!(rw & (1 << BIO_RW))) { 569 if (!(rw & (1 << BIO_RW))) {
566 return __btree_submit_bio_hook(inode, rw, bio, mirror_num); 570 return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
567 } 571 }
568 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 572 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
569 inode, rw, bio, mirror_num, 573 inode, rw, bio, mirror_num, 0,
570 __btree_submit_bio_hook); 574 __btree_submit_bio_hook);
571} 575}
572 576
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1407 fs_info->btree_inode = new_inode(sb); 1411 fs_info->btree_inode = new_inode(sb);
1408 fs_info->btree_inode->i_ino = 1; 1412 fs_info->btree_inode->i_ino = 1;
1409 fs_info->btree_inode->i_nlink = 1; 1413 fs_info->btree_inode->i_nlink = 1;
1414
1410 fs_info->thread_pool_size = min(num_online_cpus() + 2, 8); 1415 fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
1411 1416
1412 INIT_LIST_HEAD(&fs_info->ordered_extents); 1417 INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1508 */ 1513 */
1509 btrfs_init_workers(&fs_info->workers, "worker", 1514 btrfs_init_workers(&fs_info->workers, "worker",
1510 fs_info->thread_pool_size); 1515 fs_info->thread_pool_size);
1516
1511 btrfs_init_workers(&fs_info->submit_workers, "submit", 1517 btrfs_init_workers(&fs_info->submit_workers, "submit",
1512 min_t(u64, fs_devices->num_devices, 1518 min_t(u64, fs_devices->num_devices,
1513 fs_info->thread_pool_size)); 1519 fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1559 } 1565 }
1560 1566
1561 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1567 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1568 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1569 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
1562 1570
1563 nodesize = btrfs_super_nodesize(disk_super); 1571 nodesize = btrfs_super_nodesize(disk_super);
1564 leafsize = btrfs_super_leafsize(disk_super); 1572 leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbb..4eb1f1408d2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
71 int metadata); 71 int metadata);
72int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 72int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
73 int rw, struct bio *bio, int mirror_num, 73 int rw, struct bio *bio, int mirror_num,
74 unsigned long bio_flags,
74 extent_submit_bio_hook_t *submit_bio_hook); 75 extent_submit_bio_hook_t *submit_bio_hook);
75int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); 76int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
76unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 77unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6..bbf04e80a1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
3278 3278
3279 em->start = extent_key->objectid - offset; 3279 em->start = extent_key->objectid - offset;
3280 em->len = extent_key->offset; 3280 em->len = extent_key->offset;
3281 em->block_len = extent_key->offset;
3281 em->block_start = extent_key->objectid; 3282 em->block_start = extent_key->objectid;
3282 em->bdev = root->fs_info->fs_devices->latest_bdev; 3283 em->bdev = root->fs_info->fs_devices->latest_bdev;
3283 set_bit(EXTENT_FLAG_PINNED, &em->flags); 3284 set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
3314}; 3315};
3315 3316
3316struct disk_extent { 3317struct disk_extent {
3318 u64 ram_bytes;
3317 u64 disk_bytenr; 3319 u64 disk_bytenr;
3318 u64 disk_num_bytes; 3320 u64 disk_num_bytes;
3319 u64 offset; 3321 u64 offset;
3320 u64 num_bytes; 3322 u64 num_bytes;
3323 u8 compression;
3324 u8 encryption;
3325 u16 other_encoding;
3321}; 3326};
3322 3327
3323static int is_cowonly_root(u64 root_objectid) 3328static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
3631 btrfs_file_extent_disk_num_bytes(leaf, fi); 3636 btrfs_file_extent_disk_num_bytes(leaf, fi);
3632 exts[nr].offset = btrfs_file_extent_offset(leaf, fi); 3637 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
3633 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 3638 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
3639 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
3640 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
3641 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
3642 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
3643 fi);
3634 WARN_ON(exts[nr].offset > 0); 3644 WARN_ON(exts[nr].offset > 0);
3635 WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); 3645 WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
3636 3646
@@ -3846,6 +3856,8 @@ next:
3846 new_extents[0].disk_bytenr); 3856 new_extents[0].disk_bytenr);
3847 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 3857 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
3848 new_extents[0].disk_num_bytes); 3858 new_extents[0].disk_num_bytes);
3859 btrfs_set_file_extent_ram_bytes(leaf, fi,
3860 new_extents[0].ram_bytes);
3849 ext_offset += new_extents[0].offset; 3861 ext_offset += new_extents[0].offset;
3850 btrfs_set_file_extent_offset(leaf, fi, ext_offset); 3862 btrfs_set_file_extent_offset(leaf, fi, ext_offset);
3851 btrfs_mark_buffer_dirty(leaf); 3863 btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
3911 new_extents[i].disk_bytenr); 3923 new_extents[i].disk_bytenr);
3912 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 3924 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
3913 new_extents[i].disk_num_bytes); 3925 new_extents[i].disk_num_bytes);
3926 btrfs_set_file_extent_ram_bytes(leaf, fi,
3927 new_extents[i].ram_bytes);
3928
3929 btrfs_set_file_extent_compression(leaf, fi,
3930 new_extents[i].compression);
3931 btrfs_set_file_extent_encryption(leaf, fi,
3932 new_extents[i].encryption);
3933 btrfs_set_file_extent_other_encoding(leaf, fi,
3934 new_extents[i].other_encoding);
3935
3914 btrfs_set_file_extent_num_bytes(leaf, fi, 3936 btrfs_set_file_extent_num_bytes(leaf, fi,
3915 extent_len); 3937 extent_len);
3916 ext_offset += new_extents[i].offset; 3938 ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4169 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; 4191 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4170 4192
4171 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 4193 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
4194 btrfs_set_file_extent_ram_bytes(leaf, fi,
4195 new_extent->ram_bytes);
4172 btrfs_set_file_extent_disk_bytenr(leaf, fi, 4196 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4173 new_extent->disk_bytenr); 4197 new_extent->disk_bytenr);
4174 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 4198 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
4847 BUG_ON(err); 4871 BUG_ON(err);
4848 4872
4849 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, 4873 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
4850 group->key.offset, 0); 4874 group->key.offset, 0, group->key.offset,
4875 0, 0, 0);
4851 BUG_ON(err); 4876 BUG_ON(err);
4852 4877
4853 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); 4878 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f..314041fdfa4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
30static LIST_HEAD(buffers); 30static LIST_HEAD(buffers);
31static LIST_HEAD(states); 31static LIST_HEAD(states);
32 32
33#define LEAK_DEBUG 1
33#ifdef LEAK_DEBUG 34#ifdef LEAK_DEBUG
34static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED; 35static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
35#endif 36#endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
1067 * 1068 *
1068 * 1 is returned if we find something, 0 if nothing was in the tree 1069 * 1 is returned if we find something, 0 if nothing was in the tree
1069 */ 1070 */
1070static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, 1071static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1071 u64 *start, u64 *end, u64 max_bytes) 1072 u64 *start, u64 *end, u64 max_bytes)
1072{ 1073{
1073 struct rb_node *node; 1074 struct rb_node *node;
1074 struct extent_state *state; 1075 struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
1077 u64 total_bytes = 0; 1078 u64 total_bytes = 0;
1078 1079
1079 spin_lock_irq(&tree->lock); 1080 spin_lock_irq(&tree->lock);
1081
1080 /* 1082 /*
1081 * this search will find all the extents that end after 1083 * this search will find all the extents that end after
1082 * our range starts. 1084 * our range starts.
1083 */ 1085 */
1084search_again:
1085 node = tree_search(tree, cur_start); 1086 node = tree_search(tree, cur_start);
1086 if (!node) { 1087 if (!node) {
1087 if (!found) 1088 if (!found)
@@ -1100,40 +1101,6 @@ search_again:
1100 *end = state->end; 1101 *end = state->end;
1101 goto out; 1102 goto out;
1102 } 1103 }
1103 if (!found && !(state->state & EXTENT_BOUNDARY)) {
1104 struct extent_state *prev_state;
1105 struct rb_node *prev_node = node;
1106 while(1) {
1107 prev_node = rb_prev(prev_node);
1108 if (!prev_node)
1109 break;
1110 prev_state = rb_entry(prev_node,
1111 struct extent_state,
1112 rb_node);
1113 if ((prev_state->end + 1 != state->start) ||
1114 !(prev_state->state & EXTENT_DELALLOC))
1115 break;
1116 if ((cur_start - prev_state->start) * 2 >
1117 max_bytes)
1118 break;
1119 state = prev_state;
1120 node = prev_node;
1121 }
1122 }
1123 if (state->state & EXTENT_LOCKED) {
1124 DEFINE_WAIT(wait);
1125 atomic_inc(&state->refs);
1126 prepare_to_wait(&state->wq, &wait,
1127 TASK_UNINTERRUPTIBLE);
1128 spin_unlock_irq(&tree->lock);
1129 schedule();
1130 spin_lock_irq(&tree->lock);
1131 finish_wait(&state->wq, &wait);
1132 free_extent_state(state);
1133 goto search_again;
1134 }
1135 set_state_cb(tree, state, EXTENT_LOCKED);
1136 state->state |= EXTENT_LOCKED;
1137 if (!found) 1104 if (!found)
1138 *start = state->start; 1105 *start = state->start;
1139 found++; 1106 found++;
@@ -1151,6 +1118,208 @@ out:
1151 return found; 1118 return found;
1152} 1119}
1153 1120
1121static noinline int __unlock_for_delalloc(struct inode *inode,
1122 struct page *locked_page,
1123 u64 start, u64 end)
1124{
1125 int ret;
1126 struct page *pages[16];
1127 unsigned long index = start >> PAGE_CACHE_SHIFT;
1128 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1129 unsigned long nr_pages = end_index - index + 1;
1130 int i;
1131
1132 if (index == locked_page->index && end_index == index)
1133 return 0;
1134
1135 while(nr_pages > 0) {
1136 ret = find_get_pages_contig(inode->i_mapping, index,
1137 min(nr_pages, ARRAY_SIZE(pages)), pages);
1138 for (i = 0; i < ret; i++) {
1139 if (pages[i] != locked_page)
1140 unlock_page(pages[i]);
1141 page_cache_release(pages[i]);
1142 }
1143 nr_pages -= ret;
1144 index += ret;
1145 cond_resched();
1146 }
1147 return 0;
1148}
1149
1150static noinline int lock_delalloc_pages(struct inode *inode,
1151 struct page *locked_page,
1152 u64 delalloc_start,
1153 u64 delalloc_end)
1154{
1155 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1156 unsigned long start_index = index;
1157 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1158 unsigned long pages_locked = 0;
1159 struct page *pages[16];
1160 unsigned long nrpages;
1161 int ret;
1162 int i;
1163
1164 /* the caller is responsible for locking the start index */
1165 if (index == locked_page->index && index == end_index)
1166 return 0;
1167
1168 /* skip the page at the start index */
1169 nrpages = end_index - index + 1;
1170 while(nrpages > 0) {
1171 ret = find_get_pages_contig(inode->i_mapping, index,
1172 min(nrpages, ARRAY_SIZE(pages)), pages);
1173 if (ret == 0) {
1174 ret = -EAGAIN;
1175 goto done;
1176 }
1177 /* now we have an array of pages, lock them all */
1178 for (i = 0; i < ret; i++) {
1179 /*
1180 * the caller is taking responsibility for
1181 * locked_page
1182 */
1183 if (pages[i] != locked_page)
1184 lock_page(pages[i]);
1185 page_cache_release(pages[i]);
1186 }
1187 pages_locked += ret;
1188 nrpages -= ret;
1189 index += ret;
1190 cond_resched();
1191 }
1192 ret = 0;
1193done:
1194 if (ret && pages_locked) {
1195 __unlock_for_delalloc(inode, locked_page,
1196 delalloc_start,
1197 ((u64)(start_index + pages_locked - 1)) <<
1198 PAGE_CACHE_SHIFT);
1199 }
1200 return ret;
1201}
1202
1203/*
1204 * find a contiguous range of bytes in the file marked as delalloc, not
1205 * more than 'max_bytes'. start and end are used to return the range,
1206 *
1207 * 1 is returned if we find something, 0 if nothing was in the tree
1208 */
1209static noinline u64 find_lock_delalloc_range(struct inode *inode,
1210 struct extent_io_tree *tree,
1211 struct page *locked_page,
1212 u64 *start, u64 *end,
1213 u64 max_bytes)
1214{
1215 u64 delalloc_start;
1216 u64 delalloc_end;
1217 u64 found;
1218 int ret;
1219 int loops = 0;
1220
1221again:
1222 /* step one, find a bunch of delalloc bytes starting at start */
1223 delalloc_start = *start;
1224 delalloc_end = 0;
1225 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1226 max_bytes);
1227 if (!found) {
1228 *start = delalloc_start;
1229 *end = delalloc_end;
1230 return found;
1231 }
1232
1233 /*
1234 * make sure to limit the number of pages we try to lock down
1235 * if we're looping.
1236 */
1237 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
1238 delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
1239 ~((u64)PAGE_CACHE_SIZE - 1);
1240 }
1241 /* step two, lock all the pages after the page that has start */
1242 ret = lock_delalloc_pages(inode, locked_page,
1243 delalloc_start, delalloc_end);
1244 if (ret == -EAGAIN) {
1245 /* some of the pages are gone, lets avoid looping by
1246 * shortening the size of the delalloc range we're searching
1247 */
1248 if (!loops) {
1249 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1250 max_bytes = PAGE_CACHE_SIZE - offset;
1251 loops = 1;
1252 goto again;
1253 } else {
1254 found = 0;
1255 goto out_failed;
1256 }
1257 }
1258 BUG_ON(ret);
1259
1260 /* step three, lock the state bits for the whole range */
1261 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1262
1263 /* then test to make sure it is all still delalloc */
1264 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1265 EXTENT_DELALLOC, 1);
1266 if (!ret) {
1267 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1268 __unlock_for_delalloc(inode, locked_page,
1269 delalloc_start, delalloc_end);
1270 cond_resched();
1271 goto again;
1272 }
1273 *start = delalloc_start;
1274 *end = delalloc_end;
1275out_failed:
1276 return found;
1277}
1278
1279int extent_clear_unlock_delalloc(struct inode *inode,
1280 struct extent_io_tree *tree,
1281 u64 start, u64 end, struct page *locked_page,
1282 int clear_dirty, int set_writeback,
1283 int end_writeback)
1284{
1285 int ret;
1286 struct page *pages[16];
1287 unsigned long index = start >> PAGE_CACHE_SHIFT;
1288 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1289 unsigned long nr_pages = end_index - index + 1;
1290 int i;
1291 int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
1292
1293 if (clear_dirty)
1294 clear_bits |= EXTENT_DIRTY;
1295
1296 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1297
1298 while(nr_pages > 0) {
1299 ret = find_get_pages_contig(inode->i_mapping, index,
1300 min(nr_pages, ARRAY_SIZE(pages)), pages);
1301 for (i = 0; i < ret; i++) {
1302 if (pages[i] == locked_page) {
1303 page_cache_release(pages[i]);
1304 continue;
1305 }
1306 if (clear_dirty)
1307 clear_page_dirty_for_io(pages[i]);
1308 if (set_writeback)
1309 set_page_writeback(pages[i]);
1310 if (end_writeback)
1311 end_page_writeback(pages[i]);
1312 unlock_page(pages[i]);
1313 page_cache_release(pages[i]);
1314 }
1315 nr_pages -= ret;
1316 index += ret;
1317 cond_resched();
1318 }
1319 return 0;
1320}
1321EXPORT_SYMBOL(extent_clear_unlock_delalloc);
1322
1154/* 1323/*
1155 * count the number of bytes in the tree that have a given bit(s) 1324 * count the number of bytes in the tree that have a given bit(s)
1156 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1325 * set. This can be fairly slow, except for EXTENT_DIRTY which is
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1631 return bio; 1800 return bio;
1632} 1801}
1633 1802
1634static int submit_one_bio(int rw, struct bio *bio, int mirror_num) 1803static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1804 unsigned long bio_flags)
1635{ 1805{
1636 int ret = 0; 1806 int ret = 0;
1637 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1807 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1638 struct page *page = bvec->bv_page; 1808 struct page *page = bvec->bv_page;
1639 struct extent_io_tree *tree = bio->bi_private; 1809 struct extent_io_tree *tree = bio->bi_private;
1640 struct rb_node *node;
1641 struct extent_state *state;
1642 u64 start; 1810 u64 start;
1643 u64 end; 1811 u64 end;
1644 1812
1645 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1813 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1646 end = start + bvec->bv_len - 1; 1814 end = start + bvec->bv_len - 1;
1647 1815
1648 spin_lock_irq(&tree->lock);
1649 node = __etree_search(tree, start, NULL, NULL);
1650 BUG_ON(!node);
1651 state = rb_entry(node, struct extent_state, rb_node);
1652 while(state->end < end) {
1653 node = rb_next(node);
1654 state = rb_entry(node, struct extent_state, rb_node);
1655 }
1656 BUG_ON(state->end != end);
1657 spin_unlock_irq(&tree->lock);
1658
1659 bio->bi_private = NULL; 1816 bio->bi_private = NULL;
1660 1817
1661 bio_get(bio); 1818 bio_get(bio);
1662 1819
1663 if (tree->ops && tree->ops->submit_bio_hook) 1820 if (tree->ops && tree->ops->submit_bio_hook)
1664 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1821 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1665 mirror_num); 1822 mirror_num, bio_flags);
1666 else 1823 else
1667 submit_bio(rw, bio); 1824 submit_bio(rw, bio);
1668 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1825 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1678 struct bio **bio_ret, 1835 struct bio **bio_ret,
1679 unsigned long max_pages, 1836 unsigned long max_pages,
1680 bio_end_io_t end_io_func, 1837 bio_end_io_t end_io_func,
1681 int mirror_num) 1838 int mirror_num,
1839 unsigned long prev_bio_flags,
1840 unsigned long bio_flags)
1682{ 1841{
1683 int ret = 0; 1842 int ret = 0;
1684 struct bio *bio; 1843 struct bio *bio;
1685 int nr; 1844 int nr;
1845 int contig = 0;
1846 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1847 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1848 size_t page_size = min(size, PAGE_CACHE_SIZE);
1686 1849
1687 if (bio_ret && *bio_ret) { 1850 if (bio_ret && *bio_ret) {
1688 bio = *bio_ret; 1851 bio = *bio_ret;
1689 if (bio->bi_sector + (bio->bi_size >> 9) != sector || 1852 if (old_compressed)
1853 contig = bio->bi_sector == sector;
1854 else
1855 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1856 sector;
1857
1858 if (prev_bio_flags != bio_flags || !contig ||
1690 (tree->ops && tree->ops->merge_bio_hook && 1859 (tree->ops && tree->ops->merge_bio_hook &&
1691 tree->ops->merge_bio_hook(page, offset, size, bio)) || 1860 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1692 bio_add_page(bio, page, size, offset) < size) { 1861 bio_flags)) ||
1693 ret = submit_one_bio(rw, bio, mirror_num); 1862 bio_add_page(bio, page, page_size, offset) < page_size) {
1863 ret = submit_one_bio(rw, bio, mirror_num,
1864 prev_bio_flags);
1694 bio = NULL; 1865 bio = NULL;
1695 } else { 1866 } else {
1696 return 0; 1867 return 0;
1697 } 1868 }
1698 } 1869 }
1699 nr = bio_get_nr_vecs(bdev); 1870 if (this_compressed)
1871 nr = BIO_MAX_PAGES;
1872 else
1873 nr = bio_get_nr_vecs(bdev);
1874
1700 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1875 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1701 if (!bio) { 1876 if (!bio) {
1702 printk("failed to allocate bio nr %d\n", nr); 1877 printk("failed to allocate bio nr %d\n", nr);
1703 } 1878 }
1704 1879
1705 1880 bio_add_page(bio, page, page_size, offset);
1706 bio_add_page(bio, page, size, offset);
1707 bio->bi_end_io = end_io_func; 1881 bio->bi_end_io = end_io_func;
1708 bio->bi_private = tree; 1882 bio->bi_private = tree;
1709 1883
1710 if (bio_ret) { 1884 if (bio_ret) {
1711 *bio_ret = bio; 1885 *bio_ret = bio;
1712 } else { 1886 } else {
1713 ret = submit_one_bio(rw, bio, mirror_num); 1887 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1714 } 1888 }
1715 1889
1716 return ret; 1890 return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
1738static int __extent_read_full_page(struct extent_io_tree *tree, 1912static int __extent_read_full_page(struct extent_io_tree *tree,
1739 struct page *page, 1913 struct page *page,
1740 get_extent_t *get_extent, 1914 get_extent_t *get_extent,
1741 struct bio **bio, int mirror_num) 1915 struct bio **bio, int mirror_num,
1916 unsigned long *bio_flags)
1742{ 1917{
1743 struct inode *inode = page->mapping->host; 1918 struct inode *inode = page->mapping->host;
1744 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1919 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
1756 int nr = 0; 1931 int nr = 0;
1757 size_t page_offset = 0; 1932 size_t page_offset = 0;
1758 size_t iosize; 1933 size_t iosize;
1934 size_t disk_io_size;
1759 size_t blocksize = inode->i_sb->s_blocksize; 1935 size_t blocksize = inode->i_sb->s_blocksize;
1936 unsigned long this_bio_flag = 0;
1760 1937
1761 set_page_extent_mapped(page); 1938 set_page_extent_mapped(page);
1762 1939
1763 end = page_end; 1940 end = page_end;
1764 lock_extent(tree, start, end, GFP_NOFS); 1941 lock_extent(tree, start, end, GFP_NOFS);
1765 1942
1943 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1944 char *userpage;
1945 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1946
1947 if (zero_offset) {
1948 iosize = PAGE_CACHE_SIZE - zero_offset;
1949 userpage = kmap_atomic(page, KM_USER0);
1950 memset(userpage + zero_offset, 0, iosize);
1951 flush_dcache_page(page);
1952 kunmap_atomic(userpage, KM_USER0);
1953 }
1954 }
1766 while (cur <= end) { 1955 while (cur <= end) {
1767 if (cur >= last_byte) { 1956 if (cur >= last_byte) {
1768 char *userpage; 1957 char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1793 } 1982 }
1794 BUG_ON(end < cur); 1983 BUG_ON(end < cur);
1795 1984
1985 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1986 this_bio_flag = EXTENT_BIO_COMPRESSED;
1987
1796 iosize = min(extent_map_end(em) - cur, end - cur + 1); 1988 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1797 cur_end = min(extent_map_end(em) - 1, end); 1989 cur_end = min(extent_map_end(em) - 1, end);
1798 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1990 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1799 sector = (em->block_start + extent_offset) >> 9; 1991 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
1992 disk_io_size = em->block_len;
1993 sector = em->block_start >> 9;
1994 } else {
1995 sector = (em->block_start + extent_offset) >> 9;
1996 disk_io_size = iosize;
1997 }
1800 bdev = em->bdev; 1998 bdev = em->bdev;
1801 block_start = em->block_start; 1999 block_start = em->block_start;
1802 free_extent_map(em); 2000 free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1845 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2043 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1846 pnr -= page->index; 2044 pnr -= page->index;
1847 ret = submit_extent_page(READ, tree, page, 2045 ret = submit_extent_page(READ, tree, page,
1848 sector, iosize, page_offset, 2046 sector, disk_io_size, page_offset,
1849 bdev, bio, pnr, 2047 bdev, bio, pnr,
1850 end_bio_extent_readpage, mirror_num); 2048 end_bio_extent_readpage, mirror_num,
2049 *bio_flags,
2050 this_bio_flag);
1851 nr++; 2051 nr++;
2052 *bio_flags = this_bio_flag;
1852 } 2053 }
1853 if (ret) 2054 if (ret)
1854 SetPageError(page); 2055 SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1867 get_extent_t *get_extent) 2068 get_extent_t *get_extent)
1868{ 2069{
1869 struct bio *bio = NULL; 2070 struct bio *bio = NULL;
2071 unsigned long bio_flags = 0;
1870 int ret; 2072 int ret;
1871 2073
1872 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0); 2074 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2075 &bio_flags);
1873 if (bio) 2076 if (bio)
1874 submit_one_bio(READ, bio, 0); 2077 submit_one_bio(READ, bio, 0, bio_flags);
1875 return ret; 2078 return ret;
1876} 2079}
1877EXPORT_SYMBOL(extent_read_full_page); 2080EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1909 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2112 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1910 u64 nr_delalloc; 2113 u64 nr_delalloc;
1911 u64 delalloc_end; 2114 u64 delalloc_end;
2115 int page_started;
2116 int compressed;
1912 2117
1913 WARN_ON(!PageLocked(page)); 2118 WARN_ON(!PageLocked(page));
1914 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2119 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1934 2139
1935 delalloc_start = start; 2140 delalloc_start = start;
1936 delalloc_end = 0; 2141 delalloc_end = 0;
2142 page_started = 0;
1937 while(delalloc_end < page_end) { 2143 while(delalloc_end < page_end) {
1938 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, 2144 nr_delalloc = find_lock_delalloc_range(inode, tree,
2145 page,
2146 &delalloc_start,
1939 &delalloc_end, 2147 &delalloc_end,
1940 128 * 1024 * 1024); 2148 128 * 1024 * 1024);
1941 if (nr_delalloc == 0) { 2149 if (nr_delalloc == 0) {
1942 delalloc_start = delalloc_end + 1; 2150 delalloc_start = delalloc_end + 1;
1943 continue; 2151 continue;
1944 } 2152 }
1945 tree->ops->fill_delalloc(inode, delalloc_start, 2153 tree->ops->fill_delalloc(inode, page, delalloc_start,
1946 delalloc_end); 2154 delalloc_end, &page_started);
1947 clear_extent_bit(tree, delalloc_start,
1948 delalloc_end,
1949 EXTENT_LOCKED | EXTENT_DELALLOC,
1950 1, 0, GFP_NOFS);
1951 delalloc_start = delalloc_end + 1; 2155 delalloc_start = delalloc_end + 1;
1952 } 2156 }
2157
2158 /* did the fill delalloc function already unlock and start the IO? */
2159 if (page_started) {
2160 return 0;
2161 }
2162
1953 lock_extent(tree, start, page_end, GFP_NOFS); 2163 lock_extent(tree, start, page_end, GFP_NOFS);
1954 unlock_start = start; 2164 unlock_start = start;
1955 2165
1956 if (tree->ops && tree->ops->writepage_start_hook) { 2166 if (tree->ops && tree->ops->writepage_start_hook) {
1957 ret = tree->ops->writepage_start_hook(page, start, page_end); 2167 ret = tree->ops->writepage_start_hook(page, start,
2168 page_end);
1958 if (ret == -EAGAIN) { 2169 if (ret == -EAGAIN) {
1959 unlock_extent(tree, start, page_end, GFP_NOFS); 2170 unlock_extent(tree, start, page_end, GFP_NOFS);
1960 redirty_page_for_writepage(wbc, page); 2171 redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2006 sector = (em->block_start + extent_offset) >> 9; 2217 sector = (em->block_start + extent_offset) >> 9;
2007 bdev = em->bdev; 2218 bdev = em->bdev;
2008 block_start = em->block_start; 2219 block_start = em->block_start;
2220 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2009 free_extent_map(em); 2221 free_extent_map(em);
2010 em = NULL; 2222 em = NULL;
2011 2223
2012 if (block_start == EXTENT_MAP_HOLE || 2224 /*
2225 * compressed and inline extents are written through other
2226 * paths in the FS
2227 */
2228 if (compressed || block_start == EXTENT_MAP_HOLE ||
2013 block_start == EXTENT_MAP_INLINE) { 2229 block_start == EXTENT_MAP_INLINE) {
2014 clear_extent_dirty(tree, cur, 2230 clear_extent_dirty(tree, cur,
2015 cur + iosize - 1, GFP_NOFS); 2231 cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2017 unlock_extent(tree, unlock_start, cur + iosize -1, 2233 unlock_extent(tree, unlock_start, cur + iosize -1,
2018 GFP_NOFS); 2234 GFP_NOFS);
2019 2235
2020 if (tree->ops && tree->ops->writepage_end_io_hook) 2236 /*
2237 * end_io notification does not happen here for
2238 * compressed extents
2239 */
2240 if (!compressed && tree->ops &&
2241 tree->ops->writepage_end_io_hook)
2021 tree->ops->writepage_end_io_hook(page, cur, 2242 tree->ops->writepage_end_io_hook(page, cur,
2022 cur + iosize - 1, 2243 cur + iosize - 1,
2023 NULL, 1); 2244 NULL, 1);
2024 cur = cur + iosize; 2245 else if (compressed) {
2246 /* we don't want to end_page_writeback on
2247 * a compressed extent. this happens
2248 * elsewhere
2249 */
2250 nr++;
2251 }
2252
2253 cur += iosize;
2025 pg_offset += iosize; 2254 pg_offset += iosize;
2026 unlock_start = cur; 2255 unlock_start = cur;
2027 continue; 2256 continue;
2028 } 2257 }
2029
2030 /* leave this out until we have a page_mkwrite call */ 2258 /* leave this out until we have a page_mkwrite call */
2031 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2259 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2032 EXTENT_DIRTY, 0)) { 2260 EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2034 pg_offset += iosize; 2262 pg_offset += iosize;
2035 continue; 2263 continue;
2036 } 2264 }
2265
2037 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); 2266 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2038 if (tree->ops && tree->ops->writepage_io_hook) { 2267 if (tree->ops && tree->ops->writepage_io_hook) {
2039 ret = tree->ops->writepage_io_hook(page, cur, 2268 ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2057 ret = submit_extent_page(WRITE, tree, page, sector, 2286 ret = submit_extent_page(WRITE, tree, page, sector,
2058 iosize, pg_offset, bdev, 2287 iosize, pg_offset, bdev,
2059 &epd->bio, max_nr, 2288 &epd->bio, max_nr,
2060 end_bio_extent_writepage, 0); 2289 end_bio_extent_writepage,
2290 0, 0, 0);
2061 if (ret) 2291 if (ret)
2062 SetPageError(page); 2292 SetPageError(page);
2063 } 2293 }
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2226 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2456 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2227 __extent_writepage, &epd); 2457 __extent_writepage, &epd);
2228 if (epd.bio) { 2458 if (epd.bio) {
2229 submit_one_bio(WRITE, epd.bio, 0); 2459 submit_one_bio(WRITE, epd.bio, 0, 0);
2230 } 2460 }
2231 return ret; 2461 return ret;
2232} 2462}
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
2248 ret = extent_write_cache_pages(tree, mapping, wbc, 2478 ret = extent_write_cache_pages(tree, mapping, wbc,
2249 __extent_writepage, &epd); 2479 __extent_writepage, &epd);
2250 if (epd.bio) { 2480 if (epd.bio) {
2251 submit_one_bio(WRITE, epd.bio, 0); 2481 submit_one_bio(WRITE, epd.bio, 0, 0);
2252 } 2482 }
2253 return ret; 2483 return ret;
2254} 2484}
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
2262 struct bio *bio = NULL; 2492 struct bio *bio = NULL;
2263 unsigned page_idx; 2493 unsigned page_idx;
2264 struct pagevec pvec; 2494 struct pagevec pvec;
2495 unsigned long bio_flags = 0;
2265 2496
2266 pagevec_init(&pvec, 0); 2497 pagevec_init(&pvec, 0);
2267 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2498 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
2281 if (!pagevec_add(&pvec, page)) 2512 if (!pagevec_add(&pvec, page))
2282 __pagevec_lru_add(&pvec); 2513 __pagevec_lru_add(&pvec);
2283 __extent_read_full_page(tree, page, get_extent, 2514 __extent_read_full_page(tree, page, get_extent,
2284 &bio, 0); 2515 &bio, 0, &bio_flags);
2285 } 2516 }
2286 page_cache_release(page); 2517 page_cache_release(page);
2287 } 2518 }
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
2289 __pagevec_lru_add(&pvec); 2520 __pagevec_lru_add(&pvec);
2290 BUG_ON(!list_empty(pages)); 2521 BUG_ON(!list_empty(pages));
2291 if (bio) 2522 if (bio)
2292 submit_one_bio(READ, bio, 0); 2523 submit_one_bio(READ, bio, 0, bio_flags);
2293 return 0; 2524 return 0;
2294} 2525}
2295EXPORT_SYMBOL(extent_readpages); 2526EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
2414 ret = submit_extent_page(READ, tree, page, 2645 ret = submit_extent_page(READ, tree, page,
2415 sector, iosize, page_offset, em->bdev, 2646 sector, iosize, page_offset, em->bdev,
2416 NULL, 1, 2647 NULL, 1,
2417 end_bio_extent_preparewrite, 0); 2648 end_bio_extent_preparewrite, 0,
2649 0, 0);
2418 iocount++; 2650 iocount++;
2419 block_start = block_start + iosize; 2651 block_start = block_start + iosize;
2420 } else { 2652 } else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2495 } 2727 }
2496 if (!test_range_bit(tree, em->start, 2728 if (!test_range_bit(tree, em->start,
2497 extent_map_end(em) - 1, 2729 extent_map_end(em) - 1,
2498 EXTENT_LOCKED, 0)) { 2730 EXTENT_LOCKED | EXTENT_WRITEBACK |
2731 EXTENT_ORDERED,
2732 0)) {
2499 remove_extent_mapping(map, em); 2733 remove_extent_mapping(map, em);
2500 /* once for the rb tree */ 2734 /* once for the rb tree */
2501 free_extent_map(em); 2735 free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2923 int inc_all_pages = 0; 3157 int inc_all_pages = 0;
2924 unsigned long num_pages; 3158 unsigned long num_pages;
2925 struct bio *bio = NULL; 3159 struct bio *bio = NULL;
3160 unsigned long bio_flags = 0;
2926 3161
2927 if (eb->flags & EXTENT_UPTODATE) 3162 if (eb->flags & EXTENT_UPTODATE)
2928 return 0; 3163 return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2973 ClearPageError(page); 3208 ClearPageError(page);
2974 err = __extent_read_full_page(tree, page, 3209 err = __extent_read_full_page(tree, page,
2975 get_extent, &bio, 3210 get_extent, &bio,
2976 mirror_num); 3211 mirror_num, &bio_flags);
2977 if (err) { 3212 if (err) {
2978 ret = err; 3213 ret = err;
2979 printk("err %d from __extent_read_full_page\n", ret); 3214 printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2984 } 3219 }
2985 3220
2986 if (bio) 3221 if (bio)
2987 submit_one_bio(READ, bio, mirror_num); 3222 submit_one_bio(READ, bio, mirror_num, bio_flags);
2988 3223
2989 if (ret || !wait) { 3224 if (ret || !wait) {
2990 if (ret) 3225 if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae..86f859b87a6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
18#define EXTENT_BOUNDARY (1 << 11) 18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
20 20
21/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1
23
21/* 24/*
22 * page->private values. Every page that is controlled by the extent 25 * page->private values. Every page that is controlled by the extent
23 * map has page->private set to one. 26 * map has page->private set to one.
@@ -28,14 +31,17 @@
28struct extent_state; 31struct extent_state;
29 32
30typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 33typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
31 struct bio *bio, int mirror_num); 34 struct bio *bio, int mirror_num,
35 unsigned long bio_flags);
32struct extent_io_ops { 36struct extent_io_ops {
33 int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); 37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
38 u64 start, u64 end, int *page_started);
34 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 39 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
35 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 40 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
36 extent_submit_bio_hook_t *submit_bio_hook; 41 extent_submit_bio_hook_t *submit_bio_hook;
37 int (*merge_bio_hook)(struct page *page, unsigned long offset, 42 int (*merge_bio_hook)(struct page *page, unsigned long offset,
38 size_t size, struct bio *bio); 43 size_t size, struct bio *bio,
44 unsigned long bio_flags);
39 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 45 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
40 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 46 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
41 u64 start, u64 end, 47 u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
245int release_extent_buffer_tail_pages(struct extent_buffer *eb); 251int release_extent_buffer_tail_pages(struct extent_buffer *eb);
246int extent_range_uptodate(struct extent_io_tree *tree, 252int extent_range_uptodate(struct extent_io_tree *tree,
247 u64 start, u64 end); 253 u64 start, u64 end);
254int extent_clear_unlock_delalloc(struct inode *inode,
255 struct extent_io_tree *tree,
256 u64 start, u64 end, struct page *locked_page,
257 int clear_dirty, int set_writeback,
258 int clear_writeback);
248#endif 259#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d..fd3ebfb8c3c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) 184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0; 185 return 0;
186 186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
187 if (extent_map_end(prev) == next->start && 194 if (extent_map_end(prev) == next->start &&
188 prev->flags == next->flags && 195 prev->flags == next->flags &&
189 prev->bdev == next->bdev && 196 prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
239 if (rb && mergable_maps(merge, em)) { 246 if (rb && mergable_maps(merge, em)) {
240 em->start = merge->start; 247 em->start = merge->start;
241 em->len += merge->len; 248 em->len += merge->len;
249 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 250 em->block_start = merge->block_start;
243 merge->in_tree = 0; 251 merge->in_tree = 0;
244 rb_erase(&merge->rb_node, &tree->map); 252 rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
250 merge = rb_entry(rb, struct extent_map, rb_node); 258 merge = rb_entry(rb, struct extent_map, rb_node);
251 if (rb && mergable_maps(em, merge)) { 259 if (rb && mergable_maps(em, merge)) {
252 em->len += merge->len; 260 em->len += merge->len;
261 em->block_len += merge->len;
253 rb_erase(&merge->rb_node, &tree->map); 262 rb_erase(&merge->rb_node, &tree->map);
254 merge->in_tree = 0; 263 merge->in_tree = 0;
255 free_extent_map(merge); 264 free_extent_map(merge);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b26..abbcbeb28c7 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
10 10
11/* bits for the flags field */ 11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ 12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
13 14
14struct extent_map { 15struct extent_map {
15 struct rb_node rb_node; 16 struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
18 u64 start; 19 u64 start;
19 u64 len; 20 u64 len;
20 u64 block_start; 21 u64 block_start;
22 u64 block_len;
21 unsigned long flags; 23 unsigned long flags;
22 struct block_device *bdev; 24 struct block_device *bdev;
23 atomic_t refs; 25 atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
38 40
39static inline u64 extent_map_block_end(struct extent_map *em) 41static inline u64 extent_map_block_end(struct extent_map *em)
40{ 42{
41 if (em->block_start + em->len < em->block_start) 43 if (em->block_start + em->block_len < em->block_start)
42 return (u64)-1; 44 return (u64)-1;
43 return em->block_start + em->len; 45 return em->block_start + em->block_len;
44} 46}
45 47
46void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); 48void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d..f4d3fa71bc4 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, 31 struct btrfs_root *root,
32 u64 objectid, u64 pos, 32 u64 objectid, u64 pos,
33 u64 disk_offset, u64 disk_num_bytes, 33 u64 disk_offset, u64 disk_num_bytes,
34 u64 num_bytes, u64 offset) 34 u64 num_bytes, u64 offset, u64 ram_bytes,
35 u8 compression, u8 encryption, u16 other_encoding)
35{ 36{
36 int ret = 0; 37 int ret = 0;
37 struct btrfs_file_extent_item *item; 38 struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
57 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); 58 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
58 btrfs_set_file_extent_offset(leaf, item, offset); 59 btrfs_set_file_extent_offset(leaf, item, offset);
59 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); 60 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
61 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
60 btrfs_set_file_extent_generation(leaf, item, trans->transid); 62 btrfs_set_file_extent_generation(leaf, item, trans->transid);
61 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 63 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
64 btrfs_set_file_extent_compression(leaf, item, compression);
65 btrfs_set_file_extent_encryption(leaf, item, encryption);
66 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
67
62 btrfs_mark_buffer_dirty(leaf); 68 btrfs_mark_buffer_dirty(leaf);
63out: 69out:
64 btrfs_free_path(path); 70 btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
213 return 0; 219 return 0;
214} 220}
215 221
222int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
223 u64 start, unsigned long len)
224{
225 struct btrfs_ordered_sum *sums;
226 struct btrfs_sector_sum *sector_sum;
227 struct btrfs_ordered_extent *ordered;
228 char *data;
229 struct page *page;
230 unsigned long total_bytes = 0;
231 unsigned long this_sum_bytes = 0;
232
233 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
234 if (!sums)
235 return -ENOMEM;
236
237 sector_sum = sums->sums;
238 sums->file_offset = start;
239 sums->len = len;
240 INIT_LIST_HEAD(&sums->list);
241 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
242 BUG_ON(!ordered);
243
244 while(len > 0) {
245 if (start >= ordered->file_offset + ordered->len ||
246 start < ordered->file_offset) {
247 sums->len = this_sum_bytes;
248 this_sum_bytes = 0;
249 btrfs_add_ordered_sum(inode, ordered, sums);
250 btrfs_put_ordered_extent(ordered);
251
252 sums = kzalloc(btrfs_ordered_sum_size(root, len),
253 GFP_NOFS);
254 BUG_ON(!sums);
255 sector_sum = sums->sums;
256 sums->len = len;
257 sums->file_offset = start;
258 ordered = btrfs_lookup_ordered_extent(inode,
259 sums->file_offset);
260 BUG_ON(!ordered);
261 }
262
263 page = find_get_page(inode->i_mapping,
264 start >> PAGE_CACHE_SHIFT);
265
266 data = kmap_atomic(page, KM_USER0);
267 sector_sum->sum = ~(u32)0;
268 sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
269 PAGE_CACHE_SIZE);
270 kunmap_atomic(data, KM_USER0);
271 btrfs_csum_final(sector_sum->sum,
272 (char *)&sector_sum->sum);
273 sector_sum->offset = page_offset(page);
274 page_cache_release(page);
275
276 sector_sum++;
277 total_bytes += PAGE_CACHE_SIZE;
278 this_sum_bytes += PAGE_CACHE_SIZE;
279 start += PAGE_CACHE_SIZE;
280
281 WARN_ON(len < PAGE_CACHE_SIZE);
282 len -= PAGE_CACHE_SIZE;
283 }
284 btrfs_add_ordered_sum(inode, ordered, sums);
285 btrfs_put_ordered_extent(ordered);
286 return 0;
287}
288
216int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 289int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
217 struct bio *bio) 290 struct bio *bio)
218{ 291{
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add..0aa15436590 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
95 } 95 }
96} 96}
97 97
98/* this does all the hard work for inserting an inline extent into
99 * the btree. Any existing inline extent is extended as required to make room,
100 * otherwise things are inserted as required into the btree
101 */
102static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root, struct inode *inode,
104 u64 offset, size_t size,
105 struct page **pages, size_t page_offset,
106 int num_pages)
107{
108 struct btrfs_key key;
109 struct btrfs_path *path;
110 struct extent_buffer *leaf;
111 char *kaddr;
112 unsigned long ptr;
113 struct btrfs_file_extent_item *ei;
114 struct page *page;
115 u32 datasize;
116 int err = 0;
117 int ret;
118 int i;
119 ssize_t cur_size;
120
121 path = btrfs_alloc_path();
122 if (!path)
123 return -ENOMEM;
124
125 btrfs_set_trans_block_group(trans, inode);
126
127 key.objectid = inode->i_ino;
128 key.offset = offset;
129 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
130
131 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
132 if (ret < 0) {
133 err = ret;
134 goto fail;
135 }
136 if (ret == 1) {
137 struct btrfs_key found_key;
138
139 if (path->slots[0] == 0)
140 goto insert;
141
142 path->slots[0]--;
143 leaf = path->nodes[0];
144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
145
146 if (found_key.objectid != inode->i_ino)
147 goto insert;
148
149 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
150 goto insert;
151 ei = btrfs_item_ptr(leaf, path->slots[0],
152 struct btrfs_file_extent_item);
153
154 if (btrfs_file_extent_type(leaf, ei) !=
155 BTRFS_FILE_EXTENT_INLINE) {
156 goto insert;
157 }
158 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
159 ret = 0;
160 }
161 if (ret == 0) {
162 u32 found_size;
163 u64 found_end;
164
165 leaf = path->nodes[0];
166 ei = btrfs_item_ptr(leaf, path->slots[0],
167 struct btrfs_file_extent_item);
168
169 if (btrfs_file_extent_type(leaf, ei) !=
170 BTRFS_FILE_EXTENT_INLINE) {
171 err = ret;
172 btrfs_print_leaf(root, leaf);
173 printk("found wasn't inline offset %Lu inode %lu\n",
174 offset, inode->i_ino);
175 goto fail;
176 }
177 found_size = btrfs_file_extent_inline_len(leaf,
178 btrfs_item_nr(leaf, path->slots[0]));
179 found_end = key.offset + found_size;
180
181 if (found_end < offset + size) {
182 btrfs_release_path(root, path);
183 ret = btrfs_search_slot(trans, root, &key, path,
184 offset + size - found_end, 1);
185 BUG_ON(ret != 0);
186
187 ret = btrfs_extend_item(trans, root, path,
188 offset + size - found_end);
189 if (ret) {
190 err = ret;
191 goto fail;
192 }
193 leaf = path->nodes[0];
194 ei = btrfs_item_ptr(leaf, path->slots[0],
195 struct btrfs_file_extent_item);
196 inode_add_bytes(inode, offset + size - found_end);
197 }
198 if (found_end < offset) {
199 ptr = btrfs_file_extent_inline_start(ei) + found_size;
200 memset_extent_buffer(leaf, 0, ptr, offset - found_end);
201 }
202 } else {
203insert:
204 btrfs_release_path(root, path);
205 datasize = offset + size - key.offset;
206 inode_add_bytes(inode, datasize);
207 datasize = btrfs_file_extent_calc_inline_size(datasize);
208 ret = btrfs_insert_empty_item(trans, root, path, &key,
209 datasize);
210 if (ret) {
211 err = ret;
212 printk("got bad ret %d\n", ret);
213 goto fail;
214 }
215 leaf = path->nodes[0];
216 ei = btrfs_item_ptr(leaf, path->slots[0],
217 struct btrfs_file_extent_item);
218 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
219 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
220 }
221 ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
222
223 cur_size = size;
224 i = 0;
225 while (size > 0) {
226 page = pages[i];
227 kaddr = kmap_atomic(page, KM_USER0);
228 cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
229 write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
230 kunmap_atomic(kaddr, KM_USER0);
231 page_offset = 0;
232 ptr += cur_size;
233 size -= cur_size;
234 if (i >= num_pages) {
235 printk("i %d num_pages %d\n", i, num_pages);
236 }
237 i++;
238 }
239 btrfs_mark_buffer_dirty(leaf);
240fail:
241 btrfs_free_path(path);
242 return err;
243}
244
245/* 98/*
246 * after copy_from_user, pages need to be dirtied and we need to make 99 * after copy_from_user, pages need to be dirtied and we need to make
247 * sure holes are created between the current EOF and the start of 100 * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
267 u64 start_pos; 120 u64 start_pos;
268 u64 end_of_last_block; 121 u64 end_of_last_block;
269 u64 end_pos = pos + write_bytes; 122 u64 end_pos = pos + write_bytes;
270 u64 inline_size;
271 int did_inline = 0;
272 loff_t isize = i_size_read(inode); 123 loff_t isize = i_size_read(inode);
273 124
274 start_pos = pos & ~((u64)root->sectorsize - 1); 125 start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
314 err = btrfs_insert_file_extent(trans, root, 165 err = btrfs_insert_file_extent(trans, root,
315 inode->i_ino, 166 inode->i_ino,
316 last_pos_in_file, 167 last_pos_in_file,
317 0, 0, hole_size, 0); 168 0, 0, hole_size, 0,
169 hole_size, 0, 0, 0);
318 btrfs_drop_extent_cache(inode, last_pos_in_file, 170 btrfs_drop_extent_cache(inode, last_pos_in_file,
319 last_pos_in_file + hole_size - 1, 0); 171 last_pos_in_file + hole_size - 1, 0);
320 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 172 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
324 goto failed; 176 goto failed;
325 } 177 }
326 178
327 /* 179 /* check for reserved extents on each page, we don't want
328 * either allocate an extent for the new bytes or setup the key 180 * to reset the delalloc bit on things that already have
329 * to show we are doing inline data in the extent 181 * extents reserved.
330 */ 182 */
331 inline_size = end_pos; 183 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
332 if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 184 for (i = 0; i < num_pages; i++) {
333 inline_size > root->fs_info->max_inline || 185 struct page *p = pages[i];
334 (inline_size & (root->sectorsize -1)) == 0 || 186 SetPageUptodate(p);
335 inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { 187 ClearPageChecked(p);
336 /* check for reserved extents on each page, we don't want 188 set_page_dirty(p);
337 * to reset the delalloc bit on things that already have
338 * extents reserved.
339 */
340 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
341 for (i = 0; i < num_pages; i++) {
342 struct page *p = pages[i];
343 SetPageUptodate(p);
344 ClearPageChecked(p);
345 set_page_dirty(p);
346 }
347 } else {
348 u64 aligned_end;
349 /* step one, delete the existing extents in this range */
350 aligned_end = (pos + write_bytes + root->sectorsize - 1) &
351 ~((u64)root->sectorsize - 1);
352 mutex_lock(&BTRFS_I(inode)->extent_mutex);
353 err = btrfs_drop_extents(trans, root, inode, start_pos,
354 aligned_end, aligned_end, &hint_byte);
355 if (err)
356 goto failed;
357 if (isize > inline_size)
358 inline_size = min_t(u64, isize, aligned_end);
359 inline_size -= start_pos;
360 err = insert_inline_extent(trans, root, inode, start_pos,
361 inline_size, pages, 0, num_pages);
362 btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
363 BUG_ON(err);
364 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
365
366 /*
367 * an ugly way to do all the prop accounting around
368 * the page bits and mapping tags
369 */
370 set_page_writeback(pages[0]);
371 end_page_writeback(pages[0]);
372 did_inline = 1;
373 } 189 }
374 if (end_pos > isize) { 190 if (end_pos > isize) {
375 i_size_write(inode, end_pos); 191 i_size_write(inode, end_pos);
376 if (did_inline)
377 BTRFS_I(inode)->disk_i_size = end_pos;
378 btrfs_update_inode(trans, root, inode); 192 btrfs_update_inode(trans, root, inode);
379 } 193 }
380failed: 194failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
399 int ret; 213 int ret;
400 int testend = 1; 214 int testend = 1;
401 unsigned long flags; 215 unsigned long flags;
216 int compressed = 0;
402 217
403 WARN_ON(end < start); 218 WARN_ON(end < start);
404 if (end == (u64)-1) { 219 if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
434 free_extent_map(em); 249 free_extent_map(em);
435 continue; 250 continue;
436 } 251 }
252 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
437 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 253 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
438 remove_extent_mapping(em_tree, em); 254 remove_extent_mapping(em_tree, em);
439 255
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
442 split->start = em->start; 258 split->start = em->start;
443 split->len = start - em->start; 259 split->len = start - em->start;
444 split->block_start = em->block_start; 260 split->block_start = em->block_start;
261
262 if (compressed)
263 split->block_len = em->block_len;
264 else
265 split->block_len = split->len;
266
445 split->bdev = em->bdev; 267 split->bdev = em->bdev;
446 split->flags = flags; 268 split->flags = flags;
447 ret = add_extent_mapping(em_tree, split); 269 ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
459 split->bdev = em->bdev; 281 split->bdev = em->bdev;
460 split->flags = flags; 282 split->flags = flags;
461 283
462 split->block_start = em->block_start + diff; 284 if (compressed) {
285 split->block_len = em->block_len;
286 split->block_start = em->block_start;
287 } else {
288 split->block_len = split->len;
289 split->block_start = em->block_start + diff;
290 }
463 291
464 ret = add_extent_mapping(em_tree, split); 292 ret = add_extent_mapping(em_tree, split);
465 BUG_ON(ret); 293 BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
533 struct btrfs_item *item; 361 struct btrfs_item *item;
534 item = btrfs_item_nr(leaf, slot); 362 item = btrfs_item_nr(leaf, slot);
535 extent_end = found_key.offset + 363 extent_end = found_key.offset +
536 btrfs_file_extent_inline_len(leaf, item); 364 btrfs_file_extent_inline_len(leaf, extent);
537 extent_end = (extent_end + root->sectorsize - 1) & 365 extent_end = (extent_end + root->sectorsize - 1) &
538 ~((u64)root->sectorsize -1 ); 366 ~((u64)root->sectorsize -1 );
539 } 367 }
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
573 u64 extent_end = 0; 401 u64 extent_end = 0;
574 u64 search_start = start; 402 u64 search_start = start;
575 u64 leaf_start; 403 u64 leaf_start;
404 u64 ram_bytes = 0;
405 u8 compression = 0;
406 u8 encryption = 0;
407 u16 other_encoding = 0;
576 u64 root_gen; 408 u64 root_gen;
577 u64 root_owner; 409 u64 root_owner;
578 struct extent_buffer *leaf; 410 struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
589 int recow; 421 int recow;
590 int ret; 422 int ret;
591 423
424 inline_limit = 0;
592 btrfs_drop_extent_cache(inode, start, end - 1, 0); 425 btrfs_drop_extent_cache(inode, start, end - 1, 0);
593 426
594 path = btrfs_alloc_path(); 427 path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
637 extent = btrfs_item_ptr(leaf, slot, 470 extent = btrfs_item_ptr(leaf, slot,
638 struct btrfs_file_extent_item); 471 struct btrfs_file_extent_item);
639 found_type = btrfs_file_extent_type(leaf, extent); 472 found_type = btrfs_file_extent_type(leaf, extent);
473 compression = btrfs_file_extent_compression(leaf,
474 extent);
475 encryption = btrfs_file_extent_encryption(leaf,
476 extent);
477 other_encoding = btrfs_file_extent_other_encoding(leaf,
478 extent);
640 if (found_type == BTRFS_FILE_EXTENT_REG) { 479 if (found_type == BTRFS_FILE_EXTENT_REG) {
641 extent_end = 480 extent_end =
642 btrfs_file_extent_disk_bytenr(leaf, 481 btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
646 485
647 extent_end = key.offset + 486 extent_end = key.offset +
648 btrfs_file_extent_num_bytes(leaf, extent); 487 btrfs_file_extent_num_bytes(leaf, extent);
488 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
489 extent);
649 found_extent = 1; 490 found_extent = 1;
650 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 491 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
651 struct btrfs_item *item;
652 item = btrfs_item_nr(leaf, slot);
653 found_inline = 1; 492 found_inline = 1;
654 extent_end = key.offset + 493 extent_end = key.offset +
655 btrfs_file_extent_inline_len(leaf, item); 494 btrfs_file_extent_inline_len(leaf, extent);
656 } 495 }
657 } else { 496 } else {
658 extent_end = search_start; 497 extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
680 search_start = (extent_end + mask) & ~mask; 519 search_start = (extent_end + mask) & ~mask;
681 } else 520 } else
682 search_start = extent_end; 521 search_start = extent_end;
683 if (end <= extent_end && start >= key.offset && found_inline) { 522
523 if (end <= extent_end && start >= key.offset && found_inline)
684 *hint_byte = EXTENT_MAP_INLINE; 524 *hint_byte = EXTENT_MAP_INLINE;
685 goto out;
686 }
687 525
688 if (found_extent) { 526 if (found_extent) {
689 read_extent_buffer(leaf, &old, (unsigned long)extent, 527 read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
770 write_extent_buffer(leaf, &old, 608 write_extent_buffer(leaf, &old,
771 (unsigned long)extent, sizeof(old)); 609 (unsigned long)extent, sizeof(old));
772 610
611 btrfs_set_file_extent_compression(leaf, extent,
612 compression);
613 btrfs_set_file_extent_encryption(leaf, extent,
614 encryption);
615 btrfs_set_file_extent_other_encoding(leaf, extent,
616 other_encoding);
773 btrfs_set_file_extent_offset(leaf, extent, 617 btrfs_set_file_extent_offset(leaf, extent,
774 le64_to_cpu(old.offset) + end - key.offset); 618 le64_to_cpu(old.offset) + end - key.offset);
775 WARN_ON(le64_to_cpu(old.num_bytes) < 619 WARN_ON(le64_to_cpu(old.num_bytes) <
776 (extent_end - end)); 620 (extent_end - end));
777 btrfs_set_file_extent_num_bytes(leaf, extent, 621 btrfs_set_file_extent_num_bytes(leaf, extent,
778 extent_end - end); 622 extent_end - end);
623
624 /*
625 * set the ram bytes to the size of the full extent
626 * before splitting. This is a worst case flag,
627 * but its the best we can do because we don't know
628 * how splitting affects compression
629 */
630 btrfs_set_file_extent_ram_bytes(leaf, extent,
631 ram_bytes);
779 btrfs_set_file_extent_type(leaf, extent, 632 btrfs_set_file_extent_type(leaf, extent,
780 BTRFS_FILE_EXTENT_REG); 633 BTRFS_FILE_EXTENT_REG);
781 634
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d..9797592dc86 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
49#include "compat.h" 49#include "compat.h"
50#include "tree-log.h" 50#include "tree-log.h"
51#include "ref-cache.h" 51#include "ref-cache.h"
52#include "compression.h"
52 53
53struct btrfs_iget_args { 54struct btrfs_iget_args {
54 u64 ino; 55 u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
83}; 84};
84 85
85static void btrfs_truncate(struct inode *inode); 86static void btrfs_truncate(struct inode *inode);
87static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
86 88
87/* 89/*
88 * a very lame attempt at stopping writes when the FS is 85% full. There 90 * a very lame attempt at stopping writes when the FS is 85% full. There
@@ -114,57 +116,374 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
114} 116}
115 117
116/* 118/*
119 * this does all the hard work for inserting an inline extent into
120 * the btree. The caller should have done a btrfs_drop_extents so that
121 * no overlapping inline items exist in the btree
122 */
123static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
124 struct btrfs_root *root, struct inode *inode,
125 u64 start, size_t size, size_t compressed_size,
126 struct page **compressed_pages)
127{
128 struct btrfs_key key;
129 struct btrfs_path *path;
130 struct extent_buffer *leaf;
131 struct page *page = NULL;
132 char *kaddr;
133 unsigned long ptr;
134 struct btrfs_file_extent_item *ei;
135 int err = 0;
136 int ret;
137 size_t cur_size = size;
138 size_t datasize;
139 unsigned long offset;
140 int use_compress = 0;
141
142 if (compressed_size && compressed_pages) {
143 use_compress = 1;
144 cur_size = compressed_size;
145 }
146
147 path = btrfs_alloc_path(); if (!path)
148 return -ENOMEM;
149
150 btrfs_set_trans_block_group(trans, inode);
151
152 key.objectid = inode->i_ino;
153 key.offset = start;
154 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
155 inode_add_bytes(inode, size);
156 datasize = btrfs_file_extent_calc_inline_size(cur_size);
157
158 inode_add_bytes(inode, size);
159 ret = btrfs_insert_empty_item(trans, root, path, &key,
160 datasize);
161 BUG_ON(ret);
162 if (ret) {
163 err = ret;
164 printk("got bad ret %d\n", ret);
165 goto fail;
166 }
167 leaf = path->nodes[0];
168 ei = btrfs_item_ptr(leaf, path->slots[0],
169 struct btrfs_file_extent_item);
170 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
171 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
172 btrfs_set_file_extent_encryption(leaf, ei, 0);
173 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
174 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
175 ptr = btrfs_file_extent_inline_start(ei);
176
177 if (use_compress) {
178 struct page *cpage;
179 int i = 0;
180 while(compressed_size > 0) {
181 cpage = compressed_pages[i];
182 cur_size = min(compressed_size,
183 PAGE_CACHE_SIZE);
184
185 kaddr = kmap(cpage);
186 write_extent_buffer(leaf, kaddr, ptr, cur_size);
187 kunmap(cpage);
188
189 i++;
190 ptr += cur_size;
191 compressed_size -= cur_size;
192 }
193 btrfs_set_file_extent_compression(leaf, ei,
194 BTRFS_COMPRESS_ZLIB);
195 } else {
196 page = find_get_page(inode->i_mapping,
197 start >> PAGE_CACHE_SHIFT);
198 btrfs_set_file_extent_compression(leaf, ei, 0);
199 kaddr = kmap_atomic(page, KM_USER0);
200 offset = start & (PAGE_CACHE_SIZE - 1);
201 write_extent_buffer(leaf, kaddr + offset, ptr, size);
202 kunmap_atomic(kaddr, KM_USER0);
203 page_cache_release(page);
204 }
205 btrfs_mark_buffer_dirty(leaf);
206 btrfs_free_path(path);
207
208 BTRFS_I(inode)->disk_i_size = inode->i_size;
209 btrfs_update_inode(trans, root, inode);
210 return 0;
211fail:
212 btrfs_free_path(path);
213 return err;
214}
215
216
217/*
218 * conditionally insert an inline extent into the file. This
219 * does the checks required to make sure the data is small enough
220 * to fit as an inline extent.
221 */
222static int cow_file_range_inline(struct btrfs_trans_handle *trans,
223 struct btrfs_root *root,
224 struct inode *inode, u64 start, u64 end,
225 size_t compressed_size,
226 struct page **compressed_pages)
227{
228 u64 isize = i_size_read(inode);
229 u64 actual_end = min(end + 1, isize);
230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len;
235 int ret;
236
237 if (compressed_size)
238 data_len = compressed_size;
239
240 if (start > 0 ||
241 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
242 (!compressed_size &&
243 (actual_end & (root->sectorsize - 1)) == 0) ||
244 end + 1 < isize ||
245 data_len > root->fs_info->max_inline) {
246 return 1;
247 }
248
249 mutex_lock(&BTRFS_I(inode)->extent_mutex);
250 ret = btrfs_drop_extents(trans, root, inode, start,
251 aligned_end, aligned_end, &hint_byte);
252 BUG_ON(ret);
253
254 if (isize > actual_end)
255 inline_len = min_t(u64, isize, actual_end);
256 ret = insert_inline_extent(trans, root, inode, start,
257 inline_len, compressed_size,
258 compressed_pages);
259 BUG_ON(ret);
260 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
261 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
262 return 0;
263}
264
265/*
117 * when extent_io.c finds a delayed allocation range in the file, 266 * when extent_io.c finds a delayed allocation range in the file,
118 * the call backs end up in this code. The basic idea is to 267 * the call backs end up in this code. The basic idea is to
119 * allocate extents on disk for the range, and create ordered data structs 268 * allocate extents on disk for the range, and create ordered data structs
120 * in ram to track those extents. 269 * in ram to track those extents.
270 *
271 * locked_page is the page that writepage had locked already. We use
272 * it to make sure we don't do extra locks or unlocks.
273 *
274 * *page_started is set to one if we unlock locked_page and do everything
275 * required to start IO on it. It may be clean and already done with
276 * IO when we return.
121 */ 277 */
122static int cow_file_range(struct inode *inode, u64 start, u64 end) 278static int cow_file_range(struct inode *inode, struct page *locked_page,
279 u64 start, u64 end, int *page_started)
123{ 280{
124 struct btrfs_root *root = BTRFS_I(inode)->root; 281 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct btrfs_trans_handle *trans; 282 struct btrfs_trans_handle *trans;
126 u64 alloc_hint = 0; 283 u64 alloc_hint = 0;
127 u64 num_bytes; 284 u64 num_bytes;
285 unsigned long ram_size;
286 u64 orig_start;
287 u64 disk_num_bytes;
128 u64 cur_alloc_size; 288 u64 cur_alloc_size;
129 u64 blocksize = root->sectorsize; 289 u64 blocksize = root->sectorsize;
130 u64 orig_num_bytes; 290 u64 actual_end;
131 struct btrfs_key ins; 291 struct btrfs_key ins;
132 struct extent_map *em; 292 struct extent_map *em;
133 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 293 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
134 int ret = 0; 294 int ret = 0;
295 struct page **pages = NULL;
296 unsigned long nr_pages;
297 unsigned long nr_pages_ret = 0;
298 unsigned long total_compressed = 0;
299 unsigned long total_in = 0;
300 unsigned long max_compressed = 128 * 1024;
301 unsigned long max_uncompressed = 256 * 1024;
302 int i;
303 int will_compress;
135 304
136 trans = btrfs_join_transaction(root, 1); 305 trans = btrfs_join_transaction(root, 1);
137 BUG_ON(!trans); 306 BUG_ON(!trans);
138 btrfs_set_trans_block_group(trans, inode); 307 btrfs_set_trans_block_group(trans, inode);
308 orig_start = start;
309
310 /*
311 * compression made this loop a bit ugly, but the basic idea is to
312 * compress some pages but keep the total size of the compressed
313 * extent relatively small. If compression is off, this goto target
314 * is never used.
315 */
316again:
317 will_compress = 0;
318 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
319 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
139 320
321 actual_end = min_t(u64, i_size_read(inode), end + 1);
322 total_compressed = actual_end - start;
323
324 /* we want to make sure that amount of ram required to uncompress
325 * an extent is reasonable, so we limit the total size in ram
326 * of a compressed extent to 256k
327 */
328 total_compressed = min(total_compressed, max_uncompressed);
140 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 329 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
141 num_bytes = max(blocksize, num_bytes); 330 num_bytes = max(blocksize, num_bytes);
142 orig_num_bytes = num_bytes; 331 disk_num_bytes = num_bytes;
332 total_in = 0;
333 ret = 0;
143 334
144 if (alloc_hint == EXTENT_MAP_INLINE) 335 /* we do compression for mount -o compress and when the
145 goto out; 336 * inode has not been flagged as nocompress
337 */
338 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
339 btrfs_test_opt(root, COMPRESS)) {
340 WARN_ON(pages);
341 pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
342
343 /* we want to make sure the amount of IO required to satisfy
344 * a random read is reasonably small, so we limit the size
345 * of a compressed extent to 128k
346 */
347 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
348 total_compressed, pages,
349 nr_pages, &nr_pages_ret,
350 &total_in,
351 &total_compressed,
352 max_compressed);
353
354 if (!ret) {
355 unsigned long offset = total_compressed &
356 (PAGE_CACHE_SIZE - 1);
357 struct page *page = pages[nr_pages_ret - 1];
358 char *kaddr;
359
360 /* zero the tail end of the last page, we might be
361 * sending it down to disk
362 */
363 if (offset) {
364 kaddr = kmap_atomic(page, KM_USER0);
365 memset(kaddr + offset, 0,
366 PAGE_CACHE_SIZE - offset);
367 kunmap_atomic(kaddr, KM_USER0);
368 }
369 will_compress = 1;
370 }
371 }
372 if (start == 0) {
373 /* lets try to make an inline extent */
374 if (ret || total_in < (end - start + 1)) {
375 /* we didn't compress the entire range, try
376 * to make an uncompressed inline extent. This
377 * is almost sure to fail, but maybe inline sizes
378 * will get bigger later
379 */
380 ret = cow_file_range_inline(trans, root, inode,
381 start, end, 0, NULL);
382 } else {
383 ret = cow_file_range_inline(trans, root, inode,
384 start, end,
385 total_compressed, pages);
386 }
387 if (ret == 0) {
388 extent_clear_unlock_delalloc(inode,
389 &BTRFS_I(inode)->io_tree,
390 start, end, NULL,
391 1, 1, 1);
392 *page_started = 1;
393 ret = 0;
394 goto free_pages_out;
395 }
396 }
397
398 if (will_compress) {
399 /*
400 * we aren't doing an inline extent, so round the compressed size
401 * up to a block size boundary so the allocator does sane
402 * things
403 */
404 total_compressed = (total_compressed + blocksize - 1) &
405 ~(blocksize - 1);
406
407 /*
408 * one last check to make sure the compression is really a
409 * win, compare the page count read with the blocks on disk
410 */
411 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
412 ~(PAGE_CACHE_SIZE - 1);
413 if (total_compressed >= total_in) {
414 will_compress = 0;
415 } else {
416 disk_num_bytes = total_compressed;
417 num_bytes = total_in;
418 }
419 }
420 if (!will_compress && pages) {
421 /*
422 * the compression code ran but failed to make things smaller,
423 * free any pages it allocated and our page pointer array
424 */
425 for (i = 0; i < nr_pages_ret; i++) {
426 page_cache_release(pages[i]);
427 }
428 kfree(pages);
429 pages = NULL;
430 total_compressed = 0;
431 nr_pages_ret = 0;
432
433 /* flag the file so we don't compress in the future */
434 btrfs_set_flag(inode, NOCOMPRESS);
435 }
436
437 BUG_ON(disk_num_bytes >
438 btrfs_super_total_bytes(&root->fs_info->super_copy));
146 439
147 BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
148 mutex_lock(&BTRFS_I(inode)->extent_mutex); 440 mutex_lock(&BTRFS_I(inode)->extent_mutex);
149 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 441 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
150 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 442 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
151 443
152 while(num_bytes > 0) { 444 while(disk_num_bytes > 0) {
153 cur_alloc_size = min(num_bytes, root->fs_info->max_extent); 445 unsigned long min_bytes;
446
447 /*
448 * the max size of a compressed extent is pretty small,
449 * make the code a little less complex by forcing
450 * the allocator to find a whole compressed extent at once
451 */
452 if (will_compress)
453 min_bytes = disk_num_bytes;
454 else
455 min_bytes = root->sectorsize;
456
457 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
154 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 458 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
155 root->sectorsize, 0, alloc_hint, 459 min_bytes, 0, alloc_hint,
156 (u64)-1, &ins, 1); 460 (u64)-1, &ins, 1);
157 if (ret) { 461 if (ret) {
158 WARN_ON(1); 462 WARN_ON(1);
159 goto out; 463 goto free_pages_out_fail;
160 } 464 }
161 em = alloc_extent_map(GFP_NOFS); 465 em = alloc_extent_map(GFP_NOFS);
162 em->start = start; 466 em->start = start;
163 em->len = ins.offset; 467
468 if (will_compress) {
469 ram_size = num_bytes;
470 em->len = num_bytes;
471 } else {
472 /* ramsize == disk size */
473 ram_size = ins.offset;
474 em->len = ins.offset;
475 }
476
164 em->block_start = ins.objectid; 477 em->block_start = ins.objectid;
478 em->block_len = ins.offset;
165 em->bdev = root->fs_info->fs_devices->latest_bdev; 479 em->bdev = root->fs_info->fs_devices->latest_bdev;
480
166 mutex_lock(&BTRFS_I(inode)->extent_mutex); 481 mutex_lock(&BTRFS_I(inode)->extent_mutex);
167 set_bit(EXTENT_FLAG_PINNED, &em->flags); 482 set_bit(EXTENT_FLAG_PINNED, &em->flags);
483
484 if (will_compress)
485 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
486
168 while(1) { 487 while(1) {
169 spin_lock(&em_tree->lock); 488 spin_lock(&em_tree->lock);
170 ret = add_extent_mapping(em_tree, em); 489 ret = add_extent_mapping(em_tree, em);
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
174 break; 493 break;
175 } 494 }
176 btrfs_drop_extent_cache(inode, start, 495 btrfs_drop_extent_cache(inode, start,
177 start + ins.offset - 1, 0); 496 start + ram_size - 1, 0);
178 } 497 }
179 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 498 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
180 499
181 cur_alloc_size = ins.offset; 500 cur_alloc_size = ins.offset;
182 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 501 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
183 ins.offset, 0); 502 ram_size, cur_alloc_size, 0,
503 will_compress);
184 BUG_ON(ret); 504 BUG_ON(ret);
185 if (num_bytes < cur_alloc_size) { 505
186 printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes, 506 if (disk_num_bytes < cur_alloc_size) {
507 printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
187 cur_alloc_size); 508 cur_alloc_size);
188 break; 509 break;
189 } 510 }
511
512 if (will_compress) {
513 /*
514 * we're doing compression, and we need to
515 * submit the compressed extents down to the device.
516 *
517 * We lock down all the file pages, clearing their
518 * dirty bits and setting them writeback. Everyone
519 * that wants to modify the page will wait on the
520 * ordered extent above.
521 *
522 * The writeback bits on the file pages are
523 * cleared when the compressed pages are on disk
524 */
525 btrfs_end_transaction(trans, root);
526
527 if (start <= page_offset(locked_page) &&
528 page_offset(locked_page) < start + ram_size) {
529 *page_started = 1;
530 }
531
532 extent_clear_unlock_delalloc(inode,
533 &BTRFS_I(inode)->io_tree,
534 start,
535 start + ram_size - 1,
536 NULL, 1, 1, 0);
537
538 ret = btrfs_submit_compressed_write(inode, start,
539 ram_size, ins.objectid,
540 cur_alloc_size, pages,
541 nr_pages_ret);
542
543 BUG_ON(ret);
544 trans = btrfs_join_transaction(root, 1);
545 if (start + ram_size < end) {
546 start += ram_size;
547 alloc_hint = ins.objectid + ins.offset;
548 /* pages will be freed at end_bio time */
549 pages = NULL;
550 goto again;
551 } else {
552 /* we've written everything, time to go */
553 break;
554 }
555 }
556 /* we're not doing compressed IO, don't unlock the first
557 * page (which the caller expects to stay locked), don't
558 * clear any dirty bits and don't set any writeback bits
559 */
560 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
561 start, start + ram_size - 1,
562 locked_page, 0, 0, 0);
563 disk_num_bytes -= cur_alloc_size;
190 num_bytes -= cur_alloc_size; 564 num_bytes -= cur_alloc_size;
191 alloc_hint = ins.objectid + ins.offset; 565 alloc_hint = ins.objectid + ins.offset;
192 start += cur_alloc_size; 566 start += cur_alloc_size;
193 } 567 }
568
569 ret = 0;
194out: 570out:
195 btrfs_end_transaction(trans, root); 571 btrfs_end_transaction(trans, root);
572
196 return ret; 573 return ret;
574
575free_pages_out_fail:
576 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
577 start, end, locked_page, 0, 0, 0);
578free_pages_out:
579 for (i = 0; i < nr_pages_ret; i++)
580 page_cache_release(pages[i]);
581 if (pages)
582 kfree(pages);
583
584 goto out;
197} 585}
198 586
199/* 587/*
@@ -203,7 +591,8 @@ out:
203 * If no cow copies or snapshots exist, we write directly to the existing 591 * If no cow copies or snapshots exist, we write directly to the existing
204 * blocks on disk 592 * blocks on disk
205 */ 593 */
206static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) 594static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
595 u64 start, u64 end, int *page_started)
207{ 596{
208 u64 extent_start; 597 u64 extent_start;
209 u64 extent_end; 598 u64 extent_end;
@@ -260,6 +649,11 @@ again:
260 extent_end = extent_start + extent_num_bytes; 649 extent_end = extent_start + extent_num_bytes;
261 err = 0; 650 err = 0;
262 651
652 if (btrfs_file_extent_compression(leaf, item) ||
653 btrfs_file_extent_encryption(leaf,item) ||
654 btrfs_file_extent_other_encoding(leaf, item))
655 goto not_found;
656
263 if (loops && start != extent_start) 657 if (loops && start != extent_start)
264 goto not_found; 658 goto not_found;
265 659
@@ -284,7 +678,8 @@ again:
284 bytenr += btrfs_file_extent_offset(leaf, item); 678 bytenr += btrfs_file_extent_offset(leaf, item);
285 extent_num_bytes = min(end + 1, extent_end) - start; 679 extent_num_bytes = min(end + 1, extent_end) - start;
286 ret = btrfs_add_ordered_extent(inode, start, bytenr, 680 ret = btrfs_add_ordered_extent(inode, start, bytenr,
287 extent_num_bytes, 1); 681 extent_num_bytes,
682 extent_num_bytes, 1, 0);
288 if (ret) { 683 if (ret) {
289 err = ret; 684 err = ret;
290 goto out; 685 goto out;
@@ -300,7 +695,8 @@ again:
300not_found: 695not_found:
301 btrfs_end_transaction(trans, root); 696 btrfs_end_transaction(trans, root);
302 btrfs_free_path(path); 697 btrfs_free_path(path);
303 return cow_file_range(inode, start, end); 698 return cow_file_range(inode, locked_page, start, end,
699 page_started);
304 } 700 }
305out: 701out:
306 WARN_ON(err); 702 WARN_ON(err);
@@ -312,16 +708,19 @@ out:
312/* 708/*
313 * extent_io.c call back to do delayed allocation processing 709 * extent_io.c call back to do delayed allocation processing
314 */ 710 */
315static int run_delalloc_range(struct inode *inode, u64 start, u64 end) 711static int run_delalloc_range(struct inode *inode, struct page *locked_page,
712 u64 start, u64 end, int *page_started)
316{ 713{
317 struct btrfs_root *root = BTRFS_I(inode)->root; 714 struct btrfs_root *root = BTRFS_I(inode)->root;
318 int ret; 715 int ret;
319 716
320 if (btrfs_test_opt(root, NODATACOW) || 717 if (btrfs_test_opt(root, NODATACOW) ||
321 btrfs_test_flag(inode, NODATACOW)) 718 btrfs_test_flag(inode, NODATACOW))
322 ret = run_delalloc_nocow(inode, start, end); 719 ret = run_delalloc_nocow(inode, locked_page, start, end,
720 page_started);
323 else 721 else
324 ret = cow_file_range(inode, start, end); 722 ret = cow_file_range(inode, locked_page, start, end,
723 page_started);
325 724
326 return ret; 725 return ret;
327} 726}
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
383 * we don't create bios that span stripes or chunks 782 * we don't create bios that span stripes or chunks
384 */ 783 */
385int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 784int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
386 size_t size, struct bio *bio) 785 size_t size, struct bio *bio,
786 unsigned long bio_flags)
387{ 787{
388 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 788 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
389 struct btrfs_mapping_tree *map_tree; 789 struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
413 * are inserted into the btree 813 * are inserted into the btree
414 */ 814 */
415int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 815int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
416 int mirror_num) 816 int mirror_num, unsigned long bio_flags)
417{ 817{
418 struct btrfs_root *root = BTRFS_I(inode)->root; 818 struct btrfs_root *root = BTRFS_I(inode)->root;
419 int ret = 0; 819 int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
429 * or reading the csums from the tree before a read 829 * or reading the csums from the tree before a read
430 */ 830 */
431int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 831int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
432 int mirror_num) 832 int mirror_num, unsigned long bio_flags)
433{ 833{
434 struct btrfs_root *root = BTRFS_I(inode)->root; 834 struct btrfs_root *root = BTRFS_I(inode)->root;
435 int ret = 0; 835 int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
444 844
445 if (!(rw & (1 << BIO_RW))) { 845 if (!(rw & (1 << BIO_RW))) {
446 btrfs_lookup_bio_sums(root, inode, bio); 846 btrfs_lookup_bio_sums(root, inode, bio);
847
848 if (bio_flags & EXTENT_BIO_COMPRESSED) {
849 return btrfs_submit_compressed_read(inode, bio,
850 mirror_num, bio_flags);
851 }
852
447 goto mapit; 853 goto mapit;
448 } 854 }
449 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 855 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
450 inode, rw, bio, mirror_num, 856 inode, rw, bio, mirror_num,
451 __btrfs_submit_bio_hook); 857 bio_flags, __btrfs_submit_bio_hook);
452mapit: 858mapit:
453 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 859 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
454} 860}
@@ -539,7 +945,7 @@ out_page:
539 * good idea. This causes problems because we want to make sure COW 945 * good idea. This causes problems because we want to make sure COW
540 * properly happens and the data=ordered rules are followed. 946 * properly happens and the data=ordered rules are followed.
541 * 947 *
542 * In our case any range that doesn't have the EXTENT_ORDERED bit set 948 * In our case any range that doesn't have the ORDERED bit set
543 * hasn't been properly setup for IO. We kick off an async process 949 * hasn't been properly setup for IO. We kick off an async process
544 * to fix it up. The async helper will wait for ordered extents, set 950 * to fix it up. The async helper will wait for ordered extents, set
545 * the delalloc bit and make it safe to write the page. 951 * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
632 btrfs_set_file_extent_disk_bytenr(leaf, extent_item, 1038 btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
633 ordered_extent->start); 1039 ordered_extent->start);
634 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item, 1040 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
635 ordered_extent->len); 1041 ordered_extent->disk_len);
636 btrfs_set_file_extent_offset(leaf, extent_item, 0); 1042 btrfs_set_file_extent_offset(leaf, extent_item, 0);
1043
1044 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1045 btrfs_set_file_extent_compression(leaf, extent_item, 1);
1046 else
1047 btrfs_set_file_extent_compression(leaf, extent_item, 0);
1048 btrfs_set_file_extent_encryption(leaf, extent_item, 0);
1049 btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
1050
1051 /* ram bytes = extent_num_bytes for now */
637 btrfs_set_file_extent_num_bytes(leaf, extent_item, 1052 btrfs_set_file_extent_num_bytes(leaf, extent_item,
638 ordered_extent->len); 1053 ordered_extent->len);
1054 btrfs_set_file_extent_ram_bytes(leaf, extent_item,
1055 ordered_extent->len);
639 btrfs_mark_buffer_dirty(leaf); 1056 btrfs_mark_buffer_dirty(leaf);
640 1057
641 btrfs_drop_extent_cache(inode, ordered_extent->file_offset, 1058 btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
644 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 1061 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
645 1062
646 ins.objectid = ordered_extent->start; 1063 ins.objectid = ordered_extent->start;
647 ins.offset = ordered_extent->len; 1064 ins.offset = ordered_extent->disk_len;
648 ins.type = BTRFS_EXTENT_ITEM_KEY; 1065 ins.type = BTRFS_EXTENT_ITEM_KEY;
649 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1066 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
650 root->root_key.objectid, 1067 root->root_key.objectid,
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
714 int ret; 1131 int ret;
715 int rw; 1132 int rw;
716 u64 logical; 1133 u64 logical;
1134 unsigned long bio_flags = 0;
717 1135
718 ret = get_state_private(failure_tree, start, &private); 1136 ret = get_state_private(failure_tree, start, &private);
719 if (ret) { 1137 if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
738 } 1156 }
739 logical = start - em->start; 1157 logical = start - em->start;
740 logical = em->block_start + logical; 1158 logical = em->block_start + logical;
1159 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1160 bio_flags = EXTENT_BIO_COMPRESSED;
741 failrec->logical = logical; 1161 failrec->logical = logical;
742 free_extent_map(em); 1162 free_extent_map(em);
743 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | 1163 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
781 rw = READ; 1201 rw = READ;
782 1202
783 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1203 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
784 failrec->last_mirror); 1204 failrec->last_mirror,
1205 bio_flags);
785 return 0; 1206 return 0;
786} 1207}
787 1208
@@ -1644,10 +2065,8 @@ search_again:
1644 item_end += 2065 item_end +=
1645 btrfs_file_extent_num_bytes(leaf, fi); 2066 btrfs_file_extent_num_bytes(leaf, fi);
1646 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2067 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1647 struct btrfs_item *item = btrfs_item_nr(leaf,
1648 path->slots[0]);
1649 item_end += btrfs_file_extent_inline_len(leaf, 2068 item_end += btrfs_file_extent_inline_len(leaf,
1650 item); 2069 fi);
1651 } 2070 }
1652 item_end--; 2071 item_end--;
1653 } 2072 }
@@ -1715,7 +2134,14 @@ search_again:
1715 root_owner = btrfs_header_owner(leaf); 2134 root_owner = btrfs_header_owner(leaf);
1716 } 2135 }
1717 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2136 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1718 if (!del_item) { 2137 /*
2138 * we can't truncate inline items that have had
2139 * special encodings
2140 */
2141 if (!del_item &&
2142 btrfs_file_extent_compression(leaf, fi) == 0 &&
2143 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2144 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
1719 u32 size = new_size - found_key.offset; 2145 u32 size = new_size - found_key.offset;
1720 2146
1721 if (root->ref_cows) { 2147 if (root->ref_cows) {
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
1926 err = btrfs_insert_file_extent(trans, root, 2352 err = btrfs_insert_file_extent(trans, root,
1927 inode->i_ino, 2353 inode->i_ino,
1928 hole_start, 0, 0, 2354 hole_start, 0, 0,
1929 hole_size, 0); 2355 hole_size, 0, hole_size,
2356 0, 0, 0);
1930 btrfs_drop_extent_cache(inode, hole_start, 2357 btrfs_drop_extent_cache(inode, hole_start,
1931 (u64)-1, 0); 2358 (u64)-1, 0);
1932 btrfs_check_file(root, inode); 2359 btrfs_check_file(root, inode);
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
2894 start_diff = map_start - em->start; 3321 start_diff = map_start - em->start;
2895 em->start = map_start; 3322 em->start = map_start;
2896 em->len = map_len; 3323 em->len = map_len;
2897 if (em->block_start < EXTENT_MAP_LAST_BYTE) 3324 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3325 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2898 em->block_start += start_diff; 3326 em->block_start += start_diff;
3327 em->block_len -= start_diff;
3328 }
2899 return add_extent_mapping(em_tree, em); 3329 return add_extent_mapping(em_tree, em);
2900} 3330}
2901 3331
3332static noinline int uncompress_inline(struct btrfs_path *path,
3333 struct inode *inode, struct page *page,
3334 size_t pg_offset, u64 extent_offset,
3335 struct btrfs_file_extent_item *item)
3336{
3337 int ret;
3338 struct extent_buffer *leaf = path->nodes[0];
3339 char *tmp;
3340 size_t max_size;
3341 unsigned long inline_size;
3342 unsigned long ptr;
3343
3344 WARN_ON(pg_offset != 0);
3345 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3346 inline_size = btrfs_file_extent_inline_item_len(leaf,
3347 btrfs_item_nr(leaf, path->slots[0]));
3348 tmp = kmalloc(inline_size, GFP_NOFS);
3349 ptr = btrfs_file_extent_inline_start(item);
3350
3351 read_extent_buffer(leaf, tmp, ptr, inline_size);
3352
3353 max_size = min(PAGE_CACHE_SIZE, max_size);
3354 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3355 inline_size, max_size);
3356 if (ret) {
3357 char *kaddr = kmap_atomic(page, KM_USER0);
3358 unsigned long copy_size = min_t(u64,
3359 PAGE_CACHE_SIZE - pg_offset,
3360 max_size - extent_offset);
3361 memset(kaddr + pg_offset, 0, copy_size);
3362 kunmap_atomic(kaddr, KM_USER0);
3363 }
3364 kfree(tmp);
3365 return 0;
3366}
3367
2902/* 3368/*
2903 * a bit scary, this does extent mapping from logical file offset to the disk. 3369 * a bit scary, this does extent mapping from logical file offset to the disk.
2904 * the ugly parts come from merging extents from the disk with the 3370 * the ugly parts come from merging extents from the disk with the
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2927 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3393 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2928 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3394 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2929 struct btrfs_trans_handle *trans = NULL; 3395 struct btrfs_trans_handle *trans = NULL;
3396 int compressed;
2930 3397
2931again: 3398again:
2932 spin_lock(&em_tree->lock); 3399 spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
2951 em->bdev = root->fs_info->fs_devices->latest_bdev; 3418 em->bdev = root->fs_info->fs_devices->latest_bdev;
2952 em->start = EXTENT_MAP_HOLE; 3419 em->start = EXTENT_MAP_HOLE;
2953 em->len = (u64)-1; 3420 em->len = (u64)-1;
3421 em->block_len = (u64)-1;
2954 3422
2955 if (!path) { 3423 if (!path) {
2956 path = btrfs_alloc_path(); 3424 path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
2983 3451
2984 found_type = btrfs_file_extent_type(leaf, item); 3452 found_type = btrfs_file_extent_type(leaf, item);
2985 extent_start = found_key.offset; 3453 extent_start = found_key.offset;
3454 compressed = btrfs_file_extent_compression(leaf, item);
2986 if (found_type == BTRFS_FILE_EXTENT_REG) { 3455 if (found_type == BTRFS_FILE_EXTENT_REG) {
2987 extent_end = extent_start + 3456 extent_end = extent_start +
2988 btrfs_file_extent_num_bytes(leaf, item); 3457 btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
3005 em->block_start = EXTENT_MAP_HOLE; 3474 em->block_start = EXTENT_MAP_HOLE;
3006 goto insert; 3475 goto insert;
3007 } 3476 }
3008 bytenr += btrfs_file_extent_offset(leaf, item);
3009 em->block_start = bytenr;
3010 em->start = extent_start; 3477 em->start = extent_start;
3011 em->len = extent_end - extent_start; 3478 em->len = extent_end - extent_start;
3479 if (compressed) {
3480 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3481 em->block_start = bytenr;
3482 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
3483 item);
3484 } else {
3485 bytenr += btrfs_file_extent_offset(leaf, item);
3486 em->block_start = bytenr;
3487 em->block_len = em->len;
3488 }
3012 goto insert; 3489 goto insert;
3013 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 3490 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3014 u64 page_start; 3491 u64 page_start;
@@ -3018,8 +3495,7 @@ again:
3018 size_t extent_offset; 3495 size_t extent_offset;
3019 size_t copy_size; 3496 size_t copy_size;
3020 3497
3021 size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, 3498 size = btrfs_file_extent_inline_len(leaf, item);
3022 path->slots[0]));
3023 extent_end = (extent_start + size + root->sectorsize - 1) & 3499 extent_end = (extent_start + size + root->sectorsize - 1) &
3024 ~((u64)root->sectorsize - 1); 3500 ~((u64)root->sectorsize - 1);
3025 if (start < extent_start || start >= extent_end) { 3501 if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
3035 } 3511 }
3036 em->block_start = EXTENT_MAP_INLINE; 3512 em->block_start = EXTENT_MAP_INLINE;
3037 3513
3038 if (!page) { 3514 if (!page || create) {
3039 em->start = extent_start; 3515 em->start = extent_start;
3040 em->len = size; 3516 em->len = (size + root->sectorsize - 1) &
3517 ~((u64)root->sectorsize - 1);
3041 goto out; 3518 goto out;
3042 } 3519 }
3043 3520
@@ -3048,11 +3525,22 @@ again:
3048 em->start = extent_start + extent_offset; 3525 em->start = extent_start + extent_offset;
3049 em->len = (copy_size + root->sectorsize - 1) & 3526 em->len = (copy_size + root->sectorsize - 1) &
3050 ~((u64)root->sectorsize - 1); 3527 ~((u64)root->sectorsize - 1);
3051 map = kmap(page); 3528 if (compressed)
3529 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3052 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 3530 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
3053 if (create == 0 && !PageUptodate(page)) { 3531 if (create == 0 && !PageUptodate(page)) {
3054 read_extent_buffer(leaf, map + pg_offset, ptr, 3532 if (btrfs_file_extent_compression(leaf, item) ==
3055 copy_size); 3533 BTRFS_COMPRESS_ZLIB) {
3534 ret = uncompress_inline(path, inode, page,
3535 pg_offset,
3536 extent_offset, item);
3537 BUG_ON(ret);
3538 } else {
3539 map = kmap(page);
3540 read_extent_buffer(leaf, map + pg_offset, ptr,
3541 copy_size);
3542 kunmap(page);
3543 }
3056 flush_dcache_page(page); 3544 flush_dcache_page(page);
3057 } else if (create && PageUptodate(page)) { 3545 } else if (create && PageUptodate(page)) {
3058 if (!trans) { 3546 if (!trans) {
@@ -3063,11 +3551,12 @@ again:
3063 trans = btrfs_join_transaction(root, 1); 3551 trans = btrfs_join_transaction(root, 1);
3064 goto again; 3552 goto again;
3065 } 3553 }
3554 map = kmap(page);
3066 write_extent_buffer(leaf, map + pg_offset, ptr, 3555 write_extent_buffer(leaf, map + pg_offset, ptr,
3067 copy_size); 3556 copy_size);
3557 kunmap(page);
3068 btrfs_mark_buffer_dirty(leaf); 3558 btrfs_mark_buffer_dirty(leaf);
3069 } 3559 }
3070 kunmap(page);
3071 set_extent_uptodate(io_tree, em->start, 3560 set_extent_uptodate(io_tree, em->start,
3072 extent_map_end(em) - 1, GFP_NOFS); 3561 extent_map_end(em) - 1, GFP_NOFS);
3073 goto insert; 3562 goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
3779 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 4268 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
3780 btrfs_set_file_extent_type(leaf, ei, 4269 btrfs_set_file_extent_type(leaf, ei,
3781 BTRFS_FILE_EXTENT_INLINE); 4270 BTRFS_FILE_EXTENT_INLINE);
4271 btrfs_set_file_extent_encryption(leaf, ei, 0);
4272 btrfs_set_file_extent_compression(leaf, ei, 0);
4273 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4274 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4275
3782 ptr = btrfs_file_extent_inline_start(ei); 4276 ptr = btrfs_file_extent_inline_start(ei);
3783 write_extent_buffer(leaf, symname, ptr, name_len); 4277 write_extent_buffer(leaf, symname, ptr, name_len);
3784 btrfs_mark_buffer_dirty(leaf); 4278 btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2eb6caba57c..b5745bb96d4 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
165 * inserted. 165 * inserted.
166 */ 166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, int nocow) 168 u64 start, u64 len, u64 disk_len, int nocow,
169 int compressed)
169{ 170{
170 struct btrfs_ordered_inode_tree *tree; 171 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node; 172 struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
180 entry->file_offset = file_offset; 181 entry->file_offset = file_offset;
181 entry->start = start; 182 entry->start = start;
182 entry->len = len; 183 entry->len = len;
184 entry->disk_len = disk_len;
183 entry->inode = inode; 185 entry->inode = inode;
184 if (nocow) 186 if (nocow)
185 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags); 187 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
188 if (compressed)
189 set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
186 190
187 /* one ref for the tree */ 191 /* one ref for the tree */
188 atomic_set(&entry->refs, 1); 192 atomic_set(&entry->refs, 1);
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
389 * for pdflush to find them 393 * for pdflush to find them
390 */ 394 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); 395 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
392 if (wait) 396 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 397 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags)); 398 &entry->flags));
399 }
395} 400}
396 401
397/* 402/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a14..1ef464145d2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
66 66
67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
68 68
69#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
70
69struct btrfs_ordered_extent { 71struct btrfs_ordered_extent {
70 /* logical offset in the file */ 72 /* logical offset in the file */
71 u64 file_offset; 73 u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
73 /* disk byte number */ 75 /* disk byte number */
74 u64 start; 76 u64 start;
75 77
76 /* length of the extent in bytes */ 78 /* ram length of the extent in bytes */
77 u64 len; 79 u64 len;
78 80
81 /* extent length on disk */
82 u64 disk_len;
83
79 /* flags (described above) */ 84 /* flags (described above) */
80 unsigned long flags; 85 unsigned long flags;
81 86
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
127int btrfs_dec_test_ordered_pending(struct inode *inode, 132int btrfs_dec_test_ordered_pending(struct inode *inode,
128 u64 file_offset, u64 io_size); 133 u64 file_offset, u64 io_size);
129int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 134int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
130 u64 start, u64 len, int nocow); 135 u64 start, u64 len, u64 disk_len, int nocow,
136 int compressed);
131int btrfs_add_ordered_sum(struct inode *inode, 137int btrfs_add_ordered_sum(struct inode *inode,
132 struct btrfs_ordered_extent *entry, 138 struct btrfs_ordered_extent *entry,
133 struct btrfs_ordered_sum *sum); 139 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bd9ab3e9a7f..64725c13aa1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
115 if (btrfs_file_extent_type(l, fi) == 115 if (btrfs_file_extent_type(l, fi) ==
116 BTRFS_FILE_EXTENT_INLINE) { 116 BTRFS_FILE_EXTENT_INLINE) {
117 printk("\t\tinline extent data size %u\n", 117 printk("\t\tinline extent data size %u\n",
118 btrfs_file_extent_inline_len(l, item)); 118 btrfs_file_extent_inline_len(l, fi));
119 break; 119 break;
120 } 120 }
121 printk("\t\textent data disk bytenr %llu nr %llu\n", 121 printk("\t\textent data disk bytenr %llu nr %llu\n",
122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi), 122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi)); 123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
124 printk("\t\textent data offset %llu nr %llu\n", 124 printk("\t\textent data offset %llu nr %llu ram %llu\n",
125 (unsigned long long)btrfs_file_extent_offset(l, fi), 125 (unsigned long long)btrfs_file_extent_offset(l, fi),
126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi)); 126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
127 (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
127 break; 128 break;
128 case BTRFS_BLOCK_GROUP_ITEM_KEY: 129 case BTRFS_BLOCK_GROUP_ITEM_KEY:
129 bi = btrfs_item_ptr(l, i, 130 bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e6039825b7..431fdf144b5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
47#include "volumes.h" 47#include "volumes.h"
48#include "version.h" 48#include "version.h"
49#include "export.h" 49#include "export.h"
50#include "compression.h"
50 51
51#define BTRFS_SUPER_MAGIC 0x9123683E 52#define BTRFS_SUPER_MAGIC 0x9123683E
52 53
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
69enum { 70enum {
70 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 71 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
71 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 72 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
72 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err, 73 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
73}; 74};
74 75
75static match_table_t tokens = { 76static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
83 {Opt_max_inline, "max_inline=%s"}, 84 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"}, 85 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"}, 86 {Opt_thread_pool, "thread_pool=%d"},
87 {Opt_compress, "compress"},
86 {Opt_ssd, "ssd"}, 88 {Opt_ssd, "ssd"},
87 {Opt_noacl, "noacl"}, 89 {Opt_noacl, "noacl"},
88 {Opt_err, NULL}, 90 {Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
163 btrfs_set_opt(info->mount_opt, NODATACOW); 165 btrfs_set_opt(info->mount_opt, NODATACOW);
164 btrfs_set_opt(info->mount_opt, NODATASUM); 166 btrfs_set_opt(info->mount_opt, NODATASUM);
165 break; 167 break;
168 case Opt_compress:
169 printk(KERN_INFO "btrfs: use compression\n");
170 btrfs_set_opt(info->mount_opt, COMPRESS);
171 break;
166 case Opt_ssd: 172 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 173 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
168 btrfs_set_opt(info->mount_opt, SSD); 174 btrfs_set_opt(info->mount_opt, SSD);
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
622 err = btrfs_interface_init(); 628 err = btrfs_interface_init();
623 if (err) 629 if (err)
624 goto free_extent_map; 630 goto free_extent_map;
631
625 err = register_filesystem(&btrfs_fs_type); 632 err = register_filesystem(&btrfs_fs_type);
626 if (err) 633 if (err)
627 goto unregister_ioctl; 634 goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
651 unregister_filesystem(&btrfs_fs_type); 658 unregister_filesystem(&btrfs_fs_type);
652 btrfs_exit_sysfs(); 659 btrfs_exit_sysfs();
653 btrfs_cleanup_fs_uuids(); 660 btrfs_cleanup_fs_uuids();
661 btrfs_zlib_exit();
654} 662}
655 663
656module_init(init_btrfs_fs) 664module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf618cc8b34..e6d579053a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
540 if (found_type == BTRFS_FILE_EXTENT_REG) 540 if (found_type == BTRFS_FILE_EXTENT_REG)
541 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 541 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
542 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 542 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
543 size = btrfs_file_extent_inline_len(eb, 543 size = btrfs_file_extent_inline_len(eb, item);
544 btrfs_item_nr(eb, slot));
545 extent_end = (start + size + mask) & ~mask; 544 extent_end = (start + size + mask) & ~mask;
546 } else { 545 } else {
547 ret = 0; 546 ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51..7db4cfd03a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
1816 em->start = key.offset; 1816 em->start = key.offset;
1817 em->len = *num_bytes; 1817 em->len = *num_bytes;
1818 em->block_start = 0; 1818 em->block_start = 0;
1819 em->block_len = em->len;
1819 1820
1820 if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 1821 if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1821 ret = btrfs_add_system_chunk(trans, chunk_root, &key, 1822 ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2323 em->start = logical; 2324 em->start = logical;
2324 em->len = length; 2325 em->len = length;
2325 em->block_start = 0; 2326 em->block_start = 0;
2327 em->block_len = em->len;
2326 2328
2327 map->num_stripes = num_stripes; 2329 map->num_stripes = num_stripes;
2328 map->io_width = btrfs_chunk_io_width(leaf, chunk); 2330 map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 00000000000..e99309180a1
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33
34/* Plan: call deflate() with avail_in == *sourcelen,
35 avail_out = *dstlen - 12 and flush == Z_FINISH.
36 If it doesn't manage to finish, call it again with
37 avail_in == 0 and avail_out set to the remaining 12
38 bytes for it to clean up.
39 Q: Is 12 bytes sufficient?
40*/
41#define STREAM_END_SPACE 12
42
/*
 * zlib state handed out to one compress/decompress call at a time.
 * Instances are created in find_zlib_workspace() and recycled through
 * the idle_workspace list.
 */
struct workspace {
	/* inflate stream; its ->workspace is vmalloc'd at creation time */
	z_stream inf_strm;
	/* deflate stream; its ->workspace is vmalloc'd at creation time */
	z_stream def_strm;
	/* PAGE_CACHE_SIZE scratch buffer that inflate output is written to */
	char *buf;
	/* links the workspace into idle_workspace while it is not in use */
	struct list_head list;
};

/* workspaces that have been allocated but are not currently handed out */
static LIST_HEAD(idle_workspace);
/* taken around idle_workspace / num_workspace updates in find and free */
static DEFINE_SPINLOCK(workspace_lock);
/* number of entries currently on idle_workspace */
static unsigned long num_workspace;
/* number of workspaces currently allocated, idle or handed out */
static atomic_t alloc_workspace = ATOMIC_INIT(0);
/* find_zlib_workspace() sleeps here when too many workspaces exist */
static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
55
56/*
57 * this finds an available zlib workspace or allocates a new one
58 * NULL or an ERR_PTR is returned if things go bad.
59 */
60static struct workspace *find_zlib_workspace(void)
61{
62 struct workspace *workspace;
63 int ret;
64 int cpus = num_online_cpus();
65
66again:
67 spin_lock(&workspace_lock);
68 if (!list_empty(&idle_workspace)) {
69 workspace = list_entry(idle_workspace.next, struct workspace,
70 list);
71 list_del(&workspace->list);
72 num_workspace--;
73 spin_unlock(&workspace_lock);
74 return workspace;
75
76 }
77 spin_unlock(&workspace_lock);
78 if (atomic_read(&alloc_workspace) > cpus) {
79 DEFINE_WAIT(wait);
80 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
81 if (atomic_read(&alloc_workspace) > cpus)
82 schedule();
83 finish_wait(&workspace_wait, &wait);
84 goto again;
85 }
86 atomic_inc(&alloc_workspace);
87 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
88 if (!workspace) {
89 ret = -ENOMEM;
90 goto fail;
91 }
92
93 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
94 if (!workspace->def_strm.workspace) {
95 ret = -ENOMEM;
96 goto fail;
97 }
98 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
99 if (!workspace->inf_strm.workspace) {
100 ret = -ENOMEM;
101 goto fail_inflate;
102 }
103 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
104 if (!workspace->buf) {
105 ret = -ENOMEM;
106 goto fail_kmalloc;
107 }
108 return workspace;
109
110fail_kmalloc:
111 vfree(workspace->inf_strm.workspace);
112fail_inflate:
113 vfree(workspace->def_strm.workspace);
114fail:
115 kfree(workspace);
116 atomic_dec(&alloc_workspace);
117 wake_up(&workspace_wait);
118 return ERR_PTR(ret);
119}
120
121/*
122 * put a workspace struct back on the list or free it if we have enough
123 * idle ones sitting around
124 */
125static int free_workspace(struct workspace *workspace)
126{
127 spin_lock(&workspace_lock);
128 if (num_workspace < num_online_cpus()) {
129 list_add_tail(&workspace->list, &idle_workspace);
130 num_workspace++;
131 spin_unlock(&workspace_lock);
132 if (waitqueue_active(&workspace_wait))
133 wake_up(&workspace_wait);
134 return 0;
135 }
136 spin_unlock(&workspace_lock);
137 vfree(workspace->def_strm.workspace);
138 vfree(workspace->inf_strm.workspace);
139 kfree(workspace->buf);
140 kfree(workspace);
141
142 atomic_dec(&alloc_workspace);
143 if (waitqueue_active(&workspace_wait))
144 wake_up(&workspace_wait);
145 return 0;
146}
147
148/*
149 * cleanup function for module exit
150 */
151static void free_workspaces(void)
152{
153 struct workspace *workspace;
154 while(!list_empty(&idle_workspace)) {
155 workspace = list_entry(idle_workspace.next, struct workspace,
156 list);
157 list_del(&workspace->list);
158 vfree(workspace->def_strm.workspace);
159 vfree(workspace->inf_strm.workspace);
160 kfree(workspace->buf);
161 kfree(workspace);
162 atomic_dec(&alloc_workspace);
163 }
164}
165
166/*
167 * given an address space and start/len, compress the bytes.
168 *
169 * pages are allocated to hold the compressed result and stored
170 * in 'pages'
171 *
172 * out_pages is used to return the number of pages allocated. There
173 * may be pages allocated even if we return an error
174 *
175 * total_in is used to return the number of bytes actually read. It
176 * may be smaller then len if we had to exit early because we
177 * ran out of room in the pages array or because we cross the
178 * max_out threshold.
179 *
180 * total_out is used to return the total number of compressed bytes
181 *
182 * max_out tells us the max number of bytes that we're allowed to
183 * stuff into pages
184 */
185int btrfs_zlib_compress_pages(struct address_space *mapping,
186 u64 start, unsigned long len,
187 struct page **pages,
188 unsigned long nr_dest_pages,
189 unsigned long *out_pages,
190 unsigned long *total_in,
191 unsigned long *total_out,
192 unsigned long max_out)
193{
194 int ret;
195 struct workspace *workspace;
196 char *data_in;
197 char *cpage_out;
198 int nr_pages = 0;
199 struct page *in_page = NULL;
200 struct page *out_page = NULL;
201 int out_written = 0;
202 int in_read = 0;
203 unsigned long bytes_left;
204
205 *out_pages = 0;
206 *total_out = 0;
207 *total_in = 0;
208
209 workspace = find_zlib_workspace();
210 if (!workspace)
211 return -1;
212
213 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
214 printk(KERN_WARNING "deflateInit failed\n");
215 ret = -1;
216 goto out;
217 }
218
219 workspace->def_strm.total_in = 0;
220 workspace->def_strm.total_out = 0;
221
222 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
223 data_in = kmap(in_page);
224
225 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
226 cpage_out = kmap(out_page);
227 pages[0] = out_page;
228 nr_pages = 1;
229
230 workspace->def_strm.next_in = data_in;
231 workspace->def_strm.next_out = cpage_out;
232 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
233 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
234
235 out_written = 0;
236 in_read = 0;
237
238 while (workspace->def_strm.total_in < len) {
239 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
240 if (ret != Z_OK) {
241 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
242 ret);
243 zlib_deflateEnd(&workspace->def_strm);
244 ret = -1;
245 goto out;
246 }
247
248 /* we're making it bigger, give up */
249 if (workspace->def_strm.total_in > 8192 &&
250 workspace->def_strm.total_in <
251 workspace->def_strm.total_out) {
252 ret = -1;
253 goto out;
254 }
255 /* we need another page for writing out. Test this
256 * before the total_in so we will pull in a new page for
257 * the stream end if required
258 */
259 if (workspace->def_strm.avail_out == 0) {
260 kunmap(out_page);
261 if (nr_pages == nr_dest_pages) {
262 out_page = NULL;
263 ret = -1;
264 goto out;
265 }
266 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
267 cpage_out = kmap(out_page);
268 pages[nr_pages] = out_page;
269 nr_pages++;
270 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
271 workspace->def_strm.next_out = cpage_out;
272 }
273 /* we're all done */
274 if (workspace->def_strm.total_in >= len)
275 break;
276
277 /* we've read in a full page, get a new one */
278 if (workspace->def_strm.avail_in == 0) {
279 if (workspace->def_strm.total_out > max_out)
280 break;
281
282 bytes_left = len - workspace->def_strm.total_in;
283 kunmap(in_page);
284 page_cache_release(in_page);
285
286 start += PAGE_CACHE_SIZE;
287 in_page = find_get_page(mapping,
288 start >> PAGE_CACHE_SHIFT);
289 data_in = kmap(in_page);
290 workspace->def_strm.avail_in = min(bytes_left,
291 PAGE_CACHE_SIZE);
292 workspace->def_strm.next_in = data_in;
293 }
294 }
295 workspace->def_strm.avail_in = 0;
296 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
297 zlib_deflateEnd(&workspace->def_strm);
298
299 if (ret != Z_STREAM_END) {
300 ret = -1;
301 goto out;
302 }
303
304 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
305 ret = -1;
306 goto out;
307 }
308
309 ret = 0;
310 *total_out = workspace->def_strm.total_out;
311 *total_in = workspace->def_strm.total_in;
312out:
313 *out_pages = nr_pages;
314 if (out_page)
315 kunmap(out_page);
316
317 if (in_page) {
318 kunmap(in_page);
319 page_cache_release(in_page);
320 }
321 free_workspace(workspace);
322 return ret;
323}
324
325/*
326 * pages_in is an array of pages with compressed data.
327 *
328 * disk_start is the starting logical offset of this array in the file
329 *
330 * bvec is a bio_vec of pages from the file that we want to decompress into
331 *
332 * vcnt is the count of pages in the biovec
333 *
334 * srclen is the number of bytes in pages_in
335 *
336 * The basic idea is that we have a bio that was created by readpages.
337 * The pages in the bio are for the uncompressed data, and they may not
338 * be contiguous. They all correspond to the range of bytes covered by
339 * the compressed extent.
340 */
341int btrfs_zlib_decompress_biovec(struct page **pages_in,
342 u64 disk_start,
343 struct bio_vec *bvec,
344 int vcnt,
345 size_t srclen)
346{
347 int ret = 0;
348 int wbits = MAX_WBITS;
349 struct workspace *workspace;
350 char *data_in;
351 size_t total_out = 0;
352 unsigned long page_bytes_left;
353 unsigned long page_in_index = 0;
354 unsigned long page_out_index = 0;
355 struct page *page_out;
356 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
357 PAGE_CACHE_SIZE;
358 unsigned long buf_start;
359 unsigned long buf_offset;
360 unsigned long bytes;
361 unsigned long working_bytes;
362 unsigned long pg_offset;
363 unsigned long start_byte;
364 unsigned long current_buf_start;
365 char *kaddr;
366
367 workspace = find_zlib_workspace();
368 if (!workspace)
369 return -ENOMEM;
370
371 data_in = kmap(pages_in[page_in_index]);
372 workspace->inf_strm.next_in = data_in;
373 workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
374 workspace->inf_strm.total_in = 0;
375
376 workspace->inf_strm.total_out = 0;
377 workspace->inf_strm.next_out = workspace->buf;
378 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
379 page_out = bvec[page_out_index].bv_page;
380 page_bytes_left = PAGE_CACHE_SIZE;
381 pg_offset = 0;
382
383 /* If it's deflate, and it's got no preset dictionary, then
384 we can tell zlib to skip the adler32 check. */
385 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
386 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
387 !(((data_in[0]<<8) + data_in[1]) % 31)) {
388
389 wbits = -((data_in[0] >> 4) + 8);
390 workspace->inf_strm.next_in += 2;
391 workspace->inf_strm.avail_in -= 2;
392 }
393
394 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
395 printk(KERN_WARNING "inflateInit failed\n");
396 ret = -1;
397 goto out;
398 }
399 while(workspace->inf_strm.total_in < srclen) {
400 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
401 if (ret != Z_OK && ret != Z_STREAM_END) {
402 break;
403 }
404
405 /*
406 * buf start is the byte offset we're of the start of
407 * our workspace buffer
408 */
409 buf_start = total_out;
410
411 /* total_out is the last byte of the workspace buffer */
412 total_out = workspace->inf_strm.total_out;
413
414 working_bytes = total_out - buf_start;
415
416 /*
417 * start byte is the first byte of the page we're currently
418 * copying into relative to the start of the compressed data.
419 */
420 start_byte = page_offset(page_out) - disk_start;
421
422 if (working_bytes == 0) {
423 /* we didn't make progress in this inflate
424 * call, we're done
425 */
426 if (ret != Z_STREAM_END)
427 ret = -1;
428 break;
429 }
430
431 /* we haven't yet hit data corresponding to this page */
432 if (total_out <= start_byte) {
433 goto next;
434 }
435
436 /*
437 * the start of the data we care about is offset into
438 * the middle of our working buffer
439 */
440 if (total_out > start_byte && buf_start < start_byte) {
441 buf_offset = start_byte - buf_start;
442 working_bytes -= buf_offset;
443 } else {
444 buf_offset = 0;
445 }
446 current_buf_start = buf_start;
447
448 /* copy bytes from the working buffer into the pages */
449 while(working_bytes > 0) {
450 bytes = min(PAGE_CACHE_SIZE - pg_offset,
451 PAGE_CACHE_SIZE - buf_offset);
452 bytes = min(bytes, working_bytes);
453 kaddr = kmap_atomic(page_out, KM_USER0);
454 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
455 bytes);
456 kunmap_atomic(kaddr, KM_USER0);
457 flush_dcache_page(page_out);
458
459 pg_offset += bytes;
460 page_bytes_left -= bytes;
461 buf_offset += bytes;
462 working_bytes -= bytes;
463 current_buf_start += bytes;
464
465 /* check if we need to pick another page */
466 if (page_bytes_left == 0) {
467 page_out_index++;
468 if (page_out_index >= vcnt) {
469 ret = 0;
470 goto done;
471 }
472 page_out = bvec[page_out_index].bv_page;
473 pg_offset = 0;
474 page_bytes_left = PAGE_CACHE_SIZE;
475 start_byte = page_offset(page_out) - disk_start;
476
477 /*
478 * make sure our new page is covered by this
479 * working buffer
480 */
481 if (total_out <= start_byte) {
482 goto next;
483 }
484
485 /* the next page in the biovec might not
486 * be adjacent to the last page, but it
487 * might still be found inside this working
488 * buffer. bump our offset pointer
489 */
490 if (total_out > start_byte &&
491 current_buf_start < start_byte) {
492 buf_offset = start_byte - buf_start;
493 working_bytes = total_out - start_byte;
494 current_buf_start = buf_start +
495 buf_offset;
496 }
497 }
498 }
499next:
500 workspace->inf_strm.next_out = workspace->buf;
501 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
502
503 if (workspace->inf_strm.avail_in == 0) {
504 unsigned long tmp;
505 kunmap(pages_in[page_in_index]);
506 page_in_index++;
507 if (page_in_index >= total_pages_in) {
508 data_in = NULL;
509 break;
510 }
511 data_in = kmap(pages_in[page_in_index]);
512 workspace->inf_strm.next_in = data_in;
513 tmp = srclen - workspace->inf_strm.total_in;
514 workspace->inf_strm.avail_in = min(tmp,
515 PAGE_CACHE_SIZE);
516 }
517 }
518 if (ret != Z_STREAM_END) {
519 ret = -1;
520 } else {
521 ret = 0;
522 }
523done:
524 zlib_inflateEnd(&workspace->inf_strm);
525 if (data_in)
526 kunmap(pages_in[page_in_index]);
527out:
528 free_workspace(workspace);
529 return ret;
530}
531
532/*
533 * a less complex decompression routine. Our compressed data fits in a
534 * single page, and we want to read a single page out of it.
535 * start_byte tells us the offset into the compressed data we're interested in
536 */
537int btrfs_zlib_decompress(unsigned char *data_in,
538 struct page *dest_page,
539 unsigned long start_byte,
540 size_t srclen, size_t destlen)
541{
542 int ret = 0;
543 int wbits = MAX_WBITS;
544 struct workspace *workspace;
545 unsigned long bytes_left = destlen;
546 unsigned long total_out = 0;
547 char *kaddr;
548
549 if (destlen > PAGE_CACHE_SIZE)
550 return -ENOMEM;
551
552 workspace = find_zlib_workspace();
553 if (!workspace)
554 return -ENOMEM;
555
556 workspace->inf_strm.next_in = data_in;
557 workspace->inf_strm.avail_in = srclen;
558 workspace->inf_strm.total_in = 0;
559
560 workspace->inf_strm.next_out = workspace->buf;
561 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
562 workspace->inf_strm.total_out = 0;
563 /* If it's deflate, and it's got no preset dictionary, then
564 we can tell zlib to skip the adler32 check. */
565 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
566 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
567 !(((data_in[0]<<8) + data_in[1]) % 31)) {
568
569 wbits = -((data_in[0] >> 4) + 8);
570 workspace->inf_strm.next_in += 2;
571 workspace->inf_strm.avail_in -= 2;
572 }
573
574 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
575 printk(KERN_WARNING "inflateInit failed\n");
576 ret = -1;
577 goto out;
578 }
579
580 while(bytes_left > 0) {
581 unsigned long buf_start;
582 unsigned long buf_offset;
583 unsigned long bytes;
584 unsigned long pg_offset = 0;
585
586 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
587 if (ret != Z_OK && ret != Z_STREAM_END) {
588 break;
589 }
590
591 buf_start = total_out;
592 total_out = workspace->inf_strm.total_out;
593
594 if (total_out == buf_start) {
595 ret = -1;
596 break;
597 }
598
599 if (total_out <= start_byte) {
600 goto next;
601 }
602
603 if (total_out > start_byte && buf_start < start_byte) {
604 buf_offset = start_byte - buf_start;
605 } else {
606 buf_offset = 0;
607 }
608
609 bytes = min(PAGE_CACHE_SIZE - pg_offset,
610 PAGE_CACHE_SIZE - buf_offset);
611 bytes = min(bytes, bytes_left);
612
613 kaddr = kmap_atomic(dest_page, KM_USER0);
614 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
615 kunmap_atomic(kaddr, KM_USER0);
616
617 pg_offset += bytes;
618 bytes_left -= bytes;
619next:
620 workspace->inf_strm.next_out = workspace->buf;
621 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
622 }
623 if (ret != Z_STREAM_END && bytes_left != 0) {
624 ret = -1;
625 } else {
626 ret = 0;
627 }
628 zlib_inflateEnd(&workspace->inf_strm);
629out:
630 free_workspace(workspace);
631 return ret;
632}
633
/*
 * Exported teardown hook: releases every workspace still sitting on the
 * idle list by handing off to free_workspaces().
 */
void btrfs_zlib_exit(void)
{
	free_workspaces();
}