Diffstat (limited to 'fs/btrfs')
-rw-r--r--   fs/btrfs/Makefile         |    3
-rw-r--r--   fs/btrfs/compression.c    |  454
-rw-r--r--   fs/btrfs/compression.h    |   47
-rw-r--r--   fs/btrfs/ctree.h          |   99
-rw-r--r--   fs/btrfs/disk-io.c        |   18
-rw-r--r--   fs/btrfs/disk-io.h        |    1
-rw-r--r--   fs/btrfs/extent-tree.c    |   27
-rw-r--r--   fs/btrfs/extent_io.c      |  411
-rw-r--r--   fs/btrfs/extent_io.h      |   17
-rw-r--r--   fs/btrfs/extent_map.c     |    9
-rw-r--r--   fs/btrfs/extent_map.h     |    6
-rw-r--r--   fs/btrfs/file-item.c      |   75
-rw-r--r--   fs/btrfs/file.c           |  263
-rw-r--r--   fs/btrfs/inode.c          |  584
-rw-r--r--   fs/btrfs/ordered-data.c   |    9
-rw-r--r--   fs/btrfs/ordered-data.h   |   10
-rw-r--r--   fs/btrfs/print-tree.c     |    7
-rw-r--r--   fs/btrfs/super.c          |   10
-rw-r--r--   fs/btrfs/tree-log.c       |    3
-rw-r--r--   fs/btrfs/volumes.c        |    2
-rw-r--r--   fs/btrfs/zlib.c           |  637
21 files changed, 2313 insertions(+), 379 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142b..d2cf5a54a4b8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..c5470367ca5c
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* number of bios pending for this compressed extent */
+	atomic_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* IO errors */
+	int errors;
+
+	/* for reads, this is the bio we are copying the data into */
+	struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	struct bio *bio;
+	int nr_vecs;
+
+	nr_vecs = bio_get_nr_vecs(bdev);
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_size = 0;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_byte >> 9;
+	}
+	return bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, let's start
+	 * the decompression.
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+					   cb->start,
+					   cb->orig_bio->bi_io_vec,
+					   cb->orig_bio->bi_vcnt,
+					   cb->compressed_len);
+	if (ret)
+		cb->errors = 1;
+
+	/* release the compressed pages */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors)
+		bio_io_error(cb->orig_bio);
+	else
+		bio_endio(cb->orig_bio, 0);
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+					     unsigned long ram_size)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int ret;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < ret; i++) {
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+	}
+	/* the inode may be gone now */
+	return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, step one is to
+	 * call back into the FS and do all the end_io operations
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					 cb->start,
+					 cb->start + cb->len - 1,
+					 NULL, 1);
+
+	end_compressed_writeback(inode, cb->start, cb->len);
+	/* note, our inode could be gone now */
+
+	/*
+	 * release the compressed pages, these came from alloc_page and
+	 * are not attached to the inode at all
+	 */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int page_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	ret = btrfs_csum_file_bytes(root, inode, start, len);
+	BUG_ON(ret);
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+
+	/* create and submit bios for the compressed pages */
+	bytes_left = compressed_len;
+	while (bytes_left > 0) {
+		page = compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (bio->bi_size)
+			ret = io_tree->ops->merge_bio_hook(page, 0,
+							   PAGE_CACHE_SIZE,
+							   bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret);
+
+			bio_put(bio);
+
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		page_index++;
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long page_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	struct extent_map *em;
+	int ret;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+
+	cb->start = em->start;
+	compressed_len = em->block_len;
+	free_extent_map(em);
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = bio;
+
+	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+		   PAGE_CACHE_SIZE;
+	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+				       GFP_NOFS);
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+	}
+	cb->nr_pages = nr_pages;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		page = cb->compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (comp_bio->bi_size)
+			ret = tree->ops->merge_bio_hook(page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			BUG_ON(ret);
+
+			bio_put(comp_bio);
+
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
441 }
442 cur_disk_byte += PAGE_CACHE_SIZE;
443 }
444 bio_get(comp_bio);
445
446 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
447 BUG_ON(ret);
448
449 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
450 BUG_ON(ret);
451
452 bio_put(comp_bio);
453 return 0;
454}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags);
+#endif
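The zlib entry points declared here live in the new `zlib.c` (outside the range shown). `btrfs_zlib_compress_pages()` deflates a file range into an array of freshly allocated pages and reports bytes consumed and produced, so the caller can give up when compression is not paying for itself. A rough userspace analogue with stock zlib is sketched below; the 4 KiB page size, page count, and helper name are assumptions for illustration, not the btrfs API:

```c
/* build with: cc zlib_pages.c -lz */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

#define PAGE_SIZE 4096

/*
 * deflate src into up to nr_dest PAGE_SIZE buffers; returns the number
 * of buffers used, or -1 on error / when the output did not fit
 */
static int compress_pages(const unsigned char *src, size_t srclen,
			  unsigned char **pages, int nr_dest,
			  size_t *total_out)
{
	z_stream s = {0};
	int used = 0;
	int ret = Z_OK;

	if (deflateInit(&s, Z_DEFAULT_COMPRESSION) != Z_OK)
		return -1;
	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;

	while (ret != Z_STREAM_END && used < nr_dest) {
		pages[used] = malloc(PAGE_SIZE);	/* one "page" at a time */
		s.next_out = pages[used];
		s.avail_out = PAGE_SIZE;
		used++;
		ret = deflate(&s, Z_FINISH);	/* Z_OK: more output pending */
		if (ret != Z_OK && ret != Z_STREAM_END) {
			deflateEnd(&s);
			return -1;
		}
	}
	*total_out = s.total_out;
	deflateEnd(&s);
	return ret == Z_STREAM_END ? used : -1;	/* -1: ran out of pages */
}

int main(void)
{
	unsigned char buf[32768];
	unsigned char *pages[8];
	size_t out = 0;

	memset(buf, 'a', sizeof(buf));	/* highly compressible input */
	int n = compress_pages(buf, sizeof(buf), pages, 8, &out);
	printf("%zu bytes in -> %zu bytes out across %d page(s)\n",
	       sizeof(buf), out, n);
	for (int i = 0; i < n; i++)
		free(pages[i]);
	return 0;
}
```

The `max_out`-style cutoff in the real API serves the same purpose as `nr_dest` here: if deflate cannot finish within the budget, the extent is written uncompressed instead.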
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47f..793d8fdda244 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-/*
- * there is no padding here on purpose. If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+	BTRFS_COMPRESS_NONE = 0,
+	BTRFS_COMPRESS_ZLIB = 1,
+	BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+	BTRFS_ENCRYPTION_NONE = 0,
+	BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
 struct btrfs_inode_item {
 	/* nfs style generation number */
 	__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
+
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
 #define BTRFS_FILE_EXTENT_INLINE 1
 
 struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
 	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be. So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption. If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
 	u8 type;
+
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
 	 */
 	__le64 offset;
 	/*
-	 * the logical number of file blocks (no csums included)
+	 * the logical number of file blocks (no csums included). This
+	 * always reflects the size uncompressed and without encoding.
 	 */
 	__le64 num_bytes;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
+#define BTRFS_MOUNT_COMPRESS		(1 << 5)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATASUM		(1 << 0)
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-					       struct btrfs_item *e)
-{
-	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return btrfs_item_size(eb, e) - offset;
-}
-
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
 		   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
 		   offset, 64);
 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
 		   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+		   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+		   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+		   other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       struct btrfs_file_extent_item *e)
+{
+	return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+						    struct btrfs_item *e)
+{
+	unsigned long offset;
+	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+	return btrfs_item_size(eb, e) - offset;
+}
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
-			     u64 objectid, u64 pos, u64 disk_offset,
-			     u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset);
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 			       int namelen);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio);
+			 size_t size, struct bio *bio, unsigned long bio_flags);
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			     struct file_ra_state *ra, struct file *file,
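Because `btrfs_file_extent_item` is packed and stored on disk byte-for-byte, inserting `ram_bytes` and the encoding fields ahead of `type` moves every later field: this is a disk-format change, and it is why `btrfs_file_extent_inline_item_len()` subtracts `offsetof(..., disk_bytenr)` — inline extents store their data where `disk_bytenr` would otherwise begin. A quick offset check of the new layout, using plain fixed-width integers as stand-ins for the little-endian `__le64`/`__le16` on-disk types:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * stand-in for the on-disk btrfs_file_extent_item; plain fixed-width
 * integers replace __le64/__le16 (which are little-endian on disk)
 */
struct file_extent_item {
	uint64_t generation;
	uint64_t ram_bytes;
	uint8_t  compression;
	uint8_t  encryption;
	uint16_t other_encoding;
	uint8_t  type;
	uint64_t disk_bytenr;
	uint64_t disk_num_bytes;
	uint64_t offset;
	uint64_t num_bytes;
} __attribute__((__packed__));

int main(void)
{
	/* inline extents store their data where disk_bytenr would begin */
	printf("ram_bytes   at byte %zu\n",
	       offsetof(struct file_extent_item, ram_bytes));	/* 8  */
	printf("type        at byte %zu\n",
	       offsetof(struct file_extent_item, type));	/* 20 */
	printf("disk_bytenr at byte %zu\n",
	       offsetof(struct file_extent_item, disk_bytenr));	/* 21 */
	printf("item size   %zu bytes\n",
	       sizeof(struct file_extent_item));		/* 53 */
	return 0;
}
```

On this layout the inline-extent header is 21 bytes, matching `btrfs_file_extent_calc_inline_size()`, which returns that offset plus the inline data size.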
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb6194..dc95f636a11b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	unsigned long bio_flags;
 	struct btrfs_work work;
 };
 
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	}
 	em->start = 0;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 	wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
-			       async->mirror_num);
+			       async->mirror_num, async->bio_flags);
 	kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+	async->bio_flags = bio_flags;
 
 	while(atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
 }
 
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				   int mirror_num)
+				   int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
 	 */
 	if (!(rw & (1 << BIO_RW))) {
-		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+		return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num,
+				   inode, rw, bio, mirror_num, 0,
 				   __btree_submit_bio_hook);
 }
 
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbbb..4eb1f1408d21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
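The disk-io changes are plumbing for the new flag: a bio's `bio_flags` must survive the detour through the async submit workers, so it is captured in `struct async_submit_bio` when the work item is queued and handed back to the hook when `run_one_async_submit()` fires. A toy capture-and-replay version of that pattern — a plain array instead of the btrfs worker threads, and all names illustrative:

```c
#include <stdio.h>

/* hook signature mirrors extent_submit_bio_hook_t gaining bio_flags */
typedef int (*submit_hook_t)(int rw, int bio_id, unsigned long bio_flags);

/* everything the worker will need is captured at queue time */
struct async_submit {
	int rw;
	int bio_id;
	unsigned long bio_flags;
	submit_hook_t hook;
};

static int my_hook(int rw, int bio_id, unsigned long bio_flags)
{
	printf("bio %d rw=%d flags=%#lx\n", bio_id, rw, bio_flags);
	return 0;
}

/* plays the role of run_one_async_submit() */
static void run_worker(struct async_submit *a)
{
	a->hook(a->rw, a->bio_id, a->bio_flags);
}

int main(void)
{
	struct async_submit queue[2] = {
		{ .rw = 1, .bio_id = 0, .bio_flags = 0x0, .hook = my_hook },
		{ .rw = 1, .bio_id = 1, .bio_flags = 0x1 /* "compressed" */,
		  .hook = my_hook },
	};
	for (int i = 0; i < 2; i++)
		run_worker(&queue[i]);
	return 0;
}
```

Without the captured field, the flag would be lost between submission context and worker context — which is exactly why the struct grows a member here rather than the hook alone growing a parameter.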
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6d..bbf04e80a1a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 
 	em->start = extent_key->objectid - offset;
 	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
 	em->block_start = extent_key->objectid;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
 };
 
 struct disk_extent {
+	u64 ram_bytes;
 	u64 disk_bytenr;
 	u64 disk_num_bytes;
 	u64 offset;
 	u64 num_bytes;
+	u8 compression;
+	u8 encryption;
+	u16 other_encoding;
 };
 
 static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+									   fi);
 		WARN_ON(exts[nr].offset > 0);
 		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
@@ -3846,6 +3856,8 @@ next:
 					new_extents[0].disk_bytenr);
 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 					new_extents[0].disk_num_bytes);
+				btrfs_set_file_extent_ram_bytes(leaf, fi,
+					new_extents[0].ram_bytes);
 				ext_offset += new_extents[0].offset;
 				btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 				btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
 					new_extents[i].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 					new_extents[i].disk_num_bytes);
+			btrfs_set_file_extent_ram_bytes(leaf, fi,
+					new_extents[i].ram_bytes);
+
+			btrfs_set_file_extent_compression(leaf, fi,
+					new_extents[i].compression);
+			btrfs_set_file_extent_encryption(leaf, fi,
+					new_extents[i].encryption);
+			btrfs_set_file_extent_other_encoding(leaf, fi,
+					new_extents[i].other_encoding);
+
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 					extent_len);
 			ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 	ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+	btrfs_set_file_extent_ram_bytes(leaf, fi,
+					new_extent->ram_bytes);
 	btrfs_set_file_extent_disk_bytenr(leaf, fi,
 					  new_extent->disk_bytenr);
 	btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	BUG_ON(err);
 
 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0);
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
 	BUG_ON(err);
 
 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f2..314041fdfa43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-					     u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+					u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	u64 total_bytes = 0;
 
 	spin_lock_irq(&tree->lock);
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
 		if (!found)
@@ -1100,40 +1101,6 @@ search_again:
 		*end = state->end;
 		goto out;
 	}
-	if (!found && !(state->state & EXTENT_BOUNDARY)) {
-		struct extent_state *prev_state;
-		struct rb_node *prev_node = node;
-		while(1) {
-			prev_node = rb_prev(prev_node);
-			if (!prev_node)
-				break;
-			prev_state = rb_entry(prev_node,
-					      struct extent_state,
-					      rb_node);
-			if ((prev_state->end + 1 != state->start) ||
-			    !(prev_state->state & EXTENT_DELALLOC))
-				break;
-			if ((cur_start - prev_state->start) * 2 >
-			     max_bytes)
-				break;
-			state = prev_state;
-			node = prev_node;
-		}
-	}
-	if (state->state & EXTENT_LOCKED) {
-		DEFINE_WAIT(wait);
-		atomic_inc(&state->refs);
-		prepare_to_wait(&state->wq, &wait,
-				TASK_UNINTERRUPTIBLE);
-		spin_unlock_irq(&tree->lock);
-		schedule();
-		spin_lock_irq(&tree->lock);
-		finish_wait(&state->wq, &wait);
-		free_extent_state(state);
-		goto search_again;
-	}
-	set_state_cb(tree, state, EXTENT_LOCKED);
-	state->state |= EXTENT_LOCKED;
 	if (!found)
 		*start = state->start;
 	found++;
@@ -1151,6 +1118,208 @@ out:
 	return found;
 }
 
+static noinline int __unlock_for_delalloc(struct inode *inode,
+					  struct page *locked_page,
+					  u64 start, u64 end)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+
+	if (index == locked_page->index && end_index == index)
+		return 0;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long start_index = index;
+	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long pages_locked = 0;
+	struct page *pages[16];
+	unsigned long nrpages;
+	int ret;
+	int i;
+
+	/* the caller is responsible for locking the start index */
+	if (index == locked_page->index && index == end_index)
+		return 0;
+
+	/* skip the page at the start index */
+	nrpages = end_index - index + 1;
+	while (nrpages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nrpages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			ret = -EAGAIN;
+			goto done;
+		}
+		/* now we have an array of pages, lock them all */
+		for (i = 0; i < ret; i++) {
+			/*
+			 * the caller is taking responsibility for
+			 * locked_page
+			 */
+			if (pages[i] != locked_page)
+				lock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		pages_locked += ret;
+		nrpages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	ret = 0;
+done:
+	if (ret && pages_locked) {
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start,
+			      ((u64)(start_index + pages_locked - 1)) <<
+			      PAGE_CACHE_SHIFT);
+	}
+	return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+					     struct extent_io_tree *tree,
+					     struct page *locked_page,
+					     u64 *start, u64 *end,
+					     u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes);
+	if (!found) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		return found;
+	}
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 * if we're looping.
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+		delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
+			~((u64)PAGE_CACHE_SIZE - 1);
+	}
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, let's avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		if (!loops) {
+			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+			max_bytes = PAGE_CACHE_SIZE - offset;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret);
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1);
+	if (!ret) {
+		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+				      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode,
+				 struct extent_io_tree *tree,
+				 u64 start, u64 end, struct page *locked_page,
+				 int clear_dirty, int set_writeback,
+				 int end_writeback)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+	if (clear_dirty)
+		clear_bits |= EXTENT_DIRTY;
+
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] == locked_page) {
+				page_cache_release(pages[i]);
+				continue;
+			}
+			if (clear_dirty)
+				clear_page_dirty_for_io(pages[i]);
+			if (set_writeback)
+				set_page_writeback(pages[i]);
+			if (end_writeback)
+				end_page_writeback(pages[i]);
+			unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set. This can be fairly slow, except for EXTENT_DIRTY which is
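The delalloc helpers added above are deliberately failure-tolerant: `lock_delalloc_pages()` grabs pages in batches of up to 16 with `find_get_pages_contig()`, and when a page has vanished it unlocks everything it already locked and returns `-EAGAIN`, after which `find_lock_delalloc_range()` retries once with the range clamped down to a single page. A compact sketch of that batch-with-rollback shape over a plain array — array indices stand in for page-cache lookups, none of this is the kernel API:

```c
#include <stdbool.h>
#include <stdio.h>

static bool page_present[64];	/* stand-in for the page cache */
static bool page_locked[64];

/* lock [start, end]; on a missing page, undo the partial work like
 * __unlock_for_delalloc() and report failure (-EAGAIN in the kernel) */
static int lock_range(int start, int end)
{
	int locked = 0;

	for (int i = start; i <= end; i++) {	/* the kernel walks batches of 16 */
		if (!page_present[i]) {
			for (int j = start; j < start + locked; j++)
				page_locked[j] = false;
			return -1;
		}
		page_locked[i] = true;
		locked++;
	}
	return 0;
}

int main(void)
{
	for (int i = 0; i < 64; i++)
		page_present[i] = (i != 40);	/* page 40 was reclaimed */

	int start = 32, end = 47;
	if (lock_range(start, end)) {
		/* retry shrunk to a single page, as find_lock_delalloc_range does */
		end = start;
		printf("retry on [%d,%d]: %s\n", start, end,
		       lock_range(start, end) ? "failed" : "ok");
	}
	return 0;
}
```

The final `test_range_bit()` pass in the real function closes the remaining race: if the range stopped being delalloc while the pages were being locked, everything is unlocked and the search starts over.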
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1631 return bio; 1800 return bio;
1632} 1801}
1633 1802
1634static int submit_one_bio(int rw, struct bio *bio, int mirror_num) 1803static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1804 unsigned long bio_flags)
1635{ 1805{
1636 int ret = 0; 1806 int ret = 0;
1637 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1807 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1638 struct page *page = bvec->bv_page; 1808 struct page *page = bvec->bv_page;
1639 struct extent_io_tree *tree = bio->bi_private; 1809 struct extent_io_tree *tree = bio->bi_private;
1640 struct rb_node *node;
1641 struct extent_state *state;
1642 u64 start; 1810 u64 start;
1643 u64 end; 1811 u64 end;
1644 1812
1645 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1813 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1646 end = start + bvec->bv_len - 1; 1814 end = start + bvec->bv_len - 1;
1647 1815
1648 spin_lock_irq(&tree->lock);
1649 node = __etree_search(tree, start, NULL, NULL);
1650 BUG_ON(!node);
1651 state = rb_entry(node, struct extent_state, rb_node);
1652 while(state->end < end) {
1653 node = rb_next(node);
1654 state = rb_entry(node, struct extent_state, rb_node);
1655 }
1656 BUG_ON(state->end != end);
1657 spin_unlock_irq(&tree->lock);
1658
1659 bio->bi_private = NULL; 1816 bio->bi_private = NULL;
1660 1817
1661 bio_get(bio); 1818 bio_get(bio);
1662 1819
1663 if (tree->ops && tree->ops->submit_bio_hook) 1820 if (tree->ops && tree->ops->submit_bio_hook)
1664 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1821 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1665 mirror_num); 1822 mirror_num, bio_flags);
1666 else 1823 else
1667 submit_bio(rw, bio); 1824 submit_bio(rw, bio);
1668 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1825 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1678 struct bio **bio_ret, 1835 struct bio **bio_ret,
1679 unsigned long max_pages, 1836 unsigned long max_pages,
1680 bio_end_io_t end_io_func, 1837 bio_end_io_t end_io_func,
1681 int mirror_num) 1838 int mirror_num,
1839 unsigned long prev_bio_flags,
1840 unsigned long bio_flags)
1682{ 1841{
1683 int ret = 0; 1842 int ret = 0;
1684 struct bio *bio; 1843 struct bio *bio;
1685 int nr; 1844 int nr;
1845 int contig = 0;
1846 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1847 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1848 size_t page_size = min(size, PAGE_CACHE_SIZE);
1686 1849
1687 if (bio_ret && *bio_ret) { 1850 if (bio_ret && *bio_ret) {
1688 bio = *bio_ret; 1851 bio = *bio_ret;
1689 if (bio->bi_sector + (bio->bi_size >> 9) != sector || 1852 if (old_compressed)
1853 contig = bio->bi_sector == sector;
1854 else
1855 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1856 sector;
1857
1858 if (prev_bio_flags != bio_flags || !contig ||
1690 (tree->ops && tree->ops->merge_bio_hook && 1859 (tree->ops && tree->ops->merge_bio_hook &&
1691 tree->ops->merge_bio_hook(page, offset, size, bio)) || 1860 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1692 bio_add_page(bio, page, size, offset) < size) { 1861 bio_flags)) ||
1693 ret = submit_one_bio(rw, bio, mirror_num); 1862 bio_add_page(bio, page, page_size, offset) < page_size) {
1863 ret = submit_one_bio(rw, bio, mirror_num,
1864 prev_bio_flags);
1694 bio = NULL; 1865 bio = NULL;
1695 } else { 1866 } else {
1696 return 0; 1867 return 0;
1697 } 1868 }
1698 } 1869 }
1699 nr = bio_get_nr_vecs(bdev); 1870 if (this_compressed)
1871 nr = BIO_MAX_PAGES;
1872 else
1873 nr = bio_get_nr_vecs(bdev);
1874
1700 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1875 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1701 if (!bio) { 1876 if (!bio) {
1702 printk("failed to allocate bio nr %d\n", nr); 1877 printk("failed to allocate bio nr %d\n", nr);
1703 } 1878 }
1704 1879
1705 1880 bio_add_page(bio, page, page_size, offset);
1706 bio_add_page(bio, page, size, offset);
1707 bio->bi_end_io = end_io_func; 1881 bio->bi_end_io = end_io_func;
1708 bio->bi_private = tree; 1882 bio->bi_private = tree;
1709 1883
1710 if (bio_ret) { 1884 if (bio_ret) {
1711 *bio_ret = bio; 1885 *bio_ret = bio;
1712 } else { 1886 } else {
1713 ret = submit_one_bio(rw, bio, mirror_num); 1887 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1714 } 1888 }
1715 1889
1716 return ret; 1890 return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
1738static int __extent_read_full_page(struct extent_io_tree *tree, 1912static int __extent_read_full_page(struct extent_io_tree *tree,
1739 struct page *page, 1913 struct page *page,
1740 get_extent_t *get_extent, 1914 get_extent_t *get_extent,
1741 struct bio **bio, int mirror_num) 1915 struct bio **bio, int mirror_num,
1916 unsigned long *bio_flags)
1742{ 1917{
1743 struct inode *inode = page->mapping->host; 1918 struct inode *inode = page->mapping->host;
1744 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1919 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
1756 int nr = 0; 1931 int nr = 0;
1757 size_t page_offset = 0; 1932 size_t page_offset = 0;
1758 size_t iosize; 1933 size_t iosize;
1934 size_t disk_io_size;
1759 size_t blocksize = inode->i_sb->s_blocksize; 1935 size_t blocksize = inode->i_sb->s_blocksize;
1936 unsigned long this_bio_flag = 0;
1760 1937
1761 set_page_extent_mapped(page); 1938 set_page_extent_mapped(page);
1762 1939
1763 end = page_end; 1940 end = page_end;
1764 lock_extent(tree, start, end, GFP_NOFS); 1941 lock_extent(tree, start, end, GFP_NOFS);
1765 1942
1943 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1944 char *userpage;
1945 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1946
1947 if (zero_offset) {
1948 iosize = PAGE_CACHE_SIZE - zero_offset;
1949 userpage = kmap_atomic(page, KM_USER0);
1950 memset(userpage + zero_offset, 0, iosize);
1951 flush_dcache_page(page);
1952 kunmap_atomic(userpage, KM_USER0);
1953 }
1954 }
1766 while (cur <= end) { 1955 while (cur <= end) {
1767 if (cur >= last_byte) { 1956 if (cur >= last_byte) {
1768 char *userpage; 1957 char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1793 } 1982 }
1794 BUG_ON(end < cur); 1983 BUG_ON(end < cur);
1795 1984
1985 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1986 this_bio_flag = EXTENT_BIO_COMPRESSED;
1987
1796 iosize = min(extent_map_end(em) - cur, end - cur + 1); 1988 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1797 cur_end = min(extent_map_end(em) - 1, end); 1989 cur_end = min(extent_map_end(em) - 1, end);
1798 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1990 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1799 sector = (em->block_start + extent_offset) >> 9; 1991 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
1992 disk_io_size = em->block_len;
1993 sector = em->block_start >> 9;
1994 } else {
1995 sector = (em->block_start + extent_offset) >> 9;
1996 disk_io_size = iosize;
1997 }
1800 bdev = em->bdev; 1998 bdev = em->bdev;
1801 block_start = em->block_start; 1999 block_start = em->block_start;
1802 free_extent_map(em); 2000 free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1845 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2043 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1846 pnr -= page->index; 2044 pnr -= page->index;
1847 ret = submit_extent_page(READ, tree, page, 2045 ret = submit_extent_page(READ, tree, page,
1848 sector, iosize, page_offset, 2046 sector, disk_io_size, page_offset,
1849 bdev, bio, pnr, 2047 bdev, bio, pnr,
1850 end_bio_extent_readpage, mirror_num); 2048 end_bio_extent_readpage, mirror_num,
2049 *bio_flags,
2050 this_bio_flag);
1851 nr++; 2051 nr++;
2052 *bio_flags = this_bio_flag;
1852 } 2053 }
1853 if (ret) 2054 if (ret)
1854 SetPageError(page); 2055 SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1867 get_extent_t *get_extent) 2068 get_extent_t *get_extent)
1868{ 2069{
1869 struct bio *bio = NULL; 2070 struct bio *bio = NULL;
2071 unsigned long bio_flags = 0;
1870 int ret; 2072 int ret;
1871 2073
1872 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0); 2074 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2075 &bio_flags);
1873 if (bio) 2076 if (bio)
1874 submit_one_bio(READ, bio, 0); 2077 submit_one_bio(READ, bio, 0, bio_flags);
1875 return ret; 2078 return ret;
1876} 2079}
1877EXPORT_SYMBOL(extent_read_full_page); 2080EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1909 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2112 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1910 u64 nr_delalloc; 2113 u64 nr_delalloc;
1911 u64 delalloc_end; 2114 u64 delalloc_end;
2115 int page_started;
2116 int compressed;
1912 2117
1913 WARN_ON(!PageLocked(page)); 2118 WARN_ON(!PageLocked(page));
1914 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2119 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1934 2139
1935 delalloc_start = start; 2140 delalloc_start = start;
1936 delalloc_end = 0; 2141 delalloc_end = 0;
2142 page_started = 0;
1937 while(delalloc_end < page_end) { 2143 while(delalloc_end < page_end) {
1938 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, 2144 nr_delalloc = find_lock_delalloc_range(inode, tree,
2145 page,
2146 &delalloc_start,
1939 &delalloc_end, 2147 &delalloc_end,
1940 128 * 1024 * 1024); 2148 128 * 1024 * 1024);
1941 if (nr_delalloc == 0) { 2149 if (nr_delalloc == 0) {
1942 delalloc_start = delalloc_end + 1; 2150 delalloc_start = delalloc_end + 1;
1943 continue; 2151 continue;
1944 } 2152 }
1945 tree->ops->fill_delalloc(inode, delalloc_start, 2153 tree->ops->fill_delalloc(inode, page, delalloc_start,
1946 delalloc_end); 2154 delalloc_end, &page_started);
1947 clear_extent_bit(tree, delalloc_start,
1948 delalloc_end,
1949 EXTENT_LOCKED | EXTENT_DELALLOC,
1950 1, 0, GFP_NOFS);
1951 delalloc_start = delalloc_end + 1; 2155 delalloc_start = delalloc_end + 1;
1952 } 2156 }
2157
2158 /* did the fill delalloc function already unlock and start the IO? */
2159 if (page_started) {
2160 return 0;
2161 }
2162
1953 lock_extent(tree, start, page_end, GFP_NOFS); 2163 lock_extent(tree, start, page_end, GFP_NOFS);
1954 unlock_start = start; 2164 unlock_start = start;
1955 2165
1956 if (tree->ops && tree->ops->writepage_start_hook) { 2166 if (tree->ops && tree->ops->writepage_start_hook) {
1957 ret = tree->ops->writepage_start_hook(page, start, page_end); 2167 ret = tree->ops->writepage_start_hook(page, start,
2168 page_end);
1958 if (ret == -EAGAIN) { 2169 if (ret == -EAGAIN) {
1959 unlock_extent(tree, start, page_end, GFP_NOFS); 2170 unlock_extent(tree, start, page_end, GFP_NOFS);
1960 redirty_page_for_writepage(wbc, page); 2171 redirty_page_for_writepage(wbc, page);
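The fill_delalloc hook can now consume locked_page itself: with compression the pages go to disk through btrfs_submit_compressed_write() long before __extent_writepage() reaches its own IO loop, and *page_started is how the hook reports that. A sketch of the contract, assuming the cow_file_range() signature added later in this patch:

/*
 * Sketch (not part of the patch): a fill_delalloc implementation
 * honoring the new contract. When the hook unlocks locked_page and
 * starts IO on it, it sets *page_started so the caller returns
 * without touching the page again.
 */
static int example_fill_delalloc(struct inode *inode,
				 struct page *locked_page,
				 u64 start, u64 end, int *page_started)
{
	*page_started = 0;
	/* cow_file_range() sets *page_started = 1 when it takes over */
	return cow_file_range(inode, locked_page, start, end, page_started);
}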
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2006 sector = (em->block_start + extent_offset) >> 9; 2217 sector = (em->block_start + extent_offset) >> 9;
2007 bdev = em->bdev; 2218 bdev = em->bdev;
2008 block_start = em->block_start; 2219 block_start = em->block_start;
2220 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2009 free_extent_map(em); 2221 free_extent_map(em);
2010 em = NULL; 2222 em = NULL;
2011 2223
2012 if (block_start == EXTENT_MAP_HOLE || 2224 /*
2225 * compressed and inline extents are written through other
2226 * paths in the FS
2227 */
2228 if (compressed || block_start == EXTENT_MAP_HOLE ||
2013 block_start == EXTENT_MAP_INLINE) { 2229 block_start == EXTENT_MAP_INLINE) {
2014 clear_extent_dirty(tree, cur, 2230 clear_extent_dirty(tree, cur,
2015 cur + iosize - 1, GFP_NOFS); 2231 cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2017 unlock_extent(tree, unlock_start, cur + iosize -1, 2233 unlock_extent(tree, unlock_start, cur + iosize -1,
2018 GFP_NOFS); 2234 GFP_NOFS);
2019 2235
2020 if (tree->ops && tree->ops->writepage_end_io_hook) 2236 /*
2237 * end_io notification does not happen here for
2238 * compressed extents
2239 */
2240 if (!compressed && tree->ops &&
2241 tree->ops->writepage_end_io_hook)
2021 tree->ops->writepage_end_io_hook(page, cur, 2242 tree->ops->writepage_end_io_hook(page, cur,
2022 cur + iosize - 1, 2243 cur + iosize - 1,
2023 NULL, 1); 2244 NULL, 1);
2024 cur = cur + iosize; 2245 else if (compressed) {
2246 /* we don't want to end_page_writeback on
2247 * a compressed extent. this happens
2248 * elsewhere
2249 */
2250 nr++;
2251 }
2252
2253 cur += iosize;
2025 pg_offset += iosize; 2254 pg_offset += iosize;
2026 unlock_start = cur; 2255 unlock_start = cur;
2027 continue; 2256 continue;
2028 } 2257 }
2029
2030 /* leave this out until we have a page_mkwrite call */ 2258 /* leave this out until we have a page_mkwrite call */
2031 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2259 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2032 EXTENT_DIRTY, 0)) { 2260 EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2034 pg_offset += iosize; 2262 pg_offset += iosize;
2035 continue; 2263 continue;
2036 } 2264 }
2265
2037 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); 2266 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2038 if (tree->ops && tree->ops->writepage_io_hook) { 2267 if (tree->ops && tree->ops->writepage_io_hook) {
2039 ret = tree->ops->writepage_io_hook(page, cur, 2268 ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2057 ret = submit_extent_page(WRITE, tree, page, sector, 2286 ret = submit_extent_page(WRITE, tree, page, sector,
2058 iosize, pg_offset, bdev, 2287 iosize, pg_offset, bdev,
2059 &epd->bio, max_nr, 2288 &epd->bio, max_nr,
2060 end_bio_extent_writepage, 0); 2289 end_bio_extent_writepage,
2290 0, 0, 0);
2061 if (ret) 2291 if (ret)
2062 SetPageError(page); 2292 SetPageError(page);
2063 } 2293 }
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2226 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2456 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2227 __extent_writepage, &epd); 2457 __extent_writepage, &epd);
2228 if (epd.bio) { 2458 if (epd.bio) {
2229 submit_one_bio(WRITE, epd.bio, 0); 2459 submit_one_bio(WRITE, epd.bio, 0, 0);
2230 } 2460 }
2231 return ret; 2461 return ret;
2232} 2462}
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
2248 ret = extent_write_cache_pages(tree, mapping, wbc, 2478 ret = extent_write_cache_pages(tree, mapping, wbc,
2249 __extent_writepage, &epd); 2479 __extent_writepage, &epd);
2250 if (epd.bio) { 2480 if (epd.bio) {
2251 submit_one_bio(WRITE, epd.bio, 0); 2481 submit_one_bio(WRITE, epd.bio, 0, 0);
2252 } 2482 }
2253 return ret; 2483 return ret;
2254} 2484}
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
2262 struct bio *bio = NULL; 2492 struct bio *bio = NULL;
2263 unsigned page_idx; 2493 unsigned page_idx;
2264 struct pagevec pvec; 2494 struct pagevec pvec;
2495 unsigned long bio_flags = 0;
2265 2496
2266 pagevec_init(&pvec, 0); 2497 pagevec_init(&pvec, 0);
2267 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2498 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
2281 if (!pagevec_add(&pvec, page)) 2512 if (!pagevec_add(&pvec, page))
2282 __pagevec_lru_add(&pvec); 2513 __pagevec_lru_add(&pvec);
2283 __extent_read_full_page(tree, page, get_extent, 2514 __extent_read_full_page(tree, page, get_extent,
2284 &bio, 0); 2515 &bio, 0, &bio_flags);
2285 } 2516 }
2286 page_cache_release(page); 2517 page_cache_release(page);
2287 } 2518 }
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
2289 __pagevec_lru_add(&pvec); 2520 __pagevec_lru_add(&pvec);
2290 BUG_ON(!list_empty(pages)); 2521 BUG_ON(!list_empty(pages));
2291 if (bio) 2522 if (bio)
2292 submit_one_bio(READ, bio, 0); 2523 submit_one_bio(READ, bio, 0, bio_flags);
2293 return 0; 2524 return 0;
2294} 2525}
2295EXPORT_SYMBOL(extent_readpages); 2526EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
2414 ret = submit_extent_page(READ, tree, page, 2645 ret = submit_extent_page(READ, tree, page,
2415 sector, iosize, page_offset, em->bdev, 2646 sector, iosize, page_offset, em->bdev,
2416 NULL, 1, 2647 NULL, 1,
2417 end_bio_extent_preparewrite, 0); 2648 end_bio_extent_preparewrite, 0,
2649 0, 0);
2418 iocount++; 2650 iocount++;
2419 block_start = block_start + iosize; 2651 block_start = block_start + iosize;
2420 } else { 2652 } else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2495 } 2727 }
2496 if (!test_range_bit(tree, em->start, 2728 if (!test_range_bit(tree, em->start,
2497 extent_map_end(em) - 1, 2729 extent_map_end(em) - 1,
2498 EXTENT_LOCKED, 0)) { 2730 EXTENT_LOCKED | EXTENT_WRITEBACK |
2731 EXTENT_ORDERED,
2732 0)) {
2499 remove_extent_mapping(map, em); 2733 remove_extent_mapping(map, em);
2500 /* once for the rb tree */ 2734 /* once for the rb tree */
2501 free_extent_map(em); 2735 free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2923 int inc_all_pages = 0; 3157 int inc_all_pages = 0;
2924 unsigned long num_pages; 3158 unsigned long num_pages;
2925 struct bio *bio = NULL; 3159 struct bio *bio = NULL;
3160 unsigned long bio_flags = 0;
2926 3161
2927 if (eb->flags & EXTENT_UPTODATE) 3162 if (eb->flags & EXTENT_UPTODATE)
2928 return 0; 3163 return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2973 ClearPageError(page); 3208 ClearPageError(page);
2974 err = __extent_read_full_page(tree, page, 3209 err = __extent_read_full_page(tree, page,
2975 get_extent, &bio, 3210 get_extent, &bio,
2976 mirror_num); 3211 mirror_num, &bio_flags);
2977 if (err) { 3212 if (err) {
2978 ret = err; 3213 ret = err;
2979 printk("err %d from __extent_read_full_page\n", ret); 3214 printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2984 } 3219 }
2985 3220
2986 if (bio) 3221 if (bio)
2987 submit_one_bio(READ, bio, mirror_num); 3222 submit_one_bio(READ, bio, mirror_num, bio_flags);
2988 3223
2989 if (ret || !wait) { 3224 if (ret || !wait) {
2990 if (ret) 3225 if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae3..86f859b87a6e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
18#define EXTENT_BOUNDARY (1 << 11) 18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
20 20
21/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1
23
21/* 24/*
22 * page->private values. Every page that is controlled by the extent 25 * page->private values. Every page that is controlled by the extent
23 * map has page->private set to one. 26 * map has page->private set to one.
@@ -28,14 +31,17 @@
28struct extent_state; 31struct extent_state;
29 32
30typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 33typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
31 struct bio *bio, int mirror_num); 34 struct bio *bio, int mirror_num,
35 unsigned long bio_flags);
32struct extent_io_ops { 36struct extent_io_ops {
33 int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); 37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
38 u64 start, u64 end, int *page_started);
34 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 39 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
35 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 40 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
36 extent_submit_bio_hook_t *submit_bio_hook; 41 extent_submit_bio_hook_t *submit_bio_hook;
37 int (*merge_bio_hook)(struct page *page, unsigned long offset, 42 int (*merge_bio_hook)(struct page *page, unsigned long offset,
38 size_t size, struct bio *bio); 43 size_t size, struct bio *bio,
44 unsigned long bio_flags);
39 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 45 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
40 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 46 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
41 u64 start, u64 end, 47 u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
245int release_extent_buffer_tail_pages(struct extent_buffer *eb); 251int release_extent_buffer_tail_pages(struct extent_buffer *eb);
246int extent_range_uptodate(struct extent_io_tree *tree, 252int extent_range_uptodate(struct extent_io_tree *tree,
247 u64 start, u64 end); 253 u64 start, u64 end);
254int extent_clear_unlock_delalloc(struct inode *inode,
255 struct extent_io_tree *tree,
256 u64 start, u64 end, struct page *locked_page,
257 int clear_dirty, int set_writeback,
258 int clear_writeback);
248#endif 259#endif
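For reference, the widened hooks plug into the same ops table as before. A sketch of a btrfs-style initializer using the functions touched later in this patch (field order and the subset shown are illustrative):

static struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc		= run_delalloc_range,
	.submit_bio_hook	= btrfs_submit_bio_hook,
	.merge_bio_hook		= btrfs_merge_bio_hook,
	/* remaining hooks are unchanged by this patch */
};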
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d3..fd3ebfb8c3c5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) 184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0; 185 return 0;
186 186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
187 if (extent_map_end(prev) == next->start && 194 if (extent_map_end(prev) == next->start &&
188 prev->flags == next->flags && 195 prev->flags == next->flags &&
189 prev->bdev == next->bdev && 196 prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
239 if (rb && mergable_maps(merge, em)) { 246 if (rb && mergable_maps(merge, em)) {
240 em->start = merge->start; 247 em->start = merge->start;
241 em->len += merge->len; 248 em->len += merge->len;
249 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 250 em->block_start = merge->block_start;
243 merge->in_tree = 0; 251 merge->in_tree = 0;
244 rb_erase(&merge->rb_node, &tree->map); 252 rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
250 merge = rb_entry(rb, struct extent_map, rb_node); 258 merge = rb_entry(rb, struct extent_map, rb_node);
251 if (rb && mergable_maps(em, merge)) { 259 if (rb && mergable_maps(em, merge)) {
252 em->len += merge->len; 260 em->len += merge->len;
261 em->block_len += merge->len;
253 rb_erase(&merge->rb_node, &tree->map); 262 rb_erase(&merge->rb_node, &tree->map);
254 merge->in_tree = 0; 263 merge->in_tree = 0;
255 free_extent_map(merge); 264 free_extent_map(merge);
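Note the asymmetry above: the backward merge adds merge->block_len while the forward merge adds merge->len. Since mergable_maps() now rejects compressed extents, the uncompressed maps that reach this code have matching logical and on-disk lengths, making the two interchangeable. Stated as a sketch (not in the patch, and assuming that invariant holds for every map the merge code sees):

static void assert_mergeable(struct extent_map *em)
{
	BUG_ON(test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
	BUG_ON(em->block_len != em->len);	/* len == block_len here */
}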
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b268..abbcbeb28c79 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
10 10
11/* bits for the flags field */ 11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ 12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
13 14
14struct extent_map { 15struct extent_map {
15 struct rb_node rb_node; 16 struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
18 u64 start; 19 u64 start;
19 u64 len; 20 u64 len;
20 u64 block_start; 21 u64 block_start;
22 u64 block_len;
21 unsigned long flags; 23 unsigned long flags;
22 struct block_device *bdev; 24 struct block_device *bdev;
23 atomic_t refs; 25 atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
38 40
39static inline u64 extent_map_block_end(struct extent_map *em) 41static inline u64 extent_map_block_end(struct extent_map *em)
40{ 42{
41 if (em->block_start + em->len < em->block_start) 43 if (em->block_start + em->block_len < em->block_start)
42 return (u64)-1; 44 return (u64)-1;
43 return em->block_start + em->len; 45 return em->block_start + em->block_len;
44} 46}
45 47
46void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); 48void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
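With compression, em->len (logical bytes in the file) and em->block_len (bytes on disk) diverge, which is why extent_map_block_end() must switch to block_len. A worked example with hypothetical numbers:

/* a 128K logical extent compressed into 20K of disk blocks */
struct extent_map em = {
	.start       = 1048576,		/* file offset 1MB      */
	.len         = 131072,		/* 128K of file data    */
	.block_start = 8388608,		/* disk byte offset 8MB */
	.block_len   = 20480,		/* 20K on disk          */
};

/*
 * extent_map_end(&em)       == 1048576 + 131072 (logical range)
 * extent_map_block_end(&em) == 8388608 + 20480; the old em->len
 * based math would have claimed 128K of disk blocks were in use.
 */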
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d4..f4d3fa71bc41 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, 31 struct btrfs_root *root,
32 u64 objectid, u64 pos, 32 u64 objectid, u64 pos,
33 u64 disk_offset, u64 disk_num_bytes, 33 u64 disk_offset, u64 disk_num_bytes,
34 u64 num_bytes, u64 offset) 34 u64 num_bytes, u64 offset, u64 ram_bytes,
35 u8 compression, u8 encryption, u16 other_encoding)
35{ 36{
36 int ret = 0; 37 int ret = 0;
37 struct btrfs_file_extent_item *item; 38 struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
57 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); 58 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
58 btrfs_set_file_extent_offset(leaf, item, offset); 59 btrfs_set_file_extent_offset(leaf, item, offset);
59 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); 60 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
61 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
60 btrfs_set_file_extent_generation(leaf, item, trans->transid); 62 btrfs_set_file_extent_generation(leaf, item, trans->transid);
61 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 63 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
64 btrfs_set_file_extent_compression(leaf, item, compression);
65 btrfs_set_file_extent_encryption(leaf, item, encryption);
66 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
67
62 btrfs_mark_buffer_dirty(leaf); 68 btrfs_mark_buffer_dirty(leaf);
63out: 69out:
64 btrfs_free_path(path); 70 btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
213 return 0; 219 return 0;
214} 220}
215 221
222int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
223 u64 start, unsigned long len)
224{
225 struct btrfs_ordered_sum *sums;
226 struct btrfs_sector_sum *sector_sum;
227 struct btrfs_ordered_extent *ordered;
228 char *data;
229 struct page *page;
230 unsigned long total_bytes = 0;
231 unsigned long this_sum_bytes = 0;
232
233 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
234 if (!sums)
235 return -ENOMEM;
236
237 sector_sum = sums->sums;
238 sums->file_offset = start;
239 sums->len = len;
240 INIT_LIST_HEAD(&sums->list);
241 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
242 BUG_ON(!ordered);
243
244 while(len > 0) {
245 if (start >= ordered->file_offset + ordered->len ||
246 start < ordered->file_offset) {
247 sums->len = this_sum_bytes;
248 this_sum_bytes = 0;
249 btrfs_add_ordered_sum(inode, ordered, sums);
250 btrfs_put_ordered_extent(ordered);
251
252 sums = kzalloc(btrfs_ordered_sum_size(root, len),
253 GFP_NOFS);
254 BUG_ON(!sums);
255 sector_sum = sums->sums;
256 sums->len = len;
257 sums->file_offset = start;
258 ordered = btrfs_lookup_ordered_extent(inode,
259 sums->file_offset);
260 BUG_ON(!ordered);
261 }
262
263 page = find_get_page(inode->i_mapping,
264 start >> PAGE_CACHE_SHIFT);
265
266 data = kmap_atomic(page, KM_USER0);
267 sector_sum->sum = ~(u32)0;
268 sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
269 PAGE_CACHE_SIZE);
270 kunmap_atomic(data, KM_USER0);
271 btrfs_csum_final(sector_sum->sum,
272 (char *)&sector_sum->sum);
273 sector_sum->offset = page_offset(page);
274 page_cache_release(page);
275
276 sector_sum++;
277 total_bytes += PAGE_CACHE_SIZE;
278 this_sum_bytes += PAGE_CACHE_SIZE;
279 start += PAGE_CACHE_SIZE;
280
281 WARN_ON(len < PAGE_CACHE_SIZE);
282 len -= PAGE_CACHE_SIZE;
283 }
284 btrfs_add_ordered_sum(inode, ordered, sums);
285 btrfs_put_ordered_extent(ordered);
286 return 0;
287}
288
216int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 289int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
217 struct bio *bio) 290 struct bio *bio)
218{ 291{
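btrfs_csum_file_bytes() above walks the written range a page at a time and attaches the sums to the ordered extent covering each offset. The per-page step, pulled out as a sketch that reuses only the calls shown above:

/*
 * Sketch: checksum one page cache page the way the loop above does,
 * seeding with ~0 and folding the result with btrfs_csum_final().
 */
static u32 csum_one_page(struct btrfs_root *root, struct page *page)
{
	char *data;
	u32 sum = ~(u32)0;

	data = kmap_atomic(page, KM_USER0);
	sum = btrfs_csum_data(root, data, sum, PAGE_CACHE_SIZE);
	kunmap_atomic(data, KM_USER0);
	btrfs_csum_final(sum, (char *)&sum);
	return sum;
}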
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add2..0aa15436590e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
95 } 95 }
96} 96}
97 97
98/* this does all the hard work for inserting an inline extent into
99 * the btree. Any existing inline extent is extended as required to make room,
100 * otherwise things are inserted as required into the btree
101 */
102static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root, struct inode *inode,
104 u64 offset, size_t size,
105 struct page **pages, size_t page_offset,
106 int num_pages)
107{
108 struct btrfs_key key;
109 struct btrfs_path *path;
110 struct extent_buffer *leaf;
111 char *kaddr;
112 unsigned long ptr;
113 struct btrfs_file_extent_item *ei;
114 struct page *page;
115 u32 datasize;
116 int err = 0;
117 int ret;
118 int i;
119 ssize_t cur_size;
120
121 path = btrfs_alloc_path();
122 if (!path)
123 return -ENOMEM;
124
125 btrfs_set_trans_block_group(trans, inode);
126
127 key.objectid = inode->i_ino;
128 key.offset = offset;
129 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
130
131 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
132 if (ret < 0) {
133 err = ret;
134 goto fail;
135 }
136 if (ret == 1) {
137 struct btrfs_key found_key;
138
139 if (path->slots[0] == 0)
140 goto insert;
141
142 path->slots[0]--;
143 leaf = path->nodes[0];
144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
145
146 if (found_key.objectid != inode->i_ino)
147 goto insert;
148
149 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
150 goto insert;
151 ei = btrfs_item_ptr(leaf, path->slots[0],
152 struct btrfs_file_extent_item);
153
154 if (btrfs_file_extent_type(leaf, ei) !=
155 BTRFS_FILE_EXTENT_INLINE) {
156 goto insert;
157 }
158 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
159 ret = 0;
160 }
161 if (ret == 0) {
162 u32 found_size;
163 u64 found_end;
164
165 leaf = path->nodes[0];
166 ei = btrfs_item_ptr(leaf, path->slots[0],
167 struct btrfs_file_extent_item);
168
169 if (btrfs_file_extent_type(leaf, ei) !=
170 BTRFS_FILE_EXTENT_INLINE) {
171 err = ret;
172 btrfs_print_leaf(root, leaf);
173 printk("found wasn't inline offset %Lu inode %lu\n",
174 offset, inode->i_ino);
175 goto fail;
176 }
177 found_size = btrfs_file_extent_inline_len(leaf,
178 btrfs_item_nr(leaf, path->slots[0]));
179 found_end = key.offset + found_size;
180
181 if (found_end < offset + size) {
182 btrfs_release_path(root, path);
183 ret = btrfs_search_slot(trans, root, &key, path,
184 offset + size - found_end, 1);
185 BUG_ON(ret != 0);
186
187 ret = btrfs_extend_item(trans, root, path,
188 offset + size - found_end);
189 if (ret) {
190 err = ret;
191 goto fail;
192 }
193 leaf = path->nodes[0];
194 ei = btrfs_item_ptr(leaf, path->slots[0],
195 struct btrfs_file_extent_item);
196 inode_add_bytes(inode, offset + size - found_end);
197 }
198 if (found_end < offset) {
199 ptr = btrfs_file_extent_inline_start(ei) + found_size;
200 memset_extent_buffer(leaf, 0, ptr, offset - found_end);
201 }
202 } else {
203insert:
204 btrfs_release_path(root, path);
205 datasize = offset + size - key.offset;
206 inode_add_bytes(inode, datasize);
207 datasize = btrfs_file_extent_calc_inline_size(datasize);
208 ret = btrfs_insert_empty_item(trans, root, path, &key,
209 datasize);
210 if (ret) {
211 err = ret;
212 printk("got bad ret %d\n", ret);
213 goto fail;
214 }
215 leaf = path->nodes[0];
216 ei = btrfs_item_ptr(leaf, path->slots[0],
217 struct btrfs_file_extent_item);
218 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
219 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
220 }
221 ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
222
223 cur_size = size;
224 i = 0;
225 while (size > 0) {
226 page = pages[i];
227 kaddr = kmap_atomic(page, KM_USER0);
228 cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
229 write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
230 kunmap_atomic(kaddr, KM_USER0);
231 page_offset = 0;
232 ptr += cur_size;
233 size -= cur_size;
234 if (i >= num_pages) {
235 printk("i %d num_pages %d\n", i, num_pages);
236 }
237 i++;
238 }
239 btrfs_mark_buffer_dirty(leaf);
240fail:
241 btrfs_free_path(path);
242 return err;
243}
244
245/* 98/*
246 * after copy_from_user, pages need to be dirtied and we need to make 99 * after copy_from_user, pages need to be dirtied and we need to make
247 * sure holes are created between the current EOF and the start of 100 * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
267 u64 start_pos; 120 u64 start_pos;
268 u64 end_of_last_block; 121 u64 end_of_last_block;
269 u64 end_pos = pos + write_bytes; 122 u64 end_pos = pos + write_bytes;
270 u64 inline_size;
271 int did_inline = 0;
272 loff_t isize = i_size_read(inode); 123 loff_t isize = i_size_read(inode);
273 124
274 start_pos = pos & ~((u64)root->sectorsize - 1); 125 start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
314 err = btrfs_insert_file_extent(trans, root, 165 err = btrfs_insert_file_extent(trans, root,
315 inode->i_ino, 166 inode->i_ino,
316 last_pos_in_file, 167 last_pos_in_file,
317 0, 0, hole_size, 0); 168 0, 0, hole_size, 0,
169 hole_size, 0, 0, 0);
318 btrfs_drop_extent_cache(inode, last_pos_in_file, 170 btrfs_drop_extent_cache(inode, last_pos_in_file,
319 last_pos_in_file + hole_size - 1, 0); 171 last_pos_in_file + hole_size - 1, 0);
320 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 172 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
324 goto failed; 176 goto failed;
325 } 177 }
326 178
327 /* 179 /* check for reserved extents on each page, we don't want
328 * either allocate an extent for the new bytes or setup the key 180 * to reset the delalloc bit on things that already have
329 * to show we are doing inline data in the extent 181 * extents reserved.
330 */ 182 */
331 inline_size = end_pos; 183 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
332 if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 184 for (i = 0; i < num_pages; i++) {
333 inline_size > root->fs_info->max_inline || 185 struct page *p = pages[i];
334 (inline_size & (root->sectorsize -1)) == 0 || 186 SetPageUptodate(p);
335 inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { 187 ClearPageChecked(p);
336 /* check for reserved extents on each page, we don't want 188 set_page_dirty(p);
337 * to reset the delalloc bit on things that already have
338 * extents reserved.
339 */
340 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
341 for (i = 0; i < num_pages; i++) {
342 struct page *p = pages[i];
343 SetPageUptodate(p);
344 ClearPageChecked(p);
345 set_page_dirty(p);
346 }
347 } else {
348 u64 aligned_end;
349 /* step one, delete the existing extents in this range */
350 aligned_end = (pos + write_bytes + root->sectorsize - 1) &
351 ~((u64)root->sectorsize - 1);
352 mutex_lock(&BTRFS_I(inode)->extent_mutex);
353 err = btrfs_drop_extents(trans, root, inode, start_pos,
354 aligned_end, aligned_end, &hint_byte);
355 if (err)
356 goto failed;
357 if (isize > inline_size)
358 inline_size = min_t(u64, isize, aligned_end);
359 inline_size -= start_pos;
360 err = insert_inline_extent(trans, root, inode, start_pos,
361 inline_size, pages, 0, num_pages);
362 btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
363 BUG_ON(err);
364 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
365
366 /*
367 * an ugly way to do all the prop accounting around
368 * the page bits and mapping tags
369 */
370 set_page_writeback(pages[0]);
371 end_page_writeback(pages[0]);
372 did_inline = 1;
373 } 189 }
374 if (end_pos > isize) { 190 if (end_pos > isize) {
375 i_size_write(inode, end_pos); 191 i_size_write(inode, end_pos);
376 if (did_inline)
377 BTRFS_I(inode)->disk_i_size = end_pos;
378 btrfs_update_inode(trans, root, inode); 192 btrfs_update_inode(trans, root, inode);
379 } 193 }
380failed: 194failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
399 int ret; 213 int ret;
400 int testend = 1; 214 int testend = 1;
401 unsigned long flags; 215 unsigned long flags;
216 int compressed = 0;
402 217
403 WARN_ON(end < start); 218 WARN_ON(end < start);
404 if (end == (u64)-1) { 219 if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
434 free_extent_map(em); 249 free_extent_map(em);
435 continue; 250 continue;
436 } 251 }
252 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
437 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 253 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
438 remove_extent_mapping(em_tree, em); 254 remove_extent_mapping(em_tree, em);
439 255
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
442 split->start = em->start; 258 split->start = em->start;
443 split->len = start - em->start; 259 split->len = start - em->start;
444 split->block_start = em->block_start; 260 split->block_start = em->block_start;
261
262 if (compressed)
263 split->block_len = em->block_len;
264 else
265 split->block_len = split->len;
266
445 split->bdev = em->bdev; 267 split->bdev = em->bdev;
446 split->flags = flags; 268 split->flags = flags;
447 ret = add_extent_mapping(em_tree, split); 269 ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
459 split->bdev = em->bdev; 281 split->bdev = em->bdev;
460 split->flags = flags; 282 split->flags = flags;
461 283
462 split->block_start = em->block_start + diff; 284 if (compressed) {
285 split->block_len = em->block_len;
286 split->block_start = em->block_start;
287 } else {
288 split->block_len = split->len;
289 split->block_start = em->block_start + diff;
290 }
463 291
464 ret = add_extent_mapping(em_tree, split); 292 ret = add_extent_mapping(em_tree, split);
465 BUG_ON(ret); 293 BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
533 struct btrfs_item *item; 361 struct btrfs_item *item;
534 item = btrfs_item_nr(leaf, slot); 362 item = btrfs_item_nr(leaf, slot);
535 extent_end = found_key.offset + 363 extent_end = found_key.offset +
536 btrfs_file_extent_inline_len(leaf, item); 364 btrfs_file_extent_inline_len(leaf, extent);
537 extent_end = (extent_end + root->sectorsize - 1) & 365 extent_end = (extent_end + root->sectorsize - 1) &
538 ~((u64)root->sectorsize -1 ); 366 ~((u64)root->sectorsize -1 );
539 } 367 }
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
573 u64 extent_end = 0; 401 u64 extent_end = 0;
574 u64 search_start = start; 402 u64 search_start = start;
575 u64 leaf_start; 403 u64 leaf_start;
404 u64 ram_bytes = 0;
405 u8 compression = 0;
406 u8 encryption = 0;
407 u16 other_encoding = 0;
576 u64 root_gen; 408 u64 root_gen;
577 u64 root_owner; 409 u64 root_owner;
578 struct extent_buffer *leaf; 410 struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
589 int recow; 421 int recow;
590 int ret; 422 int ret;
591 423
424 inline_limit = 0;
592 btrfs_drop_extent_cache(inode, start, end - 1, 0); 425 btrfs_drop_extent_cache(inode, start, end - 1, 0);
593 426
594 path = btrfs_alloc_path(); 427 path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
637 extent = btrfs_item_ptr(leaf, slot, 470 extent = btrfs_item_ptr(leaf, slot,
638 struct btrfs_file_extent_item); 471 struct btrfs_file_extent_item);
639 found_type = btrfs_file_extent_type(leaf, extent); 472 found_type = btrfs_file_extent_type(leaf, extent);
473 compression = btrfs_file_extent_compression(leaf,
474 extent);
475 encryption = btrfs_file_extent_encryption(leaf,
476 extent);
477 other_encoding = btrfs_file_extent_other_encoding(leaf,
478 extent);
640 if (found_type == BTRFS_FILE_EXTENT_REG) { 479 if (found_type == BTRFS_FILE_EXTENT_REG) {
641 extent_end = 480 extent_end =
642 btrfs_file_extent_disk_bytenr(leaf, 481 btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
646 485
647 extent_end = key.offset + 486 extent_end = key.offset +
648 btrfs_file_extent_num_bytes(leaf, extent); 487 btrfs_file_extent_num_bytes(leaf, extent);
488 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
489 extent);
649 found_extent = 1; 490 found_extent = 1;
650 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 491 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
651 struct btrfs_item *item;
652 item = btrfs_item_nr(leaf, slot);
653 found_inline = 1; 492 found_inline = 1;
654 extent_end = key.offset + 493 extent_end = key.offset +
655 btrfs_file_extent_inline_len(leaf, item); 494 btrfs_file_extent_inline_len(leaf, extent);
656 } 495 }
657 } else { 496 } else {
658 extent_end = search_start; 497 extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
680 search_start = (extent_end + mask) & ~mask; 519 search_start = (extent_end + mask) & ~mask;
681 } else 520 } else
682 search_start = extent_end; 521 search_start = extent_end;
683 if (end <= extent_end && start >= key.offset && found_inline) { 522
523 if (end <= extent_end && start >= key.offset && found_inline)
684 *hint_byte = EXTENT_MAP_INLINE; 524 *hint_byte = EXTENT_MAP_INLINE;
685 goto out;
686 }
687 525
688 if (found_extent) { 526 if (found_extent) {
689 read_extent_buffer(leaf, &old, (unsigned long)extent, 527 read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
770 write_extent_buffer(leaf, &old, 608 write_extent_buffer(leaf, &old,
771 (unsigned long)extent, sizeof(old)); 609 (unsigned long)extent, sizeof(old));
772 610
611 btrfs_set_file_extent_compression(leaf, extent,
612 compression);
613 btrfs_set_file_extent_encryption(leaf, extent,
614 encryption);
615 btrfs_set_file_extent_other_encoding(leaf, extent,
616 other_encoding);
773 btrfs_set_file_extent_offset(leaf, extent, 617 btrfs_set_file_extent_offset(leaf, extent,
774 le64_to_cpu(old.offset) + end - key.offset); 618 le64_to_cpu(old.offset) + end - key.offset);
775 WARN_ON(le64_to_cpu(old.num_bytes) < 619 WARN_ON(le64_to_cpu(old.num_bytes) <
776 (extent_end - end)); 620 (extent_end - end));
777 btrfs_set_file_extent_num_bytes(leaf, extent, 621 btrfs_set_file_extent_num_bytes(leaf, extent,
778 extent_end - end); 622 extent_end - end);
623
624 /*
625 * set the ram bytes to the size of the full extent
626 * before splitting. This is a worst case flag,
627 * but it's the best we can do because we don't know
628 * how splitting affects compression
629 */
630 btrfs_set_file_extent_ram_bytes(leaf, extent,
631 ram_bytes);
779 btrfs_set_file_extent_type(leaf, extent, 632 btrfs_set_file_extent_type(leaf, extent,
780 BTRFS_FILE_EXTENT_REG); 633 BTRFS_FILE_EXTENT_REG);
781 634
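The worst-case ram_bytes rule is easiest to see with numbers (all hypothetical): a 128K extent compressed to 32K on disk and split at 64K leaves two items pointing at the same 32K of disk blocks, and each keeps ram_bytes = 128K because reading either half still means inflating the whole compressed blob:

	before: offset=0    num_bytes=128K  ram_bytes=128K  disk_num_bytes=32K
	after:  offset=0    num_bytes=64K   ram_bytes=128K  disk_num_bytes=32K
	        offset=64K  num_bytes=64K   ram_bytes=128K  disk_num_bytes=32K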
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d6..9797592dc86b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
49#include "compat.h" 49#include "compat.h"
50#include "tree-log.h" 50#include "tree-log.h"
51#include "ref-cache.h" 51#include "ref-cache.h"
52#include "compression.h"
52 53
53struct btrfs_iget_args { 54struct btrfs_iget_args {
54 u64 ino; 55 u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
83}; 84};
84 85
85static void btrfs_truncate(struct inode *inode); 86static void btrfs_truncate(struct inode *inode);
87static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
86 88
87/* 89/*
88 * a very lame attempt at stopping writes when the FS is 85% full. There 90 * a very lame attempt at stopping writes when the FS is 85% full. There
@@ -114,57 +116,374 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
114} 116}
115 117
116/* 118/*
119 * this does all the hard work for inserting an inline extent into
120 * the btree. The caller should have done a btrfs_drop_extents so that
121 * no overlapping inline items exist in the btree
122 */
123static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
124 struct btrfs_root *root, struct inode *inode,
125 u64 start, size_t size, size_t compressed_size,
126 struct page **compressed_pages)
127{
128 struct btrfs_key key;
129 struct btrfs_path *path;
130 struct extent_buffer *leaf;
131 struct page *page = NULL;
132 char *kaddr;
133 unsigned long ptr;
134 struct btrfs_file_extent_item *ei;
135 int err = 0;
136 int ret;
137 size_t cur_size = size;
138 size_t datasize;
139 unsigned long offset;
140 int use_compress = 0;
141
142 if (compressed_size && compressed_pages) {
143 use_compress = 1;
144 cur_size = compressed_size;
145 }
146
147 path = btrfs_alloc_path();
148 if (!path)
149 return -ENOMEM;
149
150 btrfs_set_trans_block_group(trans, inode);
151
152 key.objectid = inode->i_ino;
153 key.offset = start;
154 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
155 inode_add_bytes(inode, size);
156 datasize = btrfs_file_extent_calc_inline_size(cur_size);
157
159 ret = btrfs_insert_empty_item(trans, root, path, &key,
160 datasize);
162 if (ret) {
163 err = ret;
164 printk("got bad ret %d\n", ret);
165 goto fail;
166 }
167 leaf = path->nodes[0];
168 ei = btrfs_item_ptr(leaf, path->slots[0],
169 struct btrfs_file_extent_item);
170 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
171 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
172 btrfs_set_file_extent_encryption(leaf, ei, 0);
173 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
174 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
175 ptr = btrfs_file_extent_inline_start(ei);
176
177 if (use_compress) {
178 struct page *cpage;
179 int i = 0;
180 while(compressed_size > 0) {
181 cpage = compressed_pages[i];
182 cur_size = min(compressed_size,
183 PAGE_CACHE_SIZE);
184
185 kaddr = kmap(cpage);
186 write_extent_buffer(leaf, kaddr, ptr, cur_size);
187 kunmap(cpage);
188
189 i++;
190 ptr += cur_size;
191 compressed_size -= cur_size;
192 }
193 btrfs_set_file_extent_compression(leaf, ei,
194 BTRFS_COMPRESS_ZLIB);
195 } else {
196 page = find_get_page(inode->i_mapping,
197 start >> PAGE_CACHE_SHIFT);
198 btrfs_set_file_extent_compression(leaf, ei, 0);
199 kaddr = kmap_atomic(page, KM_USER0);
200 offset = start & (PAGE_CACHE_SIZE - 1);
201 write_extent_buffer(leaf, kaddr + offset, ptr, size);
202 kunmap_atomic(kaddr, KM_USER0);
203 page_cache_release(page);
204 }
205 btrfs_mark_buffer_dirty(leaf);
206 btrfs_free_path(path);
207
208 BTRFS_I(inode)->disk_i_size = inode->i_size;
209 btrfs_update_inode(trans, root, inode);
210 return 0;
211fail:
212 btrfs_free_path(path);
213 return err;
214}
215
216
217/*
218 * conditionally insert an inline extent into the file. This
219 * does the checks required to make sure the data is small enough
220 * to fit as an inline extent.
221 */
222static int cow_file_range_inline(struct btrfs_trans_handle *trans,
223 struct btrfs_root *root,
224 struct inode *inode, u64 start, u64 end,
225 size_t compressed_size,
226 struct page **compressed_pages)
227{
228 u64 isize = i_size_read(inode);
229 u64 actual_end = min(end + 1, isize);
230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len;
235 int ret;
236
237 if (compressed_size)
238 data_len = compressed_size;
239
240 if (start > 0 ||
241 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
242 (!compressed_size &&
243 (actual_end & (root->sectorsize - 1)) == 0) ||
244 end + 1 < isize ||
245 data_len > root->fs_info->max_inline) {
246 return 1;
247 }
248
249 mutex_lock(&BTRFS_I(inode)->extent_mutex);
250 ret = btrfs_drop_extents(trans, root, inode, start,
251 aligned_end, aligned_end, &hint_byte);
252 BUG_ON(ret);
253
254 if (isize > actual_end)
255 inline_len = min_t(u64, isize, actual_end);
256 ret = insert_inline_extent(trans, root, inode, start,
257 inline_len, compressed_size,
258 compressed_pages);
259 BUG_ON(ret);
260 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
261 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
262 return 0;
263}
264
265/*
117 * when extent_io.c finds a delayed allocation range in the file, 266 * when extent_io.c finds a delayed allocation range in the file,
118 * the call backs end up in this code. The basic idea is to 267 * the call backs end up in this code. The basic idea is to
119 * allocate extents on disk for the range, and create ordered data structs 268 * allocate extents on disk for the range, and create ordered data structs
120 * in ram to track those extents. 269 * in ram to track those extents.
270 *
271 * locked_page is the page that writepage had locked already. We use
272 * it to make sure we don't do extra locks or unlocks.
273 *
274 * *page_started is set to one if we unlock locked_page and do everything
275 * required to start IO on it. It may be clean and already done with
276 * IO when we return.
121 */ 277 */
122static int cow_file_range(struct inode *inode, u64 start, u64 end) 278static int cow_file_range(struct inode *inode, struct page *locked_page,
279 u64 start, u64 end, int *page_started)
123{ 280{
124 struct btrfs_root *root = BTRFS_I(inode)->root; 281 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct btrfs_trans_handle *trans; 282 struct btrfs_trans_handle *trans;
126 u64 alloc_hint = 0; 283 u64 alloc_hint = 0;
127 u64 num_bytes; 284 u64 num_bytes;
285 unsigned long ram_size;
286 u64 orig_start;
287 u64 disk_num_bytes;
128 u64 cur_alloc_size; 288 u64 cur_alloc_size;
129 u64 blocksize = root->sectorsize; 289 u64 blocksize = root->sectorsize;
130 u64 orig_num_bytes; 290 u64 actual_end;
131 struct btrfs_key ins; 291 struct btrfs_key ins;
132 struct extent_map *em; 292 struct extent_map *em;
133 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 293 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
134 int ret = 0; 294 int ret = 0;
295 struct page **pages = NULL;
296 unsigned long nr_pages;
297 unsigned long nr_pages_ret = 0;
298 unsigned long total_compressed = 0;
299 unsigned long total_in = 0;
300 unsigned long max_compressed = 128 * 1024;
301 unsigned long max_uncompressed = 256 * 1024;
302 int i;
303 int will_compress;
135 304
136 trans = btrfs_join_transaction(root, 1); 305 trans = btrfs_join_transaction(root, 1);
137 BUG_ON(!trans); 306 BUG_ON(!trans);
138 btrfs_set_trans_block_group(trans, inode); 307 btrfs_set_trans_block_group(trans, inode);
308 orig_start = start;
309
310 /*
311 * compression made this loop a bit ugly, but the basic idea is to
312 * compress some pages but keep the total size of the compressed
313 * extent relatively small. If compression is off, this goto target
314 * is never used.
315 */
316again:
317 will_compress = 0;
318 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
319 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
139 320
321 actual_end = min_t(u64, i_size_read(inode), end + 1);
322 total_compressed = actual_end - start;
323
324 /* we want to make sure the amount of ram required to uncompress
325 * an extent is reasonable, so we limit the total size in ram
326 * of a compressed extent to 256k
327 */
328 total_compressed = min(total_compressed, max_uncompressed);
140 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 329 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
141 num_bytes = max(blocksize, num_bytes); 330 num_bytes = max(blocksize, num_bytes);
142 orig_num_bytes = num_bytes; 331 disk_num_bytes = num_bytes;
332 total_in = 0;
333 ret = 0;
143 334
144 if (alloc_hint == EXTENT_MAP_INLINE) 335 /* we do compression for mount -o compress and when the
145 goto out; 336 * inode has not been flagged as nocompress
337 */
338 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
339 btrfs_test_opt(root, COMPRESS)) {
340 WARN_ON(pages);
341 pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
342
343 /* we want to make sure the amount of IO required to satisfy
344 * a random read is reasonably small, so we limit the size
345 * of a compressed extent to 128k
346 */
347 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
348 total_compressed, pages,
349 nr_pages, &nr_pages_ret,
350 &total_in,
351 &total_compressed,
352 max_compressed);
353
354 if (!ret) {
355 unsigned long offset = total_compressed &
356 (PAGE_CACHE_SIZE - 1);
357 struct page *page = pages[nr_pages_ret - 1];
358 char *kaddr;
359
360 /* zero the tail end of the last page, we might be
361 * sending it down to disk
362 */
363 if (offset) {
364 kaddr = kmap_atomic(page, KM_USER0);
365 memset(kaddr + offset, 0,
366 PAGE_CACHE_SIZE - offset);
367 kunmap_atomic(kaddr, KM_USER0);
368 }
369 will_compress = 1;
370 }
371 }
372 if (start == 0) {
373 /* lets try to make an inline extent */
374 if (ret || total_in < (end - start + 1)) {
375 /* we didn't compress the entire range, try
376 * to make an uncompressed inline extent. This
377 * is almost sure to fail, but maybe inline sizes
378 * will get bigger later
379 */
380 ret = cow_file_range_inline(trans, root, inode,
381 start, end, 0, NULL);
382 } else {
383 ret = cow_file_range_inline(trans, root, inode,
384 start, end,
385 total_compressed, pages);
386 }
387 if (ret == 0) {
388 extent_clear_unlock_delalloc(inode,
389 &BTRFS_I(inode)->io_tree,
390 start, end, NULL,
391 1, 1, 1);
392 *page_started = 1;
393 ret = 0;
394 goto free_pages_out;
395 }
396 }
397
398 if (will_compress) {
399 /*
400 * we aren't doing an inline extent round the compressed size
401 * up to a block size boundary so the allocator does sane
402 * things
403 */
404 total_compressed = (total_compressed + blocksize - 1) &
405 ~(blocksize - 1);
406
407 /*
408 * one last check to make sure the compression is really a
409 * win, compare the page count read with the blocks on disk
410 */
411 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
412 ~(PAGE_CACHE_SIZE - 1);
413 if (total_compressed >= total_in) {
414 will_compress = 0;
415 } else {
416 disk_num_bytes = total_compressed;
417 num_bytes = total_in;
418 }
419 }
420 if (!will_compress && pages) {
421 /*
422 * the compression code ran but failed to make things smaller,
423 * free any pages it allocated and our page pointer array
424 */
425 for (i = 0; i < nr_pages_ret; i++) {
426 page_cache_release(pages[i]);
427 }
428 kfree(pages);
429 pages = NULL;
430 total_compressed = 0;
431 nr_pages_ret = 0;
432
433 /* flag the file so we don't compress in the future */
434 btrfs_set_flag(inode, NOCOMPRESS);
435 }
436
437 BUG_ON(disk_num_bytes >
438 btrfs_super_total_bytes(&root->fs_info->super_copy));
146 439
147 BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
148 mutex_lock(&BTRFS_I(inode)->extent_mutex); 440 mutex_lock(&BTRFS_I(inode)->extent_mutex);
149 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 441 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
150 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 442 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
151 443
152 while(num_bytes > 0) { 444 while(disk_num_bytes > 0) {
153 cur_alloc_size = min(num_bytes, root->fs_info->max_extent); 445 unsigned long min_bytes;
446
447 /*
448 * the max size of a compressed extent is pretty small,
449 * make the code a little less complex by forcing
450 * the allocator to find a whole compressed extent at once
451 */
452 if (will_compress)
453 min_bytes = disk_num_bytes;
454 else
455 min_bytes = root->sectorsize;
456
457 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
154 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 458 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
155 root->sectorsize, 0, alloc_hint, 459 min_bytes, 0, alloc_hint,
156 (u64)-1, &ins, 1); 460 (u64)-1, &ins, 1);
157 if (ret) { 461 if (ret) {
158 WARN_ON(1); 462 WARN_ON(1);
159 goto out; 463 goto free_pages_out_fail;
160 } 464 }
161 em = alloc_extent_map(GFP_NOFS); 465 em = alloc_extent_map(GFP_NOFS);
162 em->start = start; 466 em->start = start;
163 em->len = ins.offset; 467
468 if (will_compress) {
469 ram_size = num_bytes;
470 em->len = num_bytes;
471 } else {
472 /* ramsize == disk size */
473 ram_size = ins.offset;
474 em->len = ins.offset;
475 }
476
164 em->block_start = ins.objectid; 477 em->block_start = ins.objectid;
478 em->block_len = ins.offset;
165 em->bdev = root->fs_info->fs_devices->latest_bdev; 479 em->bdev = root->fs_info->fs_devices->latest_bdev;
480
166 mutex_lock(&BTRFS_I(inode)->extent_mutex); 481 mutex_lock(&BTRFS_I(inode)->extent_mutex);
167 set_bit(EXTENT_FLAG_PINNED, &em->flags); 482 set_bit(EXTENT_FLAG_PINNED, &em->flags);
483
484 if (will_compress)
485 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
486
168 while(1) { 487 while(1) {
169 spin_lock(&em_tree->lock); 488 spin_lock(&em_tree->lock);
170 ret = add_extent_mapping(em_tree, em); 489 ret = add_extent_mapping(em_tree, em);
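The eligibility tests in cow_file_range_inline() condense to a small predicate; restated as one function (the name is invented for illustration):

/*
 * Illustrative restatement of the checks in cow_file_range_inline():
 * only a small extent that starts at file offset 0 and runs to i_size
 * may be stored inline.
 */
static int can_inline(struct btrfs_root *root, u64 start, u64 end,
		      u64 isize, u64 actual_end, u64 data_len,
		      int compressed)
{
	if (start > 0)
		return 0;	/* must begin at file offset 0 */
	if (data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root))
		return 0;	/* must fit inside one leaf item */
	if (!compressed && (actual_end & (root->sectorsize - 1)) == 0)
		return 0;	/* full uncompressed sectors gain nothing */
	if (end + 1 < isize)
		return 0;	/* data after this range rules it out */
	if (data_len > root->fs_info->max_inline)
		return 0;	/* honor the max_inline mount option */
	return 1;
}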
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
174 break; 493 break;
175 } 494 }
176 btrfs_drop_extent_cache(inode, start, 495 btrfs_drop_extent_cache(inode, start,
177 start + ins.offset - 1, 0); 496 start + ram_size - 1, 0);
178 } 497 }
179 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 498 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
180 499
181 cur_alloc_size = ins.offset; 500 cur_alloc_size = ins.offset;
182 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 501 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
183 ins.offset, 0); 502 ram_size, cur_alloc_size, 0,
503 will_compress);
184 BUG_ON(ret); 504 BUG_ON(ret);
185 if (num_bytes < cur_alloc_size) { 505
186 printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes, 506 if (disk_num_bytes < cur_alloc_size) {
507 printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
187 cur_alloc_size); 508 cur_alloc_size);
188 break; 509 break;
189 } 510 }
511
512 if (will_compress) {
513 /*
514 * we're doing compression, and we need to
515 * submit the compressed extents down to the device.
516 *
517 * We lock down all the file pages, clearing their
518 * dirty bits and setting them writeback. Everyone
519 * that wants to modify the page will wait on the
520 * ordered extent above.
521 *
522 * The writeback bits on the file pages are
523 * cleared when the compressed pages are on disk
524 */
525 btrfs_end_transaction(trans, root);
526
527 if (start <= page_offset(locked_page) &&
528 page_offset(locked_page) < start + ram_size) {
529 *page_started = 1;
530 }
531
532 extent_clear_unlock_delalloc(inode,
533 &BTRFS_I(inode)->io_tree,
534 start,
535 start + ram_size - 1,
536 NULL, 1, 1, 0);
537
538 ret = btrfs_submit_compressed_write(inode, start,
539 ram_size, ins.objectid,
540 cur_alloc_size, pages,
541 nr_pages_ret);
542
543 BUG_ON(ret);
544 trans = btrfs_join_transaction(root, 1);
545 if (start + ram_size < end) {
546 start += ram_size;
547 alloc_hint = ins.objectid + ins.offset;
548 /* pages will be freed at end_bio time */
549 pages = NULL;
550 goto again;
551 } else {
552 /* we've written everything, time to go */
553 break;
554 }
555 }
556 /* we're not doing compressed IO, don't unlock the first
557 * page (which the caller expects to stay locked), don't
558 * clear any dirty bits and don't set any writeback bits
559 */
560 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
561 start, start + ram_size - 1,
562 locked_page, 0, 0, 0);
563 disk_num_bytes -= cur_alloc_size;
190 num_bytes -= cur_alloc_size; 564 num_bytes -= cur_alloc_size;
191 alloc_hint = ins.objectid + ins.offset; 565 alloc_hint = ins.objectid + ins.offset;
192 start += cur_alloc_size; 566 start += cur_alloc_size;
193 } 567 }
568
569 ret = 0;
194out: 570out:
195 btrfs_end_transaction(trans, root); 571 btrfs_end_transaction(trans, root);
572
196 return ret; 573 return ret;
574
575free_pages_out_fail:
576 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
577 start, end, locked_page, 0, 0, 0);
578free_pages_out:
579 for (i = 0; i < nr_pages_ret; i++)
580 page_cache_release(pages[i]);
581 if (pages)
582 kfree(pages);
583
584 goto out;
197} 585}
198 586
199/* 587/*
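The "one last check" above compares like with like by rounding both sides: compressed bytes up to a disk block, uncompressed bytes up to whole pages. Pulled out as a sketch:

/*
 * Sketch of the win test in cow_file_range(): compression is kept
 * only if the rounded compressed size beats the rounded input size.
 */
static int compression_wins(u64 blocksize, unsigned long total_in,
			    unsigned long total_compressed)
{
	total_compressed = (total_compressed + blocksize - 1) &
			   ~(blocksize - 1);
	total_in = (total_in + PAGE_CACHE_SIZE - 1) &
		   ~(PAGE_CACHE_SIZE - 1);
	return total_compressed < total_in;
}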
@@ -203,7 +591,8 @@ out:
203 * If no cow copies or snapshots exist, we write directly to the existing 591 * If no cow copies or snapshots exist, we write directly to the existing
204 * blocks on disk 592 * blocks on disk
205 */ 593 */
206static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) 594static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
595 u64 start, u64 end, int *page_started)
207{ 596{
208 u64 extent_start; 597 u64 extent_start;
209 u64 extent_end; 598 u64 extent_end;
@@ -260,6 +649,11 @@ again:
260 extent_end = extent_start + extent_num_bytes; 649 extent_end = extent_start + extent_num_bytes;
261 err = 0; 650 err = 0;
262 651
652 if (btrfs_file_extent_compression(leaf, item) ||
653 btrfs_file_extent_encryption(leaf,item) ||
654 btrfs_file_extent_other_encoding(leaf, item))
655 goto not_found;
656
263 if (loops && start != extent_start) 657 if (loops && start != extent_start)
264 goto not_found; 658 goto not_found;
265 659
@@ -284,7 +678,8 @@ again:
284 bytenr += btrfs_file_extent_offset(leaf, item); 678 bytenr += btrfs_file_extent_offset(leaf, item);
285 extent_num_bytes = min(end + 1, extent_end) - start; 679 extent_num_bytes = min(end + 1, extent_end) - start;
286 ret = btrfs_add_ordered_extent(inode, start, bytenr, 680 ret = btrfs_add_ordered_extent(inode, start, bytenr,
287 extent_num_bytes, 1); 681 extent_num_bytes,
682 extent_num_bytes, 1, 0);
288 if (ret) { 683 if (ret) {
289 err = ret; 684 err = ret;
290 goto out; 685 goto out;
@@ -300,7 +695,8 @@ again:
300not_found: 695not_found:
301 btrfs_end_transaction(trans, root); 696 btrfs_end_transaction(trans, root);
302 btrfs_free_path(path); 697 btrfs_free_path(path);
303 return cow_file_range(inode, start, end); 698 return cow_file_range(inode, locked_page, start, end,
699 page_started);
304 } 700 }
305out: 701out:
306 WARN_ON(err); 702 WARN_ON(err);
@@ -312,16 +708,19 @@ out:
312/* 708/*
313 * extent_io.c call back to do delayed allocation processing 709 * extent_io.c call back to do delayed allocation processing
314 */ 710 */
315static int run_delalloc_range(struct inode *inode, u64 start, u64 end) 711static int run_delalloc_range(struct inode *inode, struct page *locked_page,
712 u64 start, u64 end, int *page_started)
316{ 713{
317 struct btrfs_root *root = BTRFS_I(inode)->root; 714 struct btrfs_root *root = BTRFS_I(inode)->root;
318 int ret; 715 int ret;
319 716
320 if (btrfs_test_opt(root, NODATACOW) || 717 if (btrfs_test_opt(root, NODATACOW) ||
321 btrfs_test_flag(inode, NODATACOW)) 718 btrfs_test_flag(inode, NODATACOW))
322 ret = run_delalloc_nocow(inode, start, end); 719 ret = run_delalloc_nocow(inode, locked_page, start, end,
720 page_started);
323 else 721 else
324 ret = cow_file_range(inode, start, end); 722 ret = cow_file_range(inode, locked_page, start, end,
723 page_started);
325 724
326 return ret; 725 return ret;
327} 726}
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
383 * we don't create bios that span stripes or chunks 782 * we don't create bios that span stripes or chunks
384 */ 783 */
385int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 784int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
386 size_t size, struct bio *bio) 785 size_t size, struct bio *bio,
786 unsigned long bio_flags)
387{ 787{
388 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 788 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
389 struct btrfs_mapping_tree *map_tree; 789 struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
413 * are inserted into the btree 813 * are inserted into the btree
414 */ 814 */
415int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 815int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
416 int mirror_num) 816 int mirror_num, unsigned long bio_flags)
417{ 817{
418 struct btrfs_root *root = BTRFS_I(inode)->root; 818 struct btrfs_root *root = BTRFS_I(inode)->root;
419 int ret = 0; 819 int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
429 * or reading the csums from the tree before a read 829 * or reading the csums from the tree before a read
430 */ 830 */
431int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 831int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
432 int mirror_num) 832 int mirror_num, unsigned long bio_flags)
433{ 833{
434 struct btrfs_root *root = BTRFS_I(inode)->root; 834 struct btrfs_root *root = BTRFS_I(inode)->root;
435 int ret = 0; 835 int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
444 844
445 if (!(rw & (1 << BIO_RW))) { 845 if (!(rw & (1 << BIO_RW))) {
446 btrfs_lookup_bio_sums(root, inode, bio); 846 btrfs_lookup_bio_sums(root, inode, bio);
847
848 if (bio_flags & EXTENT_BIO_COMPRESSED) {
849 return btrfs_submit_compressed_read(inode, bio,
850 mirror_num, bio_flags);
851 }
852
447 goto mapit; 853 goto mapit;
448 } 854 }
449 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 855 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
450 inode, rw, bio, mirror_num, 856 inode, rw, bio, mirror_num,
451 __btrfs_submit_bio_hook); 857 bio_flags, __btrfs_submit_bio_hook);
452mapit: 858mapit:
453 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 859 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
454} 860}
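On the read side the routing is now: look up checksums first, then hand compressed bios to btrfs_submit_compressed_read() while everything else goes straight to the device. Condensed into a sketch (the wrapper name is invented; the calls mirror the hunk above):

static int route_read_bio(struct btrfs_root *root, struct inode *inode,
			  int rw, struct bio *bio, int mirror_num,
			  unsigned long bio_flags)
{
	btrfs_lookup_bio_sums(root, inode, bio);
	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}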
@@ -539,7 +945,7 @@ out_page:
539 * good idea. This causes problems because we want to make sure COW 945 * good idea. This causes problems because we want to make sure COW
540 * properly happens and the data=ordered rules are followed. 946 * properly happens and the data=ordered rules are followed.
541 * 947 *
542 * In our case any range that doesn't have the EXTENT_ORDERED bit set 948 * In our case any range that doesn't have the ORDERED bit set
543 * hasn't been properly setup for IO. We kick off an async process 949 * hasn't been properly setup for IO. We kick off an async process
544 * to fix it up. The async helper will wait for ordered extents, set 950 * to fix it up. The async helper will wait for ordered extents, set
545 * the delalloc bit and make it safe to write the page. 951 * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
632 btrfs_set_file_extent_disk_bytenr(leaf, extent_item, 1038 btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
633 ordered_extent->start); 1039 ordered_extent->start);
634 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item, 1040 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
635 ordered_extent->len); 1041 ordered_extent->disk_len);
636 btrfs_set_file_extent_offset(leaf, extent_item, 0); 1042 btrfs_set_file_extent_offset(leaf, extent_item, 0);
1043
1044 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1045 btrfs_set_file_extent_compression(leaf, extent_item, 1);
1046 else
1047 btrfs_set_file_extent_compression(leaf, extent_item, 0);
1048 btrfs_set_file_extent_encryption(leaf, extent_item, 0);
1049 btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
1050
1051 /* ram bytes = extent_num_bytes for now */
637 btrfs_set_file_extent_num_bytes(leaf, extent_item, 1052 btrfs_set_file_extent_num_bytes(leaf, extent_item,
638 ordered_extent->len); 1053 ordered_extent->len);
1054 btrfs_set_file_extent_ram_bytes(leaf, extent_item,
1055 ordered_extent->len);
639 btrfs_mark_buffer_dirty(leaf); 1056 btrfs_mark_buffer_dirty(leaf);
640 1057
641 btrfs_drop_extent_cache(inode, ordered_extent->file_offset, 1058 btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
644 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 1061 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
645 1062
646 ins.objectid = ordered_extent->start; 1063 ins.objectid = ordered_extent->start;
647 ins.offset = ordered_extent->len; 1064 ins.offset = ordered_extent->disk_len;
648 ins.type = BTRFS_EXTENT_ITEM_KEY; 1065 ins.type = BTRFS_EXTENT_ITEM_KEY;
649 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1066 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
650 root->root_key.objectid, 1067 root->root_key.objectid,
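
With compression, an ordered extent tracks two lengths: len is the uncompressed (ram) byte count, stored into both num_bytes and ram_bytes of the file extent item, while disk_len is what was actually allocated and goes into disk_num_bytes (and into ins.offset when the extent is recorded). A hedged illustration, not code from the patch:

/* illustrative only: how the sizes line up once the I/O completes */
static void ordered_extent_sizes(struct btrfs_ordered_extent *oe,
				 u64 *on_disk, u64 *in_ram)
{
	*in_ram = oe->len;	/* uncompressed length of the range */
	*on_disk = oe->disk_len;	/* compressed size when the
					 * BTRFS_ORDERED_COMPRESSED bit is
					 * set, otherwise equal to len */
}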
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
714 int ret; 1131 int ret;
715 int rw; 1132 int rw;
716 u64 logical; 1133 u64 logical;
1134 unsigned long bio_flags = 0;
717 1135
718 ret = get_state_private(failure_tree, start, &private); 1136 ret = get_state_private(failure_tree, start, &private);
719 if (ret) { 1137 if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
738 } 1156 }
739 logical = start - em->start; 1157 logical = start - em->start;
740 logical = em->block_start + logical; 1158 logical = em->block_start + logical;
1159 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1160 bio_flags = EXTENT_BIO_COMPRESSED;
741 failrec->logical = logical; 1161 failrec->logical = logical;
742 free_extent_map(em); 1162 free_extent_map(em);
743 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | 1163 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
781 rw = READ; 1201 rw = READ;
782 1202
783 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1203 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
784 failrec->last_mirror); 1204 failrec->last_mirror,
1205 bio_flags);
785 return 0; 1206 return 0;
786} 1207}
787 1208
@@ -1644,10 +2065,8 @@ search_again:
1644 item_end += 2065 item_end +=
1645 btrfs_file_extent_num_bytes(leaf, fi); 2066 btrfs_file_extent_num_bytes(leaf, fi);
1646 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2067 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1647 struct btrfs_item *item = btrfs_item_nr(leaf,
1648 path->slots[0]);
1649 item_end += btrfs_file_extent_inline_len(leaf, 2068 item_end += btrfs_file_extent_inline_len(leaf,
1650 item); 2069 fi);
1651 } 2070 }
1652 item_end--; 2071 item_end--;
1653 } 2072 }
@@ -1715,7 +2134,14 @@ search_again:
1715 root_owner = btrfs_header_owner(leaf); 2134 root_owner = btrfs_header_owner(leaf);
1716 } 2135 }
1717 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2136 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1718 if (!del_item) { 2137 /*
2138 * we can't truncate inline items that have had
2139 * special encodings
2140 */
2141 if (!del_item &&
2142 btrfs_file_extent_compression(leaf, fi) == 0 &&
2143 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2144 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
1719 u32 size = new_size - found_key.offset; 2145 u32 size = new_size - found_key.offset;
1720 2146
1721 if (root->ref_cows) { 2147 if (root->ref_cows) {
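
The truncate path can shorten an inline extent in place only when the payload is stored verbatim; the three-field test above reads naturally as one predicate (a sketch using the same getters as the patch):

/* sketch: true if the inline payload is raw file data that can be
 * truncated byte-for-byte */
static int inline_extent_is_plain(struct extent_buffer *leaf,
				  struct btrfs_file_extent_item *fi)
{
	return btrfs_file_extent_compression(leaf, fi) == 0 &&
	       btrfs_file_extent_encryption(leaf, fi) == 0 &&
	       btrfs_file_extent_other_encoding(leaf, fi) == 0;
}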
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
1926 err = btrfs_insert_file_extent(trans, root, 2352 err = btrfs_insert_file_extent(trans, root,
1927 inode->i_ino, 2353 inode->i_ino,
1928 hole_start, 0, 0, 2354 hole_start, 0, 0,
1929 hole_size, 0); 2355 hole_size, 0, hole_size,
2356 0, 0, 0);
1930 btrfs_drop_extent_cache(inode, hole_start, 2357 btrfs_drop_extent_cache(inode, hole_start,
1931 (u64)-1, 0); 2358 (u64)-1, 0);
1932 btrfs_check_file(root, inode); 2359 btrfs_check_file(root, inode);
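
btrfs_insert_file_extent grows four trailing arguments in this commit; the widened hole insertion above presumably decodes as follows (parameter names are assumed from the call site, since file-item.c is not shown here):

/* hedged wrapper for the call above; the argument names are guesses */
static int insert_hole_extent(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 objectid,
			      u64 pos, u64 hole_size)
{
	return btrfs_insert_file_extent(trans, root, objectid, pos,
					0,		/* disk_bytenr: hole */
					0,		/* disk_num_bytes */
					hole_size,	/* num_bytes */
					0,		/* offset */
					hole_size,	/* ram_bytes */
					0, 0, 0);	/* compression,
							 * encryption,
							 * other_encoding */
}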
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
2894 start_diff = map_start - em->start; 3321 start_diff = map_start - em->start;
2895 em->start = map_start; 3322 em->start = map_start;
2896 em->len = map_len; 3323 em->len = map_len;
2897 if (em->block_start < EXTENT_MAP_LAST_BYTE) 3324 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3325 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2898 em->block_start += start_diff; 3326 em->block_start += start_diff;
3327 em->block_len -= start_diff;
3328 }
2899 return add_extent_mapping(em_tree, em); 3329 return add_extent_mapping(em_tree, em);
2900} 3330}
2901 3331
3332static noinline int uncompress_inline(struct btrfs_path *path,
3333 struct inode *inode, struct page *page,
3334 size_t pg_offset, u64 extent_offset,
3335 struct btrfs_file_extent_item *item)
3336{
3337 int ret;
3338 struct extent_buffer *leaf = path->nodes[0];
3339 char *tmp;
3340 size_t max_size;
3341 unsigned long inline_size;
3342 unsigned long ptr;
3343
3344 WARN_ON(pg_offset != 0);
3345 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3346 inline_size = btrfs_file_extent_inline_item_len(leaf,
3347 btrfs_item_nr(leaf, path->slots[0]));
3348 tmp = kmalloc(inline_size, GFP_NOFS);
3349 ptr = btrfs_file_extent_inline_start(item);
3350
3351 read_extent_buffer(leaf, tmp, ptr, inline_size);
3352
3353 max_size = min(PAGE_CACHE_SIZE, max_size);
3354 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3355 inline_size, max_size);
3356 if (ret) {
3357 char *kaddr = kmap_atomic(page, KM_USER0);
3358 unsigned long copy_size = min_t(u64,
3359 PAGE_CACHE_SIZE - pg_offset,
3360 max_size - extent_offset);
3361 memset(kaddr + pg_offset, 0, copy_size);
3362 kunmap_atomic(kaddr, KM_USER0);
3363 }
3364 kfree(tmp);
3365 return 0;
3366}
3367
2902/* 3368/*
2903 * a bit scary, this does extent mapping from logical file offset to the disk. 3369 * a bit scary, this does extent mapping from logical file offset to the disk.
2904 * the ugly parts come from merging extents from the disk with the 3370 * the ugly parts come from merging extents from the disk with the
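
The guard added to merge_extent_mapping encodes the central invariant for cached compressed mappings: the logical window may be trimmed, but the disk window must stay put, because the compressed bytes are only meaningful as a whole. A hedged restatement in isolation:

/* sketch: trim 'diff' bytes off the front of a cached mapping */
static void trim_map_front(struct extent_map *em, u64 diff)
{
	em->start += diff;
	em->len -= diff;
	/* only a plain extent can slide its disk window with the trim;
	 * a compressed blob keeps block_start/block_len untouched */
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += diff;
		em->block_len -= diff;
	}
}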
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2927 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3393 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2928 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3394 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2929 struct btrfs_trans_handle *trans = NULL; 3395 struct btrfs_trans_handle *trans = NULL;
3396 int compressed;
2930 3397
2931again: 3398again:
2932 spin_lock(&em_tree->lock); 3399 spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
2951 em->bdev = root->fs_info->fs_devices->latest_bdev; 3418 em->bdev = root->fs_info->fs_devices->latest_bdev;
2952 em->start = EXTENT_MAP_HOLE; 3419 em->start = EXTENT_MAP_HOLE;
2953 em->len = (u64)-1; 3420 em->len = (u64)-1;
3421 em->block_len = (u64)-1;
2954 3422
2955 if (!path) { 3423 if (!path) {
2956 path = btrfs_alloc_path(); 3424 path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
2983 3451
2984 found_type = btrfs_file_extent_type(leaf, item); 3452 found_type = btrfs_file_extent_type(leaf, item);
2985 extent_start = found_key.offset; 3453 extent_start = found_key.offset;
3454 compressed = btrfs_file_extent_compression(leaf, item);
2986 if (found_type == BTRFS_FILE_EXTENT_REG) { 3455 if (found_type == BTRFS_FILE_EXTENT_REG) {
2987 extent_end = extent_start + 3456 extent_end = extent_start +
2988 btrfs_file_extent_num_bytes(leaf, item); 3457 btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
3005 em->block_start = EXTENT_MAP_HOLE; 3474 em->block_start = EXTENT_MAP_HOLE;
3006 goto insert; 3475 goto insert;
3007 } 3476 }
3008 bytenr += btrfs_file_extent_offset(leaf, item);
3009 em->block_start = bytenr;
3010 em->start = extent_start; 3477 em->start = extent_start;
3011 em->len = extent_end - extent_start; 3478 em->len = extent_end - extent_start;
3479 if (compressed) {
3480 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3481 em->block_start = bytenr;
3482 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
3483 item);
3484 } else {
3485 bytenr += btrfs_file_extent_offset(leaf, item);
3486 em->block_start = bytenr;
3487 em->block_len = em->len;
3488 }
3012 goto insert; 3489 goto insert;
3013 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 3490 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3014 u64 page_start; 3491 u64 page_start;
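
For regular extents, btrfs_get_extent now fills block_start/block_len differently depending on compression, which changes what a file offset means on disk. A hedged sketch of the consequence:

/* sketch: turn a file offset inside 'em' into a disk byte */
static u64 map_offset_to_disk(struct extent_map *em, u64 file_offset)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		/* no per-offset disk byte exists: the whole blob at
		 * block_start .. block_start + block_len must be read
		 * and inflated first */
		return em->block_start;
	return em->block_start + (file_offset - em->start);
}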
@@ -3018,8 +3495,7 @@ again:
3018 size_t extent_offset; 3495 size_t extent_offset;
3019 size_t copy_size; 3496 size_t copy_size;
3020 3497
3021 size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, 3498 size = btrfs_file_extent_inline_len(leaf, item);
3022 path->slots[0]));
3023 extent_end = (extent_start + size + root->sectorsize - 1) & 3499 extent_end = (extent_start + size + root->sectorsize - 1) &
3024 ~((u64)root->sectorsize - 1); 3500 ~((u64)root->sectorsize - 1);
3025 if (start < extent_start || start >= extent_end) { 3501 if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
3035 } 3511 }
3036 em->block_start = EXTENT_MAP_INLINE; 3512 em->block_start = EXTENT_MAP_INLINE;
3037 3513
3038 if (!page) { 3514 if (!page || create) {
3039 em->start = extent_start; 3515 em->start = extent_start;
3040 em->len = size; 3516 em->len = (size + root->sectorsize - 1) &
3517 ~((u64)root->sectorsize - 1);
3041 goto out; 3518 goto out;
3042 } 3519 }
3043 3520
@@ -3048,11 +3525,22 @@ again:
3048 em->start = extent_start + extent_offset; 3525 em->start = extent_start + extent_offset;
3049 em->len = (copy_size + root->sectorsize - 1) & 3526 em->len = (copy_size + root->sectorsize - 1) &
3050 ~((u64)root->sectorsize - 1); 3527 ~((u64)root->sectorsize - 1);
3051 map = kmap(page); 3528 if (compressed)
3529 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3052 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 3530 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
3053 if (create == 0 && !PageUptodate(page)) { 3531 if (create == 0 && !PageUptodate(page)) {
3054 read_extent_buffer(leaf, map + pg_offset, ptr, 3532 if (btrfs_file_extent_compression(leaf, item) ==
3055 copy_size); 3533 BTRFS_COMPRESS_ZLIB) {
3534 ret = uncompress_inline(path, inode, page,
3535 pg_offset,
3536 extent_offset, item);
3537 BUG_ON(ret);
3538 } else {
3539 map = kmap(page);
3540 read_extent_buffer(leaf, map + pg_offset, ptr,
3541 copy_size);
3542 kunmap(page);
3543 }
3056 flush_dcache_page(page); 3544 flush_dcache_page(page);
3057 } else if (create && PageUptodate(page)) { 3545 } else if (create && PageUptodate(page)) {
3058 if (!trans) { 3546 if (!trans) {
@@ -3063,11 +3551,12 @@ again:
3063 trans = btrfs_join_transaction(root, 1); 3551 trans = btrfs_join_transaction(root, 1);
3064 goto again; 3552 goto again;
3065 } 3553 }
3554 map = kmap(page);
3066 write_extent_buffer(leaf, map + pg_offset, ptr, 3555 write_extent_buffer(leaf, map + pg_offset, ptr,
3067 copy_size); 3556 copy_size);
3557 kunmap(page);
3068 btrfs_mark_buffer_dirty(leaf); 3558 btrfs_mark_buffer_dirty(leaf);
3069 } 3559 }
3070 kunmap(page);
3071 set_extent_uptodate(io_tree, em->start, 3560 set_extent_uptodate(io_tree, em->start,
3072 extent_map_end(em) - 1, GFP_NOFS); 3561 extent_map_end(em) - 1, GFP_NOFS);
3073 goto insert; 3562 goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
3779 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 4268 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
3780 btrfs_set_file_extent_type(leaf, ei, 4269 btrfs_set_file_extent_type(leaf, ei,
3781 BTRFS_FILE_EXTENT_INLINE); 4270 BTRFS_FILE_EXTENT_INLINE);
4271 btrfs_set_file_extent_encryption(leaf, ei, 0);
4272 btrfs_set_file_extent_compression(leaf, ei, 0);
4273 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4274 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4275
3782 ptr = btrfs_file_extent_inline_start(ei); 4276 ptr = btrfs_file_extent_inline_start(ei);
3783 write_extent_buffer(leaf, symname, ptr, name_len); 4277 write_extent_buffer(leaf, symname, ptr, name_len);
3784 btrfs_mark_buffer_dirty(leaf); 4278 btrfs_mark_buffer_dirty(leaf);
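
The symlink path shows the boilerplate every inline extent now needs: the encoding fields are set explicitly and ram_bytes records the uncompressed length even when the payload is stored raw. A hedged sketch of reading those sizes back:

/* sketch: the two sizes of an inline extent after this commit */
static void inline_extent_sizes(struct extent_buffer *leaf,
				struct btrfs_path *path,
				struct btrfs_file_extent_item *fi,
				u32 *stored, u64 *uncompressed)
{
	/* bytes physically present in the leaf (compressed bytes when
	 * the compression field is set) */
	*stored = btrfs_file_extent_inline_item_len(leaf,
			btrfs_item_nr(leaf, path->slots[0]));
	/* logical length the extent represents */
	*uncompressed = btrfs_file_extent_ram_bytes(leaf, fi);
}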
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2eb6caba57c2..b5745bb96d40 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
165 * inserted. 165 * inserted.
166 */ 166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, int nocow) 168 u64 start, u64 len, u64 disk_len, int nocow,
169 int compressed)
169{ 170{
170 struct btrfs_ordered_inode_tree *tree; 171 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node; 172 struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
180 entry->file_offset = file_offset; 181 entry->file_offset = file_offset;
181 entry->start = start; 182 entry->start = start;
182 entry->len = len; 183 entry->len = len;
184 entry->disk_len = disk_len;
183 entry->inode = inode; 185 entry->inode = inode;
184 if (nocow) 186 if (nocow)
185 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags); 187 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
188 if (compressed)
189 set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
186 190
187 /* one ref for the tree */ 191 /* one ref for the tree */
188 atomic_set(&entry->refs, 1); 192 atomic_set(&entry->refs, 1);
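
A hedged caller sketch for the widened signature above (variable names are illustrative):

static int record_compressed_write(struct inode *inode, u64 file_offset,
				   u64 disk_start, u64 ram_len,
				   u64 compressed_len)
{
	return btrfs_add_ordered_extent(inode, file_offset, disk_start,
					ram_len,	/* uncompressed len */
					compressed_len,	/* disk_len */
					0,		/* nocow */
					1);		/* compressed */
}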
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
389 * for pdflush to find them 393 * for pdflush to find them
390 */ 394 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); 395 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
392 if (wait) 396 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 397 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags)); 398 &entry->flags));
399 }
395} 400}
396 401
397/* 402/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a144..1ef464145d22 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
66 66
67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
68 68
69#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
70
69struct btrfs_ordered_extent { 71struct btrfs_ordered_extent {
70 /* logical offset in the file */ 72 /* logical offset in the file */
71 u64 file_offset; 73 u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
73 /* disk byte number */ 75 /* disk byte number */
74 u64 start; 76 u64 start;
75 77
76 /* length of the extent in bytes */ 78 /* ram length of the extent in bytes */
77 u64 len; 79 u64 len;
78 80
81 /* extent length on disk */
82 u64 disk_len;
83
79 /* flags (described above) */ 84 /* flags (described above) */
80 unsigned long flags; 85 unsigned long flags;
81 86
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
127int btrfs_dec_test_ordered_pending(struct inode *inode, 132int btrfs_dec_test_ordered_pending(struct inode *inode,
128 u64 file_offset, u64 io_size); 133 u64 file_offset, u64 io_size);
129int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 134int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
130 u64 start, u64 len, int nocow); 135 u64 start, u64 len, u64 disk_len, int nocow,
136 int compressed);
131int btrfs_add_ordered_sum(struct inode *inode, 137int btrfs_add_ordered_sum(struct inode *inode,
132 struct btrfs_ordered_extent *entry, 138 struct btrfs_ordered_extent *entry,
133 struct btrfs_ordered_sum *sum); 139 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bd9ab3e9a7f2..64725c13aa11 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
115 if (btrfs_file_extent_type(l, fi) == 115 if (btrfs_file_extent_type(l, fi) ==
116 BTRFS_FILE_EXTENT_INLINE) { 116 BTRFS_FILE_EXTENT_INLINE) {
117 printk("\t\tinline extent data size %u\n", 117 printk("\t\tinline extent data size %u\n",
118 btrfs_file_extent_inline_len(l, item)); 118 btrfs_file_extent_inline_len(l, fi));
119 break; 119 break;
120 } 120 }
121 printk("\t\textent data disk bytenr %llu nr %llu\n", 121 printk("\t\textent data disk bytenr %llu nr %llu\n",
122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi), 122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi)); 123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
124 printk("\t\textent data offset %llu nr %llu\n", 124 printk("\t\textent data offset %llu nr %llu ram %llu\n",
125 (unsigned long long)btrfs_file_extent_offset(l, fi), 125 (unsigned long long)btrfs_file_extent_offset(l, fi),
126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi)); 126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
127 (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
127 break; 128 break;
128 case BTRFS_BLOCK_GROUP_ITEM_KEY: 129 case BTRFS_BLOCK_GROUP_ITEM_KEY:
129 bi = btrfs_item_ptr(l, i, 130 bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e6039825b7b..431fdf144b58 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
47#include "volumes.h" 47#include "volumes.h"
48#include "version.h" 48#include "version.h"
49#include "export.h" 49#include "export.h"
50#include "compression.h"
50 51
51#define BTRFS_SUPER_MAGIC 0x9123683E 52#define BTRFS_SUPER_MAGIC 0x9123683E
52 53
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
69enum { 70enum {
70 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 71 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
71 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 72 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
72 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err, 73 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
73}; 74};
74 75
75static match_table_t tokens = { 76static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
83 {Opt_max_inline, "max_inline=%s"}, 84 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"}, 85 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"}, 86 {Opt_thread_pool, "thread_pool=%d"},
87 {Opt_compress, "compress"},
86 {Opt_ssd, "ssd"}, 88 {Opt_ssd, "ssd"},
87 {Opt_noacl, "noacl"}, 89 {Opt_noacl, "noacl"},
88 {Opt_err, NULL}, 90 {Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
163 btrfs_set_opt(info->mount_opt, NODATACOW); 165 btrfs_set_opt(info->mount_opt, NODATACOW);
164 btrfs_set_opt(info->mount_opt, NODATASUM); 166 btrfs_set_opt(info->mount_opt, NODATASUM);
165 break; 167 break;
168 case Opt_compress:
169 printk(KERN_INFO "btrfs: use compression\n");
170 btrfs_set_opt(info->mount_opt, COMPRESS);
171 break;
166 case Opt_ssd: 172 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 173 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
168 btrfs_set_opt(info->mount_opt, SSD); 174 btrfs_set_opt(info->mount_opt, SSD);
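
The flag is only set here; the decision to actually compress is made in the delalloc path changed earlier in this commit. A hedged sketch of the kind of test involved, using the existing option/flag macros (the exact per-inode conditions in cow_file_range are assumptions here):

/* sketch: gate compression on the new mount option */
static int should_try_compress(struct btrfs_root *root, struct inode *inode)
{
	if (!btrfs_test_opt(root, COMPRESS))
		return 0;
	/* in-place (nocow) writes are never compressed */
	if (btrfs_test_flag(inode, NODATACOW))
		return 0;
	return 1;
}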
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
622 err = btrfs_interface_init(); 628 err = btrfs_interface_init();
623 if (err) 629 if (err)
624 goto free_extent_map; 630 goto free_extent_map;
631
625 err = register_filesystem(&btrfs_fs_type); 632 err = register_filesystem(&btrfs_fs_type);
626 if (err) 633 if (err)
627 goto unregister_ioctl; 634 goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
651 unregister_filesystem(&btrfs_fs_type); 658 unregister_filesystem(&btrfs_fs_type);
652 btrfs_exit_sysfs(); 659 btrfs_exit_sysfs();
653 btrfs_cleanup_fs_uuids(); 660 btrfs_cleanup_fs_uuids();
661 btrfs_zlib_exit();
654} 662}
655 663
656module_init(init_btrfs_fs) 664module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf618cc8b34a..e6d579053a47 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
540 if (found_type == BTRFS_FILE_EXTENT_REG) 540 if (found_type == BTRFS_FILE_EXTENT_REG)
541 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 541 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
542 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 542 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
543 size = btrfs_file_extent_inline_len(eb, 543 size = btrfs_file_extent_inline_len(eb, item);
544 btrfs_item_nr(eb, slot));
545 extent_end = (start + size + mask) & ~mask; 544 extent_end = (start + size + mask) & ~mask;
546 } else { 545 } else {
547 ret = 0; 546 ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51a..7db4cfd03a98 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
1816 em->start = key.offset; 1816 em->start = key.offset;
1817 em->len = *num_bytes; 1817 em->len = *num_bytes;
1818 em->block_start = 0; 1818 em->block_start = 0;
1819 em->block_len = em->len;
1819 1820
1820 if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 1821 if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1821 ret = btrfs_add_system_chunk(trans, chunk_root, &key, 1822 ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2323 em->start = logical; 2324 em->start = logical;
2324 em->len = length; 2325 em->len = length;
2325 em->block_start = 0; 2326 em->block_start = 0;
2327 em->block_len = em->len;
2326 2328
2327 map->num_stripes = num_stripes; 2329 map->num_stripes = num_stripes;
2328 map->io_width = btrfs_chunk_io_width(leaf, chunk); 2330 map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..e99309180a11
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33
34/* Plan: call deflate() with avail_in == *sourcelen,
35 avail_out = *dstlen - 12 and flush == Z_FINISH.
36 If it doesn't manage to finish, call it again with
37 avail_in == 0 and avail_out set to the remaining 12
38 bytes for it to clean up.
39 Q: Is 12 bytes sufficient?
40*/
41#define STREAM_END_SPACE 12
42
43struct workspace {
44 z_stream inf_strm;
45 z_stream def_strm;
46 char *buf;
47 struct list_head list;
48};
49
50static LIST_HEAD(idle_workspace);
51static DEFINE_SPINLOCK(workspace_lock);
52static unsigned long num_workspace;
53static atomic_t alloc_workspace = ATOMIC_INIT(0);
54static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
55
56/*
57 * this finds an available zlib workspace or allocates a new one
58 * an ERR_PTR is returned if things go bad.
59 */
60static struct workspace *find_zlib_workspace(void)
61{
62 struct workspace *workspace;
63 int ret;
64 int cpus = num_online_cpus();
65
66again:
67 spin_lock(&workspace_lock);
68 if (!list_empty(&idle_workspace)) {
69 workspace = list_entry(idle_workspace.next, struct workspace,
70 list);
71 list_del(&workspace->list);
72 num_workspace--;
73 spin_unlock(&workspace_lock);
74 return workspace;
75
76 }
77 spin_unlock(&workspace_lock);
78 if (atomic_read(&alloc_workspace) > cpus) {
79 DEFINE_WAIT(wait);
80 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
81 if (atomic_read(&alloc_workspace) > cpus)
82 schedule();
83 finish_wait(&workspace_wait, &wait);
84 goto again;
85 }
86 atomic_inc(&alloc_workspace);
87 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
88 if (!workspace) {
89 ret = -ENOMEM;
90 goto fail;
91 }
92
93 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
94 if (!workspace->def_strm.workspace) {
95 ret = -ENOMEM;
96 goto fail;
97 }
98 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
99 if (!workspace->inf_strm.workspace) {
100 ret = -ENOMEM;
101 goto fail_inflate;
102 }
103 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
104 if (!workspace->buf) {
105 ret = -ENOMEM;
106 goto fail_kmalloc;
107 }
108 return workspace;
109
110fail_kmalloc:
111 vfree(workspace->inf_strm.workspace);
112fail_inflate:
113 vfree(workspace->def_strm.workspace);
114fail:
115 kfree(workspace);
116 atomic_dec(&alloc_workspace);
117 wake_up(&workspace_wait);
118 return ERR_PTR(ret);
119}
120
121/*
122 * put a workspace struct back on the list or free it if we have enough
123 * idle ones sitting around
124 */
125static int free_workspace(struct workspace *workspace)
126{
127 spin_lock(&workspace_lock);
128 if (num_workspace < num_online_cpus()) {
129 list_add_tail(&workspace->list, &idle_workspace);
130 num_workspace++;
131 spin_unlock(&workspace_lock);
132 if (waitqueue_active(&workspace_wait))
133 wake_up(&workspace_wait);
134 return 0;
135 }
136 spin_unlock(&workspace_lock);
137 vfree(workspace->def_strm.workspace);
138 vfree(workspace->inf_strm.workspace);
139 kfree(workspace->buf);
140 kfree(workspace);
141
142 atomic_dec(&alloc_workspace);
143 if (waitqueue_active(&workspace_wait))
144 wake_up(&workspace_wait);
145 return 0;
146}
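
Every compress or decompress call brackets its work with this pair; a minimal usage sketch:

/* sketch: the expected workspace lifecycle inside zlib.c */
static int with_workspace_sketch(void)
{
	struct workspace *workspace;

	workspace = find_zlib_workspace();	/* may sleep for a slot */
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);

	/* ... run def_strm/inf_strm against workspace->buf ... */

	free_workspace(workspace);	/* recycled, up to one per CPU */
	return 0;
}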
147
148/*
149 * cleanup function for module exit
150 */
151static void free_workspaces(void)
152{
153 struct workspace *workspace;
154	while (!list_empty(&idle_workspace)) {
155 workspace = list_entry(idle_workspace.next, struct workspace,
156 list);
157 list_del(&workspace->list);
158 vfree(workspace->def_strm.workspace);
159 vfree(workspace->inf_strm.workspace);
160 kfree(workspace->buf);
161 kfree(workspace);
162 atomic_dec(&alloc_workspace);
163 }
164}
165
166/*
167 * given an address space and start/len, compress the bytes.
168 *
169 * pages are allocated to hold the compressed result and stored
170 * in 'pages'
171 *
172 * out_pages is used to return the number of pages allocated. There
173 * may be pages allocated even if we return an error
174 *
175 * total_in is used to return the number of bytes actually read. It
176 * may be smaller than len if we had to exit early because we
177 * ran out of room in the pages array or because we crossed the
178 * max_out threshold.
179 *
180 * total_out is used to return the total number of compressed bytes
181 *
182 * max_out tells us the max number of bytes that we're allowed to
183 * stuff into pages
184 */
185int btrfs_zlib_compress_pages(struct address_space *mapping,
186 u64 start, unsigned long len,
187 struct page **pages,
188 unsigned long nr_dest_pages,
189 unsigned long *out_pages,
190 unsigned long *total_in,
191 unsigned long *total_out,
192 unsigned long max_out)
193{
194 int ret;
195 struct workspace *workspace;
196 char *data_in;
197 char *cpage_out;
198 int nr_pages = 0;
199 struct page *in_page = NULL;
200 struct page *out_page = NULL;
201 int out_written = 0;
202 int in_read = 0;
203 unsigned long bytes_left;
204
205 *out_pages = 0;
206 *total_out = 0;
207 *total_in = 0;
208
209 workspace = find_zlib_workspace();
210	if (IS_ERR(workspace))
211 return -1;
212
213 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
214 printk(KERN_WARNING "deflateInit failed\n");
215 ret = -1;
216 goto out;
217 }
218
219 workspace->def_strm.total_in = 0;
220 workspace->def_strm.total_out = 0;
221
222 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
223 data_in = kmap(in_page);
224
225 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
226 cpage_out = kmap(out_page);
227 pages[0] = out_page;
228 nr_pages = 1;
229
230 workspace->def_strm.next_in = data_in;
231 workspace->def_strm.next_out = cpage_out;
232 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
233 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
234
235 out_written = 0;
236 in_read = 0;
237
238 while (workspace->def_strm.total_in < len) {
239 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
240 if (ret != Z_OK) {
241 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
242 ret);
243 zlib_deflateEnd(&workspace->def_strm);
244 ret = -1;
245 goto out;
246 }
247
248 /* we're making it bigger, give up */
249 if (workspace->def_strm.total_in > 8192 &&
250 workspace->def_strm.total_in <
251 workspace->def_strm.total_out) {
252 ret = -1;
253 goto out;
254 }
255 /* we need another page for writing out. Test this
256	 * before the total_in check so we will pull in a new page for
257 * the stream end if required
258 */
259 if (workspace->def_strm.avail_out == 0) {
260 kunmap(out_page);
261 if (nr_pages == nr_dest_pages) {
262 out_page = NULL;
263 ret = -1;
264 goto out;
265 }
266 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
267 cpage_out = kmap(out_page);
268 pages[nr_pages] = out_page;
269 nr_pages++;
270 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
271 workspace->def_strm.next_out = cpage_out;
272 }
273 /* we're all done */
274 if (workspace->def_strm.total_in >= len)
275 break;
276
277 /* we've read in a full page, get a new one */
278 if (workspace->def_strm.avail_in == 0) {
279 if (workspace->def_strm.total_out > max_out)
280 break;
281
282 bytes_left = len - workspace->def_strm.total_in;
283 kunmap(in_page);
284 page_cache_release(in_page);
285
286 start += PAGE_CACHE_SIZE;
287 in_page = find_get_page(mapping,
288 start >> PAGE_CACHE_SHIFT);
289 data_in = kmap(in_page);
290 workspace->def_strm.avail_in = min(bytes_left,
291 PAGE_CACHE_SIZE);
292 workspace->def_strm.next_in = data_in;
293 }
294 }
295 workspace->def_strm.avail_in = 0;
296 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
297 zlib_deflateEnd(&workspace->def_strm);
298
299 if (ret != Z_STREAM_END) {
300 ret = -1;
301 goto out;
302 }
303
304 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
305 ret = -1;
306 goto out;
307 }
308
309 ret = 0;
310 *total_out = workspace->def_strm.total_out;
311 *total_in = workspace->def_strm.total_in;
312out:
313 *out_pages = nr_pages;
314 if (out_page)
315 kunmap(out_page);
316
317 if (in_page) {
318 kunmap(in_page);
319 page_cache_release(in_page);
320 }
321 free_workspace(workspace);
322 return ret;
323}
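
A hedged caller sketch: any nonzero return (including the "output grew" bail-outs above) simply means the range should be written uncompressed, and the caller must still release the pages reported in *out_pages:

/* sketch: drive the compressor and fall back on failure */
static int compress_range_sketch(struct address_space *mapping,
				 u64 start, unsigned long len,
				 struct page **pages,
				 unsigned long nr_dest_pages)
{
	unsigned long nr_pages_out = 0;
	unsigned long total_in = 0;
	unsigned long total_out = 0;
	int ret;

	ret = btrfs_zlib_compress_pages(mapping, start, len, pages,
					nr_dest_pages, &nr_pages_out,
					&total_in, &total_out,
					len);	/* max_out: cap the output */
	if (ret)
		return ret;	/* write uncompressed; free the pages */
	/* pages[0 .. nr_pages_out) now hold total_out compressed bytes */
	return 0;
}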
324
325/*
326 * pages_in is an array of pages with compressed data.
327 *
328 * disk_start is the starting logical offset of this array in the file
329 *
330 * bvec is a bio_vec of pages from the file that we want to decompress into
331 *
332 * vcnt is the count of pages in the biovec
333 *
334 * srclen is the number of bytes in pages_in
335 *
336 * The basic idea is that we have a bio that was created by readpages.
337 * The pages in the bio are for the uncompressed data, and they may not
338 * be contiguous. They all correspond to the range of bytes covered by
339 * the compressed extent.
340 */
341int btrfs_zlib_decompress_biovec(struct page **pages_in,
342 u64 disk_start,
343 struct bio_vec *bvec,
344 int vcnt,
345 size_t srclen)
346{
347 int ret = 0;
348 int wbits = MAX_WBITS;
349 struct workspace *workspace;
350 char *data_in;
351 size_t total_out = 0;
352 unsigned long page_bytes_left;
353 unsigned long page_in_index = 0;
354 unsigned long page_out_index = 0;
355 struct page *page_out;
356 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
357 PAGE_CACHE_SIZE;
358 unsigned long buf_start;
359 unsigned long buf_offset;
360 unsigned long bytes;
361 unsigned long working_bytes;
362 unsigned long pg_offset;
363 unsigned long start_byte;
364 unsigned long current_buf_start;
365 char *kaddr;
366
367 workspace = find_zlib_workspace();
368	if (IS_ERR(workspace))
369 return -ENOMEM;
370
371 data_in = kmap(pages_in[page_in_index]);
372 workspace->inf_strm.next_in = data_in;
373 workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
374 workspace->inf_strm.total_in = 0;
375
376 workspace->inf_strm.total_out = 0;
377 workspace->inf_strm.next_out = workspace->buf;
378 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
379 page_out = bvec[page_out_index].bv_page;
380 page_bytes_left = PAGE_CACHE_SIZE;
381 pg_offset = 0;
382
383 /* If it's deflate, and it's got no preset dictionary, then
384 we can tell zlib to skip the adler32 check. */
385 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
386 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
387 !(((data_in[0]<<8) + data_in[1]) % 31)) {
388
389 wbits = -((data_in[0] >> 4) + 8);
390 workspace->inf_strm.next_in += 2;
391 workspace->inf_strm.avail_in -= 2;
392 }
393
394 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
395 printk(KERN_WARNING "inflateInit failed\n");
396 ret = -1;
397 goto out;
398 }
399	while (workspace->inf_strm.total_in < srclen) {
400 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
401 if (ret != Z_OK && ret != Z_STREAM_END) {
402 break;
403 }
404
405 /*
406	 * buf_start is the offset of the start of our workspace
407	 * buffer within the uncompressed data
408 */
409 buf_start = total_out;
410
411	/* total_out points one past the last byte in the workspace buffer */
412 total_out = workspace->inf_strm.total_out;
413
414 working_bytes = total_out - buf_start;
415
416 /*
417	 * start_byte is the offset of the page we're copying into,
418	 * relative to the start of the extent's uncompressed data.
419 */
420 start_byte = page_offset(page_out) - disk_start;
421
422 if (working_bytes == 0) {
423 /* we didn't make progress in this inflate
424 * call, we're done
425 */
426 if (ret != Z_STREAM_END)
427 ret = -1;
428 break;
429 }
430
431 /* we haven't yet hit data corresponding to this page */
432 if (total_out <= start_byte) {
433 goto next;
434 }
435
436 /*
437 * the start of the data we care about is offset into
438 * the middle of our working buffer
439 */
440 if (total_out > start_byte && buf_start < start_byte) {
441 buf_offset = start_byte - buf_start;
442 working_bytes -= buf_offset;
443 } else {
444 buf_offset = 0;
445 }
446 current_buf_start = buf_start;
447
448 /* copy bytes from the working buffer into the pages */
449		while (working_bytes > 0) {
450 bytes = min(PAGE_CACHE_SIZE - pg_offset,
451 PAGE_CACHE_SIZE - buf_offset);
452 bytes = min(bytes, working_bytes);
453 kaddr = kmap_atomic(page_out, KM_USER0);
454 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
455 bytes);
456 kunmap_atomic(kaddr, KM_USER0);
457 flush_dcache_page(page_out);
458
459 pg_offset += bytes;
460 page_bytes_left -= bytes;
461 buf_offset += bytes;
462 working_bytes -= bytes;
463 current_buf_start += bytes;
464
465 /* check if we need to pick another page */
466 if (page_bytes_left == 0) {
467 page_out_index++;
468 if (page_out_index >= vcnt) {
469 ret = 0;
470 goto done;
471 }
472 page_out = bvec[page_out_index].bv_page;
473 pg_offset = 0;
474 page_bytes_left = PAGE_CACHE_SIZE;
475 start_byte = page_offset(page_out) - disk_start;
476
477 /*
478 * make sure our new page is covered by this
479 * working buffer
480 */
481 if (total_out <= start_byte) {
482 goto next;
483 }
484
485 /* the next page in the biovec might not
486 * be adjacent to the last page, but it
487 * might still be found inside this working
488 * buffer. bump our offset pointer
489 */
490 if (total_out > start_byte &&
491 current_buf_start < start_byte) {
492 buf_offset = start_byte - buf_start;
493 working_bytes = total_out - start_byte;
494 current_buf_start = buf_start +
495 buf_offset;
496 }
497 }
498 }
499next:
500 workspace->inf_strm.next_out = workspace->buf;
501 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
502
503 if (workspace->inf_strm.avail_in == 0) {
504 unsigned long tmp;
505 kunmap(pages_in[page_in_index]);
506 page_in_index++;
507 if (page_in_index >= total_pages_in) {
508 data_in = NULL;
509 break;
510 }
511 data_in = kmap(pages_in[page_in_index]);
512 workspace->inf_strm.next_in = data_in;
513 tmp = srclen - workspace->inf_strm.total_in;
514 workspace->inf_strm.avail_in = min(tmp,
515 PAGE_CACHE_SIZE);
516 }
517 }
518 if (ret != Z_STREAM_END) {
519 ret = -1;
520 } else {
521 ret = 0;
522 }
523done:
524 zlib_inflateEnd(&workspace->inf_strm);
525 if (data_in)
526 kunmap(pages_in[page_in_index]);
527out:
528 free_workspace(workspace);
529 return ret;
530}
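
The offset bookkeeping in the loop above boils down to a window computation; a hedged restatement with a worked example: if an inflate call grew total_out from buf_start = 8192 to 12288 and the destination page sits at start_byte = 8192, then buf_offset is 0 and 4096 bytes can be copied.

/* sketch: how many workspace-buffer bytes belong to this page */
static unsigned long copy_window(unsigned long buf_start,
				 unsigned long total_out,
				 unsigned long start_byte,
				 unsigned long *buf_offset)
{
	/* inflate hasn't reached this page's data yet */
	if (total_out <= start_byte)
		return 0;
	/* the page may begin partway into the fresh bytes */
	*buf_offset = buf_start < start_byte ? start_byte - buf_start : 0;
	return (total_out - buf_start) - *buf_offset;
}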
531
532/*
533 * a less complex decompression routine. Our compressed data fits in a
534 * single page, and we want to read a single page out of it.
535 * start_byte tells us the offset into the compressed data we're interested in
536 */
537int btrfs_zlib_decompress(unsigned char *data_in,
538 struct page *dest_page,
539 unsigned long start_byte,
540 size_t srclen, size_t destlen)
541{
542 int ret = 0;
543 int wbits = MAX_WBITS;
544 struct workspace *workspace;
545 unsigned long bytes_left = destlen;
546 unsigned long total_out = 0;
547 char *kaddr;
548
549 if (destlen > PAGE_CACHE_SIZE)
550 return -ENOMEM;
551
552 workspace = find_zlib_workspace();
553	if (IS_ERR(workspace))
554 return -ENOMEM;
555
556 workspace->inf_strm.next_in = data_in;
557 workspace->inf_strm.avail_in = srclen;
558 workspace->inf_strm.total_in = 0;
559
560 workspace->inf_strm.next_out = workspace->buf;
561 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
562 workspace->inf_strm.total_out = 0;
563 /* If it's deflate, and it's got no preset dictionary, then
564 we can tell zlib to skip the adler32 check. */
565 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
566 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
567 !(((data_in[0]<<8) + data_in[1]) % 31)) {
568
569 wbits = -((data_in[0] >> 4) + 8);
570 workspace->inf_strm.next_in += 2;
571 workspace->inf_strm.avail_in -= 2;
572 }
573
574 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
575 printk(KERN_WARNING "inflateInit failed\n");
576 ret = -1;
577 goto out;
578 }
579
580	while (bytes_left > 0) {
581 unsigned long buf_start;
582 unsigned long buf_offset;
583 unsigned long bytes;
584 unsigned long pg_offset = 0;
585
586 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
587 if (ret != Z_OK && ret != Z_STREAM_END) {
588 break;
589 }
590
591 buf_start = total_out;
592 total_out = workspace->inf_strm.total_out;
593
594 if (total_out == buf_start) {
595 ret = -1;
596 break;
597 }
598
599 if (total_out <= start_byte) {
600 goto next;
601 }
602
603 if (total_out > start_byte && buf_start < start_byte) {
604 buf_offset = start_byte - buf_start;
605 } else {
606 buf_offset = 0;
607 }
608
609 bytes = min(PAGE_CACHE_SIZE - pg_offset,
610 PAGE_CACHE_SIZE - buf_offset);
611 bytes = min(bytes, bytes_left);
612
613 kaddr = kmap_atomic(dest_page, KM_USER0);
614 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
615 kunmap_atomic(kaddr, KM_USER0);
616
617 pg_offset += bytes;
618 bytes_left -= bytes;
619next:
620 workspace->inf_strm.next_out = workspace->buf;
621 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
622 }
623 if (ret != Z_STREAM_END && bytes_left != 0) {
624 ret = -1;
625 } else {
626 ret = 0;
627 }
628 zlib_inflateEnd(&workspace->inf_strm);
629out:
630 free_workspace(workspace);
631 return ret;
632}
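
Both inflate paths use the same header trick: a zlib stream opens with a CMF/FLG byte pair whose big-endian value is a multiple of 31, so when no preset dictionary is flagged the two bytes can be skipped and zlib run in raw-deflate mode via negative window bits, avoiding the adler32 check. A self-contained restatement (0x20 is PRESET_DICT, 8 is Z_DEFLATED):

/* sketch: window bits for zlib_inflateInit2 given a stream header */
static int raw_deflate_wbits(const unsigned char *d, size_t srclen)
{
	if (srclen > 2 &&
	    !(d[1] & 0x20) &&			/* no preset dictionary */
	    (d[0] & 0x0f) == 8 &&		/* method is deflate */
	    ((d[0] << 8) + d[1]) % 31 == 0)	/* valid zlib header */
		return -((d[0] >> 4) + 8);	/* e.g. 0x78 0x9c -> -15 */
	return 15;				/* MAX_WBITS: full zlib */
}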
633
634void btrfs_zlib_exit(void)
635{
636 free_workspaces();
637}