aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
commitc8b978188c9a0fd3d535c13debd19d522b726f1f (patch)
tree873628723fb82fe2a7c77adc65fa93eca1d61c0c /fs/btrfs/inode.c
parent26ce34a9c47334ff7984769e4661b2f1883594ff (diff)
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption nor the 'other' field is currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software only limit; the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. 
This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically single threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c584
1 files changed, 539 insertions, 45 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d6..9797592dc86b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
49#include "compat.h" 49#include "compat.h"
50#include "tree-log.h" 50#include "tree-log.h"
51#include "ref-cache.h" 51#include "ref-cache.h"
52#include "compression.h"
52 53
53struct btrfs_iget_args { 54struct btrfs_iget_args {
54 u64 ino; 55 u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
83}; 84};
84 85
85static void btrfs_truncate(struct inode *inode); 86static void btrfs_truncate(struct inode *inode);
87static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
86 88
87/* 89/*
88 * a very lame attempt at stopping writes when the FS is 85% full. There 90 * a very lame attempt at stopping writes when the FS is 85% full. There
@@ -114,57 +116,374 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
114} 116}
115 117
116/* 118/*
119 * this does all the hard work for inserting an inline extent into
120 * the btree. The caller should have done a btrfs_drop_extents so that
121 * no overlapping inline items exist in the btree
122 */
123static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
124 struct btrfs_root *root, struct inode *inode,
125 u64 start, size_t size, size_t compressed_size,
126 struct page **compressed_pages)
127{
128 struct btrfs_key key;
129 struct btrfs_path *path;
130 struct extent_buffer *leaf;
131 struct page *page = NULL;
132 char *kaddr;
133 unsigned long ptr;
134 struct btrfs_file_extent_item *ei;
135 int err = 0;
136 int ret;
137 size_t cur_size = size;
138 size_t datasize;
139 unsigned long offset;
140 int use_compress = 0;
141
142 if (compressed_size && compressed_pages) {
143 use_compress = 1;
144 cur_size = compressed_size;
145 }
146
147 path = btrfs_alloc_path(); if (!path)
148 return -ENOMEM;
149
150 btrfs_set_trans_block_group(trans, inode);
151
152 key.objectid = inode->i_ino;
153 key.offset = start;
154 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
155 inode_add_bytes(inode, size);
156 datasize = btrfs_file_extent_calc_inline_size(cur_size);
157
158 inode_add_bytes(inode, size);
159 ret = btrfs_insert_empty_item(trans, root, path, &key,
160 datasize);
161 BUG_ON(ret);
162 if (ret) {
163 err = ret;
164 printk("got bad ret %d\n", ret);
165 goto fail;
166 }
167 leaf = path->nodes[0];
168 ei = btrfs_item_ptr(leaf, path->slots[0],
169 struct btrfs_file_extent_item);
170 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
171 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
172 btrfs_set_file_extent_encryption(leaf, ei, 0);
173 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
174 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
175 ptr = btrfs_file_extent_inline_start(ei);
176
177 if (use_compress) {
178 struct page *cpage;
179 int i = 0;
180 while(compressed_size > 0) {
181 cpage = compressed_pages[i];
182 cur_size = min(compressed_size,
183 PAGE_CACHE_SIZE);
184
185 kaddr = kmap(cpage);
186 write_extent_buffer(leaf, kaddr, ptr, cur_size);
187 kunmap(cpage);
188
189 i++;
190 ptr += cur_size;
191 compressed_size -= cur_size;
192 }
193 btrfs_set_file_extent_compression(leaf, ei,
194 BTRFS_COMPRESS_ZLIB);
195 } else {
196 page = find_get_page(inode->i_mapping,
197 start >> PAGE_CACHE_SHIFT);
198 btrfs_set_file_extent_compression(leaf, ei, 0);
199 kaddr = kmap_atomic(page, KM_USER0);
200 offset = start & (PAGE_CACHE_SIZE - 1);
201 write_extent_buffer(leaf, kaddr + offset, ptr, size);
202 kunmap_atomic(kaddr, KM_USER0);
203 page_cache_release(page);
204 }
205 btrfs_mark_buffer_dirty(leaf);
206 btrfs_free_path(path);
207
208 BTRFS_I(inode)->disk_i_size = inode->i_size;
209 btrfs_update_inode(trans, root, inode);
210 return 0;
211fail:
212 btrfs_free_path(path);
213 return err;
214}
215
216
217/*
218 * conditionally insert an inline extent into the file. This
219 * does the checks required to make sure the data is small enough
220 * to fit as an inline extent.
221 */
222static int cow_file_range_inline(struct btrfs_trans_handle *trans,
223 struct btrfs_root *root,
224 struct inode *inode, u64 start, u64 end,
225 size_t compressed_size,
226 struct page **compressed_pages)
227{
228 u64 isize = i_size_read(inode);
229 u64 actual_end = min(end + 1, isize);
230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len;
235 int ret;
236
237 if (compressed_size)
238 data_len = compressed_size;
239
240 if (start > 0 ||
241 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
242 (!compressed_size &&
243 (actual_end & (root->sectorsize - 1)) == 0) ||
244 end + 1 < isize ||
245 data_len > root->fs_info->max_inline) {
246 return 1;
247 }
248
249 mutex_lock(&BTRFS_I(inode)->extent_mutex);
250 ret = btrfs_drop_extents(trans, root, inode, start,
251 aligned_end, aligned_end, &hint_byte);
252 BUG_ON(ret);
253
254 if (isize > actual_end)
255 inline_len = min_t(u64, isize, actual_end);
256 ret = insert_inline_extent(trans, root, inode, start,
257 inline_len, compressed_size,
258 compressed_pages);
259 BUG_ON(ret);
260 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
261 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
262 return 0;
263}
264
265/*
117 * when extent_io.c finds a delayed allocation range in the file, 266 * when extent_io.c finds a delayed allocation range in the file,
118 * the call backs end up in this code. The basic idea is to 267 * the call backs end up in this code. The basic idea is to
119 * allocate extents on disk for the range, and create ordered data structs 268 * allocate extents on disk for the range, and create ordered data structs
120 * in ram to track those extents. 269 * in ram to track those extents.
270 *
271 * locked_page is the page that writepage had locked already. We use
272 * it to make sure we don't do extra locks or unlocks.
273 *
274 * *page_started is set to one if we unlock locked_page and do everything
275 * required to start IO on it. It may be clean and already done with
276 * IO when we return.
121 */ 277 */
122static int cow_file_range(struct inode *inode, u64 start, u64 end) 278static int cow_file_range(struct inode *inode, struct page *locked_page,
279 u64 start, u64 end, int *page_started)
123{ 280{
124 struct btrfs_root *root = BTRFS_I(inode)->root; 281 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct btrfs_trans_handle *trans; 282 struct btrfs_trans_handle *trans;
126 u64 alloc_hint = 0; 283 u64 alloc_hint = 0;
127 u64 num_bytes; 284 u64 num_bytes;
285 unsigned long ram_size;
286 u64 orig_start;
287 u64 disk_num_bytes;
128 u64 cur_alloc_size; 288 u64 cur_alloc_size;
129 u64 blocksize = root->sectorsize; 289 u64 blocksize = root->sectorsize;
130 u64 orig_num_bytes; 290 u64 actual_end;
131 struct btrfs_key ins; 291 struct btrfs_key ins;
132 struct extent_map *em; 292 struct extent_map *em;
133 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 293 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
134 int ret = 0; 294 int ret = 0;
295 struct page **pages = NULL;
296 unsigned long nr_pages;
297 unsigned long nr_pages_ret = 0;
298 unsigned long total_compressed = 0;
299 unsigned long total_in = 0;
300 unsigned long max_compressed = 128 * 1024;
301 unsigned long max_uncompressed = 256 * 1024;
302 int i;
303 int will_compress;
135 304
136 trans = btrfs_join_transaction(root, 1); 305 trans = btrfs_join_transaction(root, 1);
137 BUG_ON(!trans); 306 BUG_ON(!trans);
138 btrfs_set_trans_block_group(trans, inode); 307 btrfs_set_trans_block_group(trans, inode);
308 orig_start = start;
309
310 /*
311 * compression made this loop a bit ugly, but the basic idea is to
312 * compress some pages but keep the total size of the compressed
313 * extent relatively small. If compression is off, this goto target
314 * is never used.
315 */
316again:
317 will_compress = 0;
318 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
319 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
139 320
321 actual_end = min_t(u64, i_size_read(inode), end + 1);
322 total_compressed = actual_end - start;
323
324 /* we want to make sure that amount of ram required to uncompress
325 * an extent is reasonable, so we limit the total size in ram
326 * of a compressed extent to 256k
327 */
328 total_compressed = min(total_compressed, max_uncompressed);
140 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 329 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
141 num_bytes = max(blocksize, num_bytes); 330 num_bytes = max(blocksize, num_bytes);
142 orig_num_bytes = num_bytes; 331 disk_num_bytes = num_bytes;
332 total_in = 0;
333 ret = 0;
143 334
144 if (alloc_hint == EXTENT_MAP_INLINE) 335 /* we do compression for mount -o compress and when the
145 goto out; 336 * inode has not been flagged as nocompress
337 */
338 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
339 btrfs_test_opt(root, COMPRESS)) {
340 WARN_ON(pages);
341 pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
342
343 /* we want to make sure the amount of IO required to satisfy
344 * a random read is reasonably small, so we limit the size
345 * of a compressed extent to 128k
346 */
347 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
348 total_compressed, pages,
349 nr_pages, &nr_pages_ret,
350 &total_in,
351 &total_compressed,
352 max_compressed);
353
354 if (!ret) {
355 unsigned long offset = total_compressed &
356 (PAGE_CACHE_SIZE - 1);
357 struct page *page = pages[nr_pages_ret - 1];
358 char *kaddr;
359
360 /* zero the tail end of the last page, we might be
361 * sending it down to disk
362 */
363 if (offset) {
364 kaddr = kmap_atomic(page, KM_USER0);
365 memset(kaddr + offset, 0,
366 PAGE_CACHE_SIZE - offset);
367 kunmap_atomic(kaddr, KM_USER0);
368 }
369 will_compress = 1;
370 }
371 }
372 if (start == 0) {
373 /* lets try to make an inline extent */
374 if (ret || total_in < (end - start + 1)) {
375 /* we didn't compress the entire range, try
376 * to make an uncompressed inline extent. This
377 * is almost sure to fail, but maybe inline sizes
378 * will get bigger later
379 */
380 ret = cow_file_range_inline(trans, root, inode,
381 start, end, 0, NULL);
382 } else {
383 ret = cow_file_range_inline(trans, root, inode,
384 start, end,
385 total_compressed, pages);
386 }
387 if (ret == 0) {
388 extent_clear_unlock_delalloc(inode,
389 &BTRFS_I(inode)->io_tree,
390 start, end, NULL,
391 1, 1, 1);
392 *page_started = 1;
393 ret = 0;
394 goto free_pages_out;
395 }
396 }
397
398 if (will_compress) {
399 /*
400 * we aren't doing an inline extent, so round the compressed size
401 * up to a block size boundary so the allocator does sane
402 * things
403 */
404 total_compressed = (total_compressed + blocksize - 1) &
405 ~(blocksize - 1);
406
407 /*
408 * one last check to make sure the compression is really a
409 * win, compare the page count read with the blocks on disk
410 */
411 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
412 ~(PAGE_CACHE_SIZE - 1);
413 if (total_compressed >= total_in) {
414 will_compress = 0;
415 } else {
416 disk_num_bytes = total_compressed;
417 num_bytes = total_in;
418 }
419 }
420 if (!will_compress && pages) {
421 /*
422 * the compression code ran but failed to make things smaller,
423 * free any pages it allocated and our page pointer array
424 */
425 for (i = 0; i < nr_pages_ret; i++) {
426 page_cache_release(pages[i]);
427 }
428 kfree(pages);
429 pages = NULL;
430 total_compressed = 0;
431 nr_pages_ret = 0;
432
433 /* flag the file so we don't compress in the future */
434 btrfs_set_flag(inode, NOCOMPRESS);
435 }
436
437 BUG_ON(disk_num_bytes >
438 btrfs_super_total_bytes(&root->fs_info->super_copy));
146 439
147 BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
148 mutex_lock(&BTRFS_I(inode)->extent_mutex); 440 mutex_lock(&BTRFS_I(inode)->extent_mutex);
149 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 441 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
150 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 442 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
151 443
152 while(num_bytes > 0) { 444 while(disk_num_bytes > 0) {
153 cur_alloc_size = min(num_bytes, root->fs_info->max_extent); 445 unsigned long min_bytes;
446
447 /*
448 * the max size of a compressed extent is pretty small,
449 * make the code a little less complex by forcing
450 * the allocator to find a whole compressed extent at once
451 */
452 if (will_compress)
453 min_bytes = disk_num_bytes;
454 else
455 min_bytes = root->sectorsize;
456
457 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
154 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 458 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
155 root->sectorsize, 0, alloc_hint, 459 min_bytes, 0, alloc_hint,
156 (u64)-1, &ins, 1); 460 (u64)-1, &ins, 1);
157 if (ret) { 461 if (ret) {
158 WARN_ON(1); 462 WARN_ON(1);
159 goto out; 463 goto free_pages_out_fail;
160 } 464 }
161 em = alloc_extent_map(GFP_NOFS); 465 em = alloc_extent_map(GFP_NOFS);
162 em->start = start; 466 em->start = start;
163 em->len = ins.offset; 467
468 if (will_compress) {
469 ram_size = num_bytes;
470 em->len = num_bytes;
471 } else {
472 /* ramsize == disk size */
473 ram_size = ins.offset;
474 em->len = ins.offset;
475 }
476
164 em->block_start = ins.objectid; 477 em->block_start = ins.objectid;
478 em->block_len = ins.offset;
165 em->bdev = root->fs_info->fs_devices->latest_bdev; 479 em->bdev = root->fs_info->fs_devices->latest_bdev;
480
166 mutex_lock(&BTRFS_I(inode)->extent_mutex); 481 mutex_lock(&BTRFS_I(inode)->extent_mutex);
167 set_bit(EXTENT_FLAG_PINNED, &em->flags); 482 set_bit(EXTENT_FLAG_PINNED, &em->flags);
483
484 if (will_compress)
485 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
486
168 while(1) { 487 while(1) {
169 spin_lock(&em_tree->lock); 488 spin_lock(&em_tree->lock);
170 ret = add_extent_mapping(em_tree, em); 489 ret = add_extent_mapping(em_tree, em);
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
174 break; 493 break;
175 } 494 }
176 btrfs_drop_extent_cache(inode, start, 495 btrfs_drop_extent_cache(inode, start,
177 start + ins.offset - 1, 0); 496 start + ram_size - 1, 0);
178 } 497 }
179 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 498 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
180 499
181 cur_alloc_size = ins.offset; 500 cur_alloc_size = ins.offset;
182 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 501 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
183 ins.offset, 0); 502 ram_size, cur_alloc_size, 0,
503 will_compress);
184 BUG_ON(ret); 504 BUG_ON(ret);
185 if (num_bytes < cur_alloc_size) { 505
186 printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes, 506 if (disk_num_bytes < cur_alloc_size) {
507 printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
187 cur_alloc_size); 508 cur_alloc_size);
188 break; 509 break;
189 } 510 }
511
512 if (will_compress) {
513 /*
514 * we're doing compression, and we need to
515 * submit the compressed extents down to the device.
516 *
517 * We lock down all the file pages, clearing their
518 * dirty bits and setting them writeback. Everyone
519 * that wants to modify the page will wait on the
520 * ordered extent above.
521 *
522 * The writeback bits on the file pages are
523 * cleared when the compressed pages are on disk
524 */
525 btrfs_end_transaction(trans, root);
526
527 if (start <= page_offset(locked_page) &&
528 page_offset(locked_page) < start + ram_size) {
529 *page_started = 1;
530 }
531
532 extent_clear_unlock_delalloc(inode,
533 &BTRFS_I(inode)->io_tree,
534 start,
535 start + ram_size - 1,
536 NULL, 1, 1, 0);
537
538 ret = btrfs_submit_compressed_write(inode, start,
539 ram_size, ins.objectid,
540 cur_alloc_size, pages,
541 nr_pages_ret);
542
543 BUG_ON(ret);
544 trans = btrfs_join_transaction(root, 1);
545 if (start + ram_size < end) {
546 start += ram_size;
547 alloc_hint = ins.objectid + ins.offset;
548 /* pages will be freed at end_bio time */
549 pages = NULL;
550 goto again;
551 } else {
552 /* we've written everything, time to go */
553 break;
554 }
555 }
556 /* we're not doing compressed IO, don't unlock the first
557 * page (which the caller expects to stay locked), don't
558 * clear any dirty bits and don't set any writeback bits
559 */
560 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
561 start, start + ram_size - 1,
562 locked_page, 0, 0, 0);
563 disk_num_bytes -= cur_alloc_size;
190 num_bytes -= cur_alloc_size; 564 num_bytes -= cur_alloc_size;
191 alloc_hint = ins.objectid + ins.offset; 565 alloc_hint = ins.objectid + ins.offset;
192 start += cur_alloc_size; 566 start += cur_alloc_size;
193 } 567 }
568
569 ret = 0;
194out: 570out:
195 btrfs_end_transaction(trans, root); 571 btrfs_end_transaction(trans, root);
572
196 return ret; 573 return ret;
574
575free_pages_out_fail:
576 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
577 start, end, locked_page, 0, 0, 0);
578free_pages_out:
579 for (i = 0; i < nr_pages_ret; i++)
580 page_cache_release(pages[i]);
581 if (pages)
582 kfree(pages);
583
584 goto out;
197} 585}
198 586
199/* 587/*
@@ -203,7 +591,8 @@ out:
203 * If no cow copies or snapshots exist, we write directly to the existing 591 * If no cow copies or snapshots exist, we write directly to the existing
204 * blocks on disk 592 * blocks on disk
205 */ 593 */
206static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) 594static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
595 u64 start, u64 end, int *page_started)
207{ 596{
208 u64 extent_start; 597 u64 extent_start;
209 u64 extent_end; 598 u64 extent_end;
@@ -260,6 +649,11 @@ again:
260 extent_end = extent_start + extent_num_bytes; 649 extent_end = extent_start + extent_num_bytes;
261 err = 0; 650 err = 0;
262 651
652 if (btrfs_file_extent_compression(leaf, item) ||
653 btrfs_file_extent_encryption(leaf,item) ||
654 btrfs_file_extent_other_encoding(leaf, item))
655 goto not_found;
656
263 if (loops && start != extent_start) 657 if (loops && start != extent_start)
264 goto not_found; 658 goto not_found;
265 659
@@ -284,7 +678,8 @@ again:
284 bytenr += btrfs_file_extent_offset(leaf, item); 678 bytenr += btrfs_file_extent_offset(leaf, item);
285 extent_num_bytes = min(end + 1, extent_end) - start; 679 extent_num_bytes = min(end + 1, extent_end) - start;
286 ret = btrfs_add_ordered_extent(inode, start, bytenr, 680 ret = btrfs_add_ordered_extent(inode, start, bytenr,
287 extent_num_bytes, 1); 681 extent_num_bytes,
682 extent_num_bytes, 1, 0);
288 if (ret) { 683 if (ret) {
289 err = ret; 684 err = ret;
290 goto out; 685 goto out;
@@ -300,7 +695,8 @@ again:
300not_found: 695not_found:
301 btrfs_end_transaction(trans, root); 696 btrfs_end_transaction(trans, root);
302 btrfs_free_path(path); 697 btrfs_free_path(path);
303 return cow_file_range(inode, start, end); 698 return cow_file_range(inode, locked_page, start, end,
699 page_started);
304 } 700 }
305out: 701out:
306 WARN_ON(err); 702 WARN_ON(err);
@@ -312,16 +708,19 @@ out:
312/* 708/*
313 * extent_io.c call back to do delayed allocation processing 709 * extent_io.c call back to do delayed allocation processing
314 */ 710 */
315static int run_delalloc_range(struct inode *inode, u64 start, u64 end) 711static int run_delalloc_range(struct inode *inode, struct page *locked_page,
712 u64 start, u64 end, int *page_started)
316{ 713{
317 struct btrfs_root *root = BTRFS_I(inode)->root; 714 struct btrfs_root *root = BTRFS_I(inode)->root;
318 int ret; 715 int ret;
319 716
320 if (btrfs_test_opt(root, NODATACOW) || 717 if (btrfs_test_opt(root, NODATACOW) ||
321 btrfs_test_flag(inode, NODATACOW)) 718 btrfs_test_flag(inode, NODATACOW))
322 ret = run_delalloc_nocow(inode, start, end); 719 ret = run_delalloc_nocow(inode, locked_page, start, end,
720 page_started);
323 else 721 else
324 ret = cow_file_range(inode, start, end); 722 ret = cow_file_range(inode, locked_page, start, end,
723 page_started);
325 724
326 return ret; 725 return ret;
327} 726}
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
383 * we don't create bios that span stripes or chunks 782 * we don't create bios that span stripes or chunks
384 */ 783 */
385int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 784int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
386 size_t size, struct bio *bio) 785 size_t size, struct bio *bio,
786 unsigned long bio_flags)
387{ 787{
388 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 788 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
389 struct btrfs_mapping_tree *map_tree; 789 struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
413 * are inserted into the btree 813 * are inserted into the btree
414 */ 814 */
415int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 815int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
416 int mirror_num) 816 int mirror_num, unsigned long bio_flags)
417{ 817{
418 struct btrfs_root *root = BTRFS_I(inode)->root; 818 struct btrfs_root *root = BTRFS_I(inode)->root;
419 int ret = 0; 819 int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
429 * or reading the csums from the tree before a read 829 * or reading the csums from the tree before a read
430 */ 830 */
431int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 831int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
432 int mirror_num) 832 int mirror_num, unsigned long bio_flags)
433{ 833{
434 struct btrfs_root *root = BTRFS_I(inode)->root; 834 struct btrfs_root *root = BTRFS_I(inode)->root;
435 int ret = 0; 835 int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
444 844
445 if (!(rw & (1 << BIO_RW))) { 845 if (!(rw & (1 << BIO_RW))) {
446 btrfs_lookup_bio_sums(root, inode, bio); 846 btrfs_lookup_bio_sums(root, inode, bio);
847
848 if (bio_flags & EXTENT_BIO_COMPRESSED) {
849 return btrfs_submit_compressed_read(inode, bio,
850 mirror_num, bio_flags);
851 }
852
447 goto mapit; 853 goto mapit;
448 } 854 }
449 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 855 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
450 inode, rw, bio, mirror_num, 856 inode, rw, bio, mirror_num,
451 __btrfs_submit_bio_hook); 857 bio_flags, __btrfs_submit_bio_hook);
452mapit: 858mapit:
453 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 859 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
454} 860}
@@ -539,7 +945,7 @@ out_page:
539 * good idea. This causes problems because we want to make sure COW 945 * good idea. This causes problems because we want to make sure COW
540 * properly happens and the data=ordered rules are followed. 946 * properly happens and the data=ordered rules are followed.
541 * 947 *
542 * In our case any range that doesn't have the EXTENT_ORDERED bit set 948 * In our case any range that doesn't have the ORDERED bit set
543 * hasn't been properly setup for IO. We kick off an async process 949 * hasn't been properly setup for IO. We kick off an async process
544 * to fix it up. The async helper will wait for ordered extents, set 950 * to fix it up. The async helper will wait for ordered extents, set
545 * the delalloc bit and make it safe to write the page. 951 * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
632 btrfs_set_file_extent_disk_bytenr(leaf, extent_item, 1038 btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
633 ordered_extent->start); 1039 ordered_extent->start);
634 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item, 1040 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
635 ordered_extent->len); 1041 ordered_extent->disk_len);
636 btrfs_set_file_extent_offset(leaf, extent_item, 0); 1042 btrfs_set_file_extent_offset(leaf, extent_item, 0);
1043
1044 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1045 btrfs_set_file_extent_compression(leaf, extent_item, 1);
1046 else
1047 btrfs_set_file_extent_compression(leaf, extent_item, 0);
1048 btrfs_set_file_extent_encryption(leaf, extent_item, 0);
1049 btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
1050
1051 /* ram bytes = extent_num_bytes for now */
637 btrfs_set_file_extent_num_bytes(leaf, extent_item, 1052 btrfs_set_file_extent_num_bytes(leaf, extent_item,
638 ordered_extent->len); 1053 ordered_extent->len);
1054 btrfs_set_file_extent_ram_bytes(leaf, extent_item,
1055 ordered_extent->len);
639 btrfs_mark_buffer_dirty(leaf); 1056 btrfs_mark_buffer_dirty(leaf);
640 1057
641 btrfs_drop_extent_cache(inode, ordered_extent->file_offset, 1058 btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
644 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 1061 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
645 1062
646 ins.objectid = ordered_extent->start; 1063 ins.objectid = ordered_extent->start;
647 ins.offset = ordered_extent->len; 1064 ins.offset = ordered_extent->disk_len;
648 ins.type = BTRFS_EXTENT_ITEM_KEY; 1065 ins.type = BTRFS_EXTENT_ITEM_KEY;
649 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1066 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
650 root->root_key.objectid, 1067 root->root_key.objectid,
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
714 int ret; 1131 int ret;
715 int rw; 1132 int rw;
716 u64 logical; 1133 u64 logical;
1134 unsigned long bio_flags = 0;
717 1135
718 ret = get_state_private(failure_tree, start, &private); 1136 ret = get_state_private(failure_tree, start, &private);
719 if (ret) { 1137 if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
738 } 1156 }
739 logical = start - em->start; 1157 logical = start - em->start;
740 logical = em->block_start + logical; 1158 logical = em->block_start + logical;
1159 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1160 bio_flags = EXTENT_BIO_COMPRESSED;
741 failrec->logical = logical; 1161 failrec->logical = logical;
742 free_extent_map(em); 1162 free_extent_map(em);
743 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | 1163 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
781 rw = READ; 1201 rw = READ;
782 1202
783 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1203 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
784 failrec->last_mirror); 1204 failrec->last_mirror,
1205 bio_flags);
785 return 0; 1206 return 0;
786} 1207}
787 1208
@@ -1644,10 +2065,8 @@ search_again:
1644 item_end += 2065 item_end +=
1645 btrfs_file_extent_num_bytes(leaf, fi); 2066 btrfs_file_extent_num_bytes(leaf, fi);
1646 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2067 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1647 struct btrfs_item *item = btrfs_item_nr(leaf,
1648 path->slots[0]);
1649 item_end += btrfs_file_extent_inline_len(leaf, 2068 item_end += btrfs_file_extent_inline_len(leaf,
1650 item); 2069 fi);
1651 } 2070 }
1652 item_end--; 2071 item_end--;
1653 } 2072 }
@@ -1715,7 +2134,14 @@ search_again:
1715 root_owner = btrfs_header_owner(leaf); 2134 root_owner = btrfs_header_owner(leaf);
1716 } 2135 }
1717 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2136 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1718 if (!del_item) { 2137 /*
2138 * we can't truncate inline items that have had
2139 * special encodings
2140 */
2141 if (!del_item &&
2142 btrfs_file_extent_compression(leaf, fi) == 0 &&
2143 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2144 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
1719 u32 size = new_size - found_key.offset; 2145 u32 size = new_size - found_key.offset;
1720 2146
1721 if (root->ref_cows) { 2147 if (root->ref_cows) {
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
1926 err = btrfs_insert_file_extent(trans, root, 2352 err = btrfs_insert_file_extent(trans, root,
1927 inode->i_ino, 2353 inode->i_ino,
1928 hole_start, 0, 0, 2354 hole_start, 0, 0,
1929 hole_size, 0); 2355 hole_size, 0, hole_size,
2356 0, 0, 0);
1930 btrfs_drop_extent_cache(inode, hole_start, 2357 btrfs_drop_extent_cache(inode, hole_start,
1931 (u64)-1, 0); 2358 (u64)-1, 0);
1932 btrfs_check_file(root, inode); 2359 btrfs_check_file(root, inode);
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
2894 start_diff = map_start - em->start; 3321 start_diff = map_start - em->start;
2895 em->start = map_start; 3322 em->start = map_start;
2896 em->len = map_len; 3323 em->len = map_len;
2897 if (em->block_start < EXTENT_MAP_LAST_BYTE) 3324 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3325 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2898 em->block_start += start_diff; 3326 em->block_start += start_diff;
3327 em->block_len -= start_diff;
3328 }
2899 return add_extent_mapping(em_tree, em); 3329 return add_extent_mapping(em_tree, em);
2900} 3330}
2901 3331
3332static noinline int uncompress_inline(struct btrfs_path *path,
3333 struct inode *inode, struct page *page,
3334 size_t pg_offset, u64 extent_offset,
3335 struct btrfs_file_extent_item *item)
3336{
3337 int ret;
3338 struct extent_buffer *leaf = path->nodes[0];
3339 char *tmp;
3340 size_t max_size;
3341 unsigned long inline_size;
3342 unsigned long ptr;
3343
3344 WARN_ON(pg_offset != 0);
3345 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3346 inline_size = btrfs_file_extent_inline_item_len(leaf,
3347 btrfs_item_nr(leaf, path->slots[0]));
3348 tmp = kmalloc(inline_size, GFP_NOFS);
3349 ptr = btrfs_file_extent_inline_start(item);
3350
3351 read_extent_buffer(leaf, tmp, ptr, inline_size);
3352
3353 max_size = min(PAGE_CACHE_SIZE, max_size);
3354 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3355 inline_size, max_size);
3356 if (ret) {
3357 char *kaddr = kmap_atomic(page, KM_USER0);
3358 unsigned long copy_size = min_t(u64,
3359 PAGE_CACHE_SIZE - pg_offset,
3360 max_size - extent_offset);
3361 memset(kaddr + pg_offset, 0, copy_size);
3362 kunmap_atomic(kaddr, KM_USER0);
3363 }
3364 kfree(tmp);
3365 return 0;
3366}
3367
2902/* 3368/*
2903 * a bit scary, this does extent mapping from logical file offset to the disk. 3369 * a bit scary, this does extent mapping from logical file offset to the disk.
2904 * the ugly parts come from merging extents from the disk with the 3370 * the ugly parts come from merging extents from the disk with the
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2927 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3393 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2928 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3394 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2929 struct btrfs_trans_handle *trans = NULL; 3395 struct btrfs_trans_handle *trans = NULL;
3396 int compressed;
2930 3397
2931again: 3398again:
2932 spin_lock(&em_tree->lock); 3399 spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
2951 em->bdev = root->fs_info->fs_devices->latest_bdev; 3418 em->bdev = root->fs_info->fs_devices->latest_bdev;
2952 em->start = EXTENT_MAP_HOLE; 3419 em->start = EXTENT_MAP_HOLE;
2953 em->len = (u64)-1; 3420 em->len = (u64)-1;
3421 em->block_len = (u64)-1;
2954 3422
2955 if (!path) { 3423 if (!path) {
2956 path = btrfs_alloc_path(); 3424 path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
2983 3451
2984 found_type = btrfs_file_extent_type(leaf, item); 3452 found_type = btrfs_file_extent_type(leaf, item);
2985 extent_start = found_key.offset; 3453 extent_start = found_key.offset;
3454 compressed = btrfs_file_extent_compression(leaf, item);
2986 if (found_type == BTRFS_FILE_EXTENT_REG) { 3455 if (found_type == BTRFS_FILE_EXTENT_REG) {
2987 extent_end = extent_start + 3456 extent_end = extent_start +
2988 btrfs_file_extent_num_bytes(leaf, item); 3457 btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
3005 em->block_start = EXTENT_MAP_HOLE; 3474 em->block_start = EXTENT_MAP_HOLE;
3006 goto insert; 3475 goto insert;
3007 } 3476 }
3008 bytenr += btrfs_file_extent_offset(leaf, item);
3009 em->block_start = bytenr;
3010 em->start = extent_start; 3477 em->start = extent_start;
3011 em->len = extent_end - extent_start; 3478 em->len = extent_end - extent_start;
3479 if (compressed) {
3480 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3481 em->block_start = bytenr;
3482 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
3483 item);
3484 } else {
3485 bytenr += btrfs_file_extent_offset(leaf, item);
3486 em->block_start = bytenr;
3487 em->block_len = em->len;
3488 }
3012 goto insert; 3489 goto insert;
3013 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 3490 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3014 u64 page_start; 3491 u64 page_start;
@@ -3018,8 +3495,7 @@ again:
3018 size_t extent_offset; 3495 size_t extent_offset;
3019 size_t copy_size; 3496 size_t copy_size;
3020 3497
3021 size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, 3498 size = btrfs_file_extent_inline_len(leaf, item);
3022 path->slots[0]));
3023 extent_end = (extent_start + size + root->sectorsize - 1) & 3499 extent_end = (extent_start + size + root->sectorsize - 1) &
3024 ~((u64)root->sectorsize - 1); 3500 ~((u64)root->sectorsize - 1);
3025 if (start < extent_start || start >= extent_end) { 3501 if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
3035 } 3511 }
3036 em->block_start = EXTENT_MAP_INLINE; 3512 em->block_start = EXTENT_MAP_INLINE;
3037 3513
3038 if (!page) { 3514 if (!page || create) {
3039 em->start = extent_start; 3515 em->start = extent_start;
3040 em->len = size; 3516 em->len = (size + root->sectorsize - 1) &
3517 ~((u64)root->sectorsize - 1);
3041 goto out; 3518 goto out;
3042 } 3519 }
3043 3520
@@ -3048,11 +3525,22 @@ again:
3048 em->start = extent_start + extent_offset; 3525 em->start = extent_start + extent_offset;
3049 em->len = (copy_size + root->sectorsize - 1) & 3526 em->len = (copy_size + root->sectorsize - 1) &
3050 ~((u64)root->sectorsize - 1); 3527 ~((u64)root->sectorsize - 1);
3051 map = kmap(page); 3528 if (compressed)
3529 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3052 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 3530 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
3053 if (create == 0 && !PageUptodate(page)) { 3531 if (create == 0 && !PageUptodate(page)) {
3054 read_extent_buffer(leaf, map + pg_offset, ptr, 3532 if (btrfs_file_extent_compression(leaf, item) ==
3055 copy_size); 3533 BTRFS_COMPRESS_ZLIB) {
3534 ret = uncompress_inline(path, inode, page,
3535 pg_offset,
3536 extent_offset, item);
3537 BUG_ON(ret);
3538 } else {
3539 map = kmap(page);
3540 read_extent_buffer(leaf, map + pg_offset, ptr,
3541 copy_size);
3542 kunmap(page);
3543 }
3056 flush_dcache_page(page); 3544 flush_dcache_page(page);
3057 } else if (create && PageUptodate(page)) { 3545 } else if (create && PageUptodate(page)) {
3058 if (!trans) { 3546 if (!trans) {
@@ -3063,11 +3551,12 @@ again:
3063 trans = btrfs_join_transaction(root, 1); 3551 trans = btrfs_join_transaction(root, 1);
3064 goto again; 3552 goto again;
3065 } 3553 }
3554 map = kmap(page);
3066 write_extent_buffer(leaf, map + pg_offset, ptr, 3555 write_extent_buffer(leaf, map + pg_offset, ptr,
3067 copy_size); 3556 copy_size);
3557 kunmap(page);
3068 btrfs_mark_buffer_dirty(leaf); 3558 btrfs_mark_buffer_dirty(leaf);
3069 } 3559 }
3070 kunmap(page);
3071 set_extent_uptodate(io_tree, em->start, 3560 set_extent_uptodate(io_tree, em->start,
3072 extent_map_end(em) - 1, GFP_NOFS); 3561 extent_map_end(em) - 1, GFP_NOFS);
3073 goto insert; 3562 goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
3779 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 4268 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
3780 btrfs_set_file_extent_type(leaf, ei, 4269 btrfs_set_file_extent_type(leaf, ei,
3781 BTRFS_FILE_EXTENT_INLINE); 4270 BTRFS_FILE_EXTENT_INLINE);
4271 btrfs_set_file_extent_encryption(leaf, ei, 0);
4272 btrfs_set_file_extent_compression(leaf, ei, 0);
4273 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4274 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4275
3782 ptr = btrfs_file_extent_inline_start(ei); 4276 ptr = btrfs_file_extent_inline_start(ei);
3783 write_extent_buffer(leaf, symname, ptr, name_len); 4277 write_extent_buffer(leaf, symname, ptr, name_len);
3784 btrfs_mark_buffer_dirty(leaf); 4278 btrfs_mark_buffer_dirty(leaf);