aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
commitc8b978188c9a0fd3d535c13debd19d522b726f1f (patch)
tree873628723fb82fe2a7c77adc65fa93eca1d61c0c /fs/btrfs/file.c
parent26ce34a9c47334ff7984769e4661b2f1883594ff (diff)
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption or the 'other' field are currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software only limit, the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. 
This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically single threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c263
1 files changed, 58 insertions, 205 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add2..0aa15436590e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
95 } 95 }
96} 96}
97 97
98/* this does all the hard work for inserting an inline extent into
99 * the btree. Any existing inline extent is extended as required to make room,
100 * otherwise things are inserted as required into the btree
101 */
102static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root, struct inode *inode,
104 u64 offset, size_t size,
105 struct page **pages, size_t page_offset,
106 int num_pages)
107{
108 struct btrfs_key key;
109 struct btrfs_path *path;
110 struct extent_buffer *leaf;
111 char *kaddr;
112 unsigned long ptr;
113 struct btrfs_file_extent_item *ei;
114 struct page *page;
115 u32 datasize;
116 int err = 0;
117 int ret;
118 int i;
119 ssize_t cur_size;
120
121 path = btrfs_alloc_path();
122 if (!path)
123 return -ENOMEM;
124
125 btrfs_set_trans_block_group(trans, inode);
126
127 key.objectid = inode->i_ino;
128 key.offset = offset;
129 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
130
131 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
132 if (ret < 0) {
133 err = ret;
134 goto fail;
135 }
136 if (ret == 1) {
137 struct btrfs_key found_key;
138
139 if (path->slots[0] == 0)
140 goto insert;
141
142 path->slots[0]--;
143 leaf = path->nodes[0];
144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
145
146 if (found_key.objectid != inode->i_ino)
147 goto insert;
148
149 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
150 goto insert;
151 ei = btrfs_item_ptr(leaf, path->slots[0],
152 struct btrfs_file_extent_item);
153
154 if (btrfs_file_extent_type(leaf, ei) !=
155 BTRFS_FILE_EXTENT_INLINE) {
156 goto insert;
157 }
158 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
159 ret = 0;
160 }
161 if (ret == 0) {
162 u32 found_size;
163 u64 found_end;
164
165 leaf = path->nodes[0];
166 ei = btrfs_item_ptr(leaf, path->slots[0],
167 struct btrfs_file_extent_item);
168
169 if (btrfs_file_extent_type(leaf, ei) !=
170 BTRFS_FILE_EXTENT_INLINE) {
171 err = ret;
172 btrfs_print_leaf(root, leaf);
173 printk("found wasn't inline offset %Lu inode %lu\n",
174 offset, inode->i_ino);
175 goto fail;
176 }
177 found_size = btrfs_file_extent_inline_len(leaf,
178 btrfs_item_nr(leaf, path->slots[0]));
179 found_end = key.offset + found_size;
180
181 if (found_end < offset + size) {
182 btrfs_release_path(root, path);
183 ret = btrfs_search_slot(trans, root, &key, path,
184 offset + size - found_end, 1);
185 BUG_ON(ret != 0);
186
187 ret = btrfs_extend_item(trans, root, path,
188 offset + size - found_end);
189 if (ret) {
190 err = ret;
191 goto fail;
192 }
193 leaf = path->nodes[0];
194 ei = btrfs_item_ptr(leaf, path->slots[0],
195 struct btrfs_file_extent_item);
196 inode_add_bytes(inode, offset + size - found_end);
197 }
198 if (found_end < offset) {
199 ptr = btrfs_file_extent_inline_start(ei) + found_size;
200 memset_extent_buffer(leaf, 0, ptr, offset - found_end);
201 }
202 } else {
203insert:
204 btrfs_release_path(root, path);
205 datasize = offset + size - key.offset;
206 inode_add_bytes(inode, datasize);
207 datasize = btrfs_file_extent_calc_inline_size(datasize);
208 ret = btrfs_insert_empty_item(trans, root, path, &key,
209 datasize);
210 if (ret) {
211 err = ret;
212 printk("got bad ret %d\n", ret);
213 goto fail;
214 }
215 leaf = path->nodes[0];
216 ei = btrfs_item_ptr(leaf, path->slots[0],
217 struct btrfs_file_extent_item);
218 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
219 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
220 }
221 ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
222
223 cur_size = size;
224 i = 0;
225 while (size > 0) {
226 page = pages[i];
227 kaddr = kmap_atomic(page, KM_USER0);
228 cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
229 write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
230 kunmap_atomic(kaddr, KM_USER0);
231 page_offset = 0;
232 ptr += cur_size;
233 size -= cur_size;
234 if (i >= num_pages) {
235 printk("i %d num_pages %d\n", i, num_pages);
236 }
237 i++;
238 }
239 btrfs_mark_buffer_dirty(leaf);
240fail:
241 btrfs_free_path(path);
242 return err;
243}
244
245/* 98/*
246 * after copy_from_user, pages need to be dirtied and we need to make 99 * after copy_from_user, pages need to be dirtied and we need to make
247 * sure holes are created between the current EOF and the start of 100 * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
267 u64 start_pos; 120 u64 start_pos;
268 u64 end_of_last_block; 121 u64 end_of_last_block;
269 u64 end_pos = pos + write_bytes; 122 u64 end_pos = pos + write_bytes;
270 u64 inline_size;
271 int did_inline = 0;
272 loff_t isize = i_size_read(inode); 123 loff_t isize = i_size_read(inode);
273 124
274 start_pos = pos & ~((u64)root->sectorsize - 1); 125 start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
314 err = btrfs_insert_file_extent(trans, root, 165 err = btrfs_insert_file_extent(trans, root,
315 inode->i_ino, 166 inode->i_ino,
316 last_pos_in_file, 167 last_pos_in_file,
317 0, 0, hole_size, 0); 168 0, 0, hole_size, 0,
169 hole_size, 0, 0, 0);
318 btrfs_drop_extent_cache(inode, last_pos_in_file, 170 btrfs_drop_extent_cache(inode, last_pos_in_file,
319 last_pos_in_file + hole_size - 1, 0); 171 last_pos_in_file + hole_size - 1, 0);
320 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 172 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
324 goto failed; 176 goto failed;
325 } 177 }
326 178
327 /* 179 /* check for reserved extents on each page, we don't want
328 * either allocate an extent for the new bytes or setup the key 180 * to reset the delalloc bit on things that already have
329 * to show we are doing inline data in the extent 181 * extents reserved.
330 */ 182 */
331 inline_size = end_pos; 183 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
332 if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 184 for (i = 0; i < num_pages; i++) {
333 inline_size > root->fs_info->max_inline || 185 struct page *p = pages[i];
334 (inline_size & (root->sectorsize -1)) == 0 || 186 SetPageUptodate(p);
335 inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { 187 ClearPageChecked(p);
336 /* check for reserved extents on each page, we don't want 188 set_page_dirty(p);
337 * to reset the delalloc bit on things that already have
338 * extents reserved.
339 */
340 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
341 for (i = 0; i < num_pages; i++) {
342 struct page *p = pages[i];
343 SetPageUptodate(p);
344 ClearPageChecked(p);
345 set_page_dirty(p);
346 }
347 } else {
348 u64 aligned_end;
349 /* step one, delete the existing extents in this range */
350 aligned_end = (pos + write_bytes + root->sectorsize - 1) &
351 ~((u64)root->sectorsize - 1);
352 mutex_lock(&BTRFS_I(inode)->extent_mutex);
353 err = btrfs_drop_extents(trans, root, inode, start_pos,
354 aligned_end, aligned_end, &hint_byte);
355 if (err)
356 goto failed;
357 if (isize > inline_size)
358 inline_size = min_t(u64, isize, aligned_end);
359 inline_size -= start_pos;
360 err = insert_inline_extent(trans, root, inode, start_pos,
361 inline_size, pages, 0, num_pages);
362 btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
363 BUG_ON(err);
364 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
365
366 /*
367 * an ugly way to do all the prop accounting around
368 * the page bits and mapping tags
369 */
370 set_page_writeback(pages[0]);
371 end_page_writeback(pages[0]);
372 did_inline = 1;
373 } 189 }
374 if (end_pos > isize) { 190 if (end_pos > isize) {
375 i_size_write(inode, end_pos); 191 i_size_write(inode, end_pos);
376 if (did_inline)
377 BTRFS_I(inode)->disk_i_size = end_pos;
378 btrfs_update_inode(trans, root, inode); 192 btrfs_update_inode(trans, root, inode);
379 } 193 }
380failed: 194failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
399 int ret; 213 int ret;
400 int testend = 1; 214 int testend = 1;
401 unsigned long flags; 215 unsigned long flags;
216 int compressed = 0;
402 217
403 WARN_ON(end < start); 218 WARN_ON(end < start);
404 if (end == (u64)-1) { 219 if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
434 free_extent_map(em); 249 free_extent_map(em);
435 continue; 250 continue;
436 } 251 }
252 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
437 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 253 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
438 remove_extent_mapping(em_tree, em); 254 remove_extent_mapping(em_tree, em);
439 255
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
442 split->start = em->start; 258 split->start = em->start;
443 split->len = start - em->start; 259 split->len = start - em->start;
444 split->block_start = em->block_start; 260 split->block_start = em->block_start;
261
262 if (compressed)
263 split->block_len = em->block_len;
264 else
265 split->block_len = split->len;
266
445 split->bdev = em->bdev; 267 split->bdev = em->bdev;
446 split->flags = flags; 268 split->flags = flags;
447 ret = add_extent_mapping(em_tree, split); 269 ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
459 split->bdev = em->bdev; 281 split->bdev = em->bdev;
460 split->flags = flags; 282 split->flags = flags;
461 283
462 split->block_start = em->block_start + diff; 284 if (compressed) {
285 split->block_len = em->block_len;
286 split->block_start = em->block_start;
287 } else {
288 split->block_len = split->len;
289 split->block_start = em->block_start + diff;
290 }
463 291
464 ret = add_extent_mapping(em_tree, split); 292 ret = add_extent_mapping(em_tree, split);
465 BUG_ON(ret); 293 BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
533 struct btrfs_item *item; 361 struct btrfs_item *item;
534 item = btrfs_item_nr(leaf, slot); 362 item = btrfs_item_nr(leaf, slot);
535 extent_end = found_key.offset + 363 extent_end = found_key.offset +
536 btrfs_file_extent_inline_len(leaf, item); 364 btrfs_file_extent_inline_len(leaf, extent);
537 extent_end = (extent_end + root->sectorsize - 1) & 365 extent_end = (extent_end + root->sectorsize - 1) &
538 ~((u64)root->sectorsize -1 ); 366 ~((u64)root->sectorsize -1 );
539 } 367 }
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
573 u64 extent_end = 0; 401 u64 extent_end = 0;
574 u64 search_start = start; 402 u64 search_start = start;
575 u64 leaf_start; 403 u64 leaf_start;
404 u64 ram_bytes = 0;
405 u8 compression = 0;
406 u8 encryption = 0;
407 u16 other_encoding = 0;
576 u64 root_gen; 408 u64 root_gen;
577 u64 root_owner; 409 u64 root_owner;
578 struct extent_buffer *leaf; 410 struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
589 int recow; 421 int recow;
590 int ret; 422 int ret;
591 423
424 inline_limit = 0;
592 btrfs_drop_extent_cache(inode, start, end - 1, 0); 425 btrfs_drop_extent_cache(inode, start, end - 1, 0);
593 426
594 path = btrfs_alloc_path(); 427 path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
637 extent = btrfs_item_ptr(leaf, slot, 470 extent = btrfs_item_ptr(leaf, slot,
638 struct btrfs_file_extent_item); 471 struct btrfs_file_extent_item);
639 found_type = btrfs_file_extent_type(leaf, extent); 472 found_type = btrfs_file_extent_type(leaf, extent);
473 compression = btrfs_file_extent_compression(leaf,
474 extent);
475 encryption = btrfs_file_extent_encryption(leaf,
476 extent);
477 other_encoding = btrfs_file_extent_other_encoding(leaf,
478 extent);
640 if (found_type == BTRFS_FILE_EXTENT_REG) { 479 if (found_type == BTRFS_FILE_EXTENT_REG) {
641 extent_end = 480 extent_end =
642 btrfs_file_extent_disk_bytenr(leaf, 481 btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
646 485
647 extent_end = key.offset + 486 extent_end = key.offset +
648 btrfs_file_extent_num_bytes(leaf, extent); 487 btrfs_file_extent_num_bytes(leaf, extent);
488 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
489 extent);
649 found_extent = 1; 490 found_extent = 1;
650 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 491 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
651 struct btrfs_item *item;
652 item = btrfs_item_nr(leaf, slot);
653 found_inline = 1; 492 found_inline = 1;
654 extent_end = key.offset + 493 extent_end = key.offset +
655 btrfs_file_extent_inline_len(leaf, item); 494 btrfs_file_extent_inline_len(leaf, extent);
656 } 495 }
657 } else { 496 } else {
658 extent_end = search_start; 497 extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
680 search_start = (extent_end + mask) & ~mask; 519 search_start = (extent_end + mask) & ~mask;
681 } else 520 } else
682 search_start = extent_end; 521 search_start = extent_end;
683 if (end <= extent_end && start >= key.offset && found_inline) { 522
523 if (end <= extent_end && start >= key.offset && found_inline)
684 *hint_byte = EXTENT_MAP_INLINE; 524 *hint_byte = EXTENT_MAP_INLINE;
685 goto out;
686 }
687 525
688 if (found_extent) { 526 if (found_extent) {
689 read_extent_buffer(leaf, &old, (unsigned long)extent, 527 read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
770 write_extent_buffer(leaf, &old, 608 write_extent_buffer(leaf, &old,
771 (unsigned long)extent, sizeof(old)); 609 (unsigned long)extent, sizeof(old));
772 610
611 btrfs_set_file_extent_compression(leaf, extent,
612 compression);
613 btrfs_set_file_extent_encryption(leaf, extent,
614 encryption);
615 btrfs_set_file_extent_other_encoding(leaf, extent,
616 other_encoding);
773 btrfs_set_file_extent_offset(leaf, extent, 617 btrfs_set_file_extent_offset(leaf, extent,
774 le64_to_cpu(old.offset) + end - key.offset); 618 le64_to_cpu(old.offset) + end - key.offset);
775 WARN_ON(le64_to_cpu(old.num_bytes) < 619 WARN_ON(le64_to_cpu(old.num_bytes) <
776 (extent_end - end)); 620 (extent_end - end));
777 btrfs_set_file_extent_num_bytes(leaf, extent, 621 btrfs_set_file_extent_num_bytes(leaf, extent,
778 extent_end - end); 622 extent_end - end);
623
624 /*
625 * set the ram bytes to the size of the full extent
626 * before splitting. This is a worst case flag,
627 * but its the best we can do because we don't know
628 * how splitting affects compression
629 */
630 btrfs_set_file_extent_ram_bytes(leaf, extent,
631 ram_bytes);
779 btrfs_set_file_extent_type(leaf, extent, 632 btrfs_set_file_extent_type(leaf, extent,
780 BTRFS_FILE_EXTENT_REG); 633 BTRFS_FILE_EXTENT_REG);
781 634