aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/compression.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-29 14:49:59 -0400
commitc8b978188c9a0fd3d535c13debd19d522b726f1f (patch)
tree873628723fb82fe2a7c77adc65fa93eca1d61c0c /fs/btrfs/compression.c
parent26ce34a9c47334ff7984769e4661b2f1883594ff (diff)
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption nor the 'other' field is currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software-only limit; the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software-only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. 
This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically single threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/compression.c')
-rw-r--r--fs/btrfs/compression.c454
1 files changed, 454 insertions, 0 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..c5470367ca5
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include "ctree.h"
37#include "disk-io.h"
38#include "transaction.h"
39#include "btrfs_inode.h"
40#include "volumes.h"
41#include "ordered-data.h"
42#include "compat.h"
43#include "compression.h"
44#include "extent_io.h"
45#include "extent_map.h"
46
47struct compressed_bio {
48 /* number of bios pending for this compressed extent */
49 atomic_t pending_bios;
50
51 /* the pages with the compressed data on them */
52 struct page **compressed_pages;
53
54 /* inode that owns this data */
55 struct inode *inode;
56
57 /* starting offset in the inode for our pages */
58 u64 start;
59
60 /* number of bytes in the inode we're working on */
61 unsigned long len;
62
63 /* number of bytes on disk */
64 unsigned long compressed_len;
65
66 /* number of compressed pages in the array */
67 unsigned long nr_pages;
68
69 /* IO errors */
70 int errors;
71
72 /* for reads, this is the bio we are copying the data into */
73 struct bio *orig_bio;
74};
75
76static struct bio *compressed_bio_alloc(struct block_device *bdev,
77 u64 first_byte, gfp_t gfp_flags)
78{
79 struct bio *bio;
80 int nr_vecs;
81
82 nr_vecs = bio_get_nr_vecs(bdev);
83 bio = bio_alloc(gfp_flags, nr_vecs);
84
85 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
86 while (!bio && (nr_vecs /= 2))
87 bio = bio_alloc(gfp_flags, nr_vecs);
88 }
89
90 if (bio) {
91 bio->bi_size = 0;
92 bio->bi_bdev = bdev;
93 bio->bi_sector = first_byte >> 9;
94 }
95 return bio;
96}
97
98/* when we finish reading compressed pages from the disk, we
99 * decompress them and then run the bio end_io routines on the
100 * decompressed pages (in the inode address space).
101 *
102 * This allows the checksumming and other IO error handling routines
103 * to work normally
104 *
105 * The compressed pages are freed here, and it must be run
106 * in process context
107 */
108static void end_compressed_bio_read(struct bio *bio, int err)
109{
110 struct extent_io_tree *tree;
111 struct compressed_bio *cb = bio->bi_private;
112 struct inode *inode;
113 struct page *page;
114 unsigned long index;
115 int ret;
116
117 if (err)
118 cb->errors = 1;
119
120 /* if there are more bios still pending for this compressed
121 * extent, just exit
122 */
123 if (!atomic_dec_and_test(&cb->pending_bios))
124 goto out;
125
126 /* ok, we're the last bio for this extent, lets start
127 * the decompression.
128 */
129 inode = cb->inode;
130 tree = &BTRFS_I(inode)->io_tree;
131 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
132 cb->start,
133 cb->orig_bio->bi_io_vec,
134 cb->orig_bio->bi_vcnt,
135 cb->compressed_len);
136 if (ret)
137 cb->errors = 1;
138
139 /* release the compressed pages */
140 index = 0;
141 for (index = 0; index < cb->nr_pages; index++) {
142 page = cb->compressed_pages[index];
143 page->mapping = NULL;
144 page_cache_release(page);
145 }
146
147 /* do io completion on the original bio */
148 if (cb->errors)
149 bio_io_error(cb->orig_bio);
150 else
151 bio_endio(cb->orig_bio, 0);
152
153 /* finally free the cb struct */
154 kfree(cb->compressed_pages);
155 kfree(cb);
156out:
157 bio_put(bio);
158}
159
160/*
161 * Clear the writeback bits on all of the file
162 * pages for a compressed write
163 */
164static noinline int end_compressed_writeback(struct inode *inode, u64 start,
165 unsigned long ram_size)
166{
167 unsigned long index = start >> PAGE_CACHE_SHIFT;
168 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
169 struct page *pages[16];
170 unsigned long nr_pages = end_index - index + 1;
171 int i;
172 int ret;
173
174 while(nr_pages > 0) {
175 ret = find_get_pages_contig(inode->i_mapping, index,
176 min(nr_pages, ARRAY_SIZE(pages)), pages);
177 if (ret == 0) {
178 nr_pages -= 1;
179 index += 1;
180 continue;
181 }
182 for (i = 0; i < ret; i++) {
183 end_page_writeback(pages[i]);
184 page_cache_release(pages[i]);
185 }
186 nr_pages -= ret;
187 index += ret;
188 }
189 /* the inode may be gone now */
190 return 0;
191}
192
193/*
194 * do the cleanup once all the compressed pages hit the disk.
195 * This will clear writeback on the file pages and free the compressed
196 * pages.
197 *
198 * This also calls the writeback end hooks for the file pages so that
199 * metadata and checksums can be updated in the file.
200 */
201static void end_compressed_bio_write(struct bio *bio, int err)
202{
203 struct extent_io_tree *tree;
204 struct compressed_bio *cb = bio->bi_private;
205 struct inode *inode;
206 struct page *page;
207 unsigned long index;
208
209 if (err)
210 cb->errors = 1;
211
212 /* if there are more bios still pending for this compressed
213 * extent, just exit
214 */
215 if (!atomic_dec_and_test(&cb->pending_bios))
216 goto out;
217
218 /* ok, we're the last bio for this extent, step one is to
219 * call back into the FS and do all the end_io operations
220 */
221 inode = cb->inode;
222 tree = &BTRFS_I(inode)->io_tree;
223 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
224 cb->start,
225 cb->start + cb->len - 1,
226 NULL, 1);
227
228 end_compressed_writeback(inode, cb->start, cb->len);
229 /* note, our inode could be gone now */
230
231 /*
232 * release the compressed pages, these came from alloc_page and
233 * are not attached to the inode at all
234 */
235 index = 0;
236 for (index = 0; index < cb->nr_pages; index++) {
237 page = cb->compressed_pages[index];
238 page->mapping = NULL;
239 page_cache_release(page);
240 }
241
242 /* finally free the cb struct */
243 kfree(cb->compressed_pages);
244 kfree(cb);
245out:
246 bio_put(bio);
247}
248
249/*
250 * worker function to build and submit bios for previously compressed pages.
251 * The corresponding pages in the inode should be marked for writeback
252 * and the compressed pages should have a reference on them for dropping
253 * when the IO is complete.
254 *
255 * This also checksums the file bytes and gets things ready for
256 * the end io hooks.
257 */
258int btrfs_submit_compressed_write(struct inode *inode, u64 start,
259 unsigned long len, u64 disk_start,
260 unsigned long compressed_len,
261 struct page **compressed_pages,
262 unsigned long nr_pages)
263{
264 struct bio *bio = NULL;
265 struct btrfs_root *root = BTRFS_I(inode)->root;
266 struct compressed_bio *cb;
267 unsigned long bytes_left;
268 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
269 int page_index = 0;
270 struct page *page;
271 u64 first_byte = disk_start;
272 struct block_device *bdev;
273 int ret;
274
275 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
276 cb = kmalloc(sizeof(*cb), GFP_NOFS);
277 atomic_set(&cb->pending_bios, 0);
278 cb->errors = 0;
279 cb->inode = inode;
280 cb->start = start;
281 cb->len = len;
282 cb->compressed_pages = compressed_pages;
283 cb->compressed_len = compressed_len;
284 cb->orig_bio = NULL;
285 cb->nr_pages = nr_pages;
286
287 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
288
289 ret = btrfs_csum_file_bytes(root, inode, start, len);
290 BUG_ON(ret);
291
292 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
293 bio->bi_private = cb;
294 bio->bi_end_io = end_compressed_bio_write;
295 atomic_inc(&cb->pending_bios);
296
297 /* create and submit bios for the compressed pages */
298 bytes_left = compressed_len;
299 while(bytes_left > 0) {
300 page = compressed_pages[page_index];
301 page->mapping = inode->i_mapping;
302 if (bio->bi_size)
303 ret = io_tree->ops->merge_bio_hook(page, 0,
304 PAGE_CACHE_SIZE,
305 bio, 0);
306 else
307 ret = 0;
308
309 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
310 PAGE_CACHE_SIZE) {
311 bio_get(bio);
312
313 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
314 BUG_ON(ret);
315
316 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
317 BUG_ON(ret);
318
319 bio_put(bio);
320
321 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
322 atomic_inc(&cb->pending_bios);
323 bio->bi_private = cb;
324 bio->bi_end_io = end_compressed_bio_write;
325 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
326 }
327 page_index++;
328 bytes_left -= PAGE_CACHE_SIZE;
329 first_byte += PAGE_CACHE_SIZE;
330 }
331 bio_get(bio);
332
333 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
334 BUG_ON(ret);
335
336 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
337 BUG_ON(ret);
338
339 bio_put(bio);
340 return 0;
341}
342
343/*
344 * for a compressed read, the bio we get passed has all the inode pages
345 * in it. We don't actually do IO on those pages but allocate new ones
346 * to hold the compressed pages on disk.
347 *
348 * bio->bi_sector points to the compressed extent on disk
349 * bio->bi_io_vec points to all of the inode pages
350 * bio->bi_vcnt is a count of pages
351 *
352 * After the compressed pages are read, we copy the bytes into the
353 * bio we were passed and then call the bio end_io calls
354 */
355int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
356 int mirror_num, unsigned long bio_flags)
357{
358 struct extent_io_tree *tree;
359 struct extent_map_tree *em_tree;
360 struct compressed_bio *cb;
361 struct btrfs_root *root = BTRFS_I(inode)->root;
362 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
363 unsigned long compressed_len;
364 unsigned long nr_pages;
365 unsigned long page_index;
366 struct page *page;
367 struct block_device *bdev;
368 struct bio *comp_bio;
369 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
370 struct extent_map *em;
371 int ret;
372
373 tree = &BTRFS_I(inode)->io_tree;
374 em_tree = &BTRFS_I(inode)->extent_tree;
375
376 /* we need the actual starting offset of this extent in the file */
377 spin_lock(&em_tree->lock);
378 em = lookup_extent_mapping(em_tree,
379 page_offset(bio->bi_io_vec->bv_page),
380 PAGE_CACHE_SIZE);
381 spin_unlock(&em_tree->lock);
382
383 cb = kmalloc(sizeof(*cb), GFP_NOFS);
384 atomic_set(&cb->pending_bios, 0);
385 cb->errors = 0;
386 cb->inode = inode;
387
388 cb->start = em->start;
389 compressed_len = em->block_len;
390 free_extent_map(em);
391
392 cb->len = uncompressed_len;
393 cb->compressed_len = compressed_len;
394 cb->orig_bio = bio;
395
396 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
397 PAGE_CACHE_SIZE;
398 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
399 GFP_NOFS);
400 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
401
402 for (page_index = 0; page_index < nr_pages; page_index++) {
403 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
404 __GFP_HIGHMEM);
405 }
406 cb->nr_pages = nr_pages;
407
408 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
409 comp_bio->bi_private = cb;
410 comp_bio->bi_end_io = end_compressed_bio_read;
411 atomic_inc(&cb->pending_bios);
412
413 for (page_index = 0; page_index < nr_pages; page_index++) {
414 page = cb->compressed_pages[page_index];
415 page->mapping = inode->i_mapping;
416 if (comp_bio->bi_size)
417 ret = tree->ops->merge_bio_hook(page, 0,
418 PAGE_CACHE_SIZE,
419 comp_bio, 0);
420 else
421 ret = 0;
422
423 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
424 PAGE_CACHE_SIZE) {
425 bio_get(comp_bio);
426
427 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
431 BUG_ON(ret);
432
433 bio_put(comp_bio);
434
435 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
436 GFP_NOFS);
437 atomic_inc(&cb->pending_bios);
438 bio->bi_private = cb;
439 bio->bi_end_io = end_compressed_bio_write;
440 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
441 }
442 cur_disk_byte += PAGE_CACHE_SIZE;
443 }
444 bio_get(comp_bio);
445
446 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
447 BUG_ON(ret);
448
449 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
450 BUG_ON(ret);
451
452 bio_put(comp_bio);
453 return 0;
454}