author    Chris Mason <chris.mason@oracle.com>  2008-10-29 14:49:59 -0400
committer Chris Mason <chris.mason@oracle.com>  2008-10-29 14:49:59 -0400
commit    c8b978188c9a0fd3d535c13debd19d522b726f1f
tree      873628723fb82fe2a7c77adc65fa93eca1d61c0c
parent    26ce34a9c47334ff7984769e4661b2f1883594ff
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large surgery
to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert an
inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c
etc) are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.  Space
in the disk format is allocated for compression encoding, as well as
encryption and a generic 'other' field.  Neither the encryption nor the
'other' field is currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a software
only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded
because it is usually done by a single pdflush thread.  This makes it tricky
to spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
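To make the bail-out policy above concrete, here is a minimal userspace sketch
(not part of the patch) of the same "keep the compressed copy only if it is
actually smaller" test, using zlib's one-shot compress2() at compression
level 3, the level this patch passes to zlib_deflateInit():

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	int main(void)
	{
		unsigned char in[8192], out[8192];
		uLongf out_len = sizeof(out);

		memset(in, 'a', sizeof(in));	/* highly compressible input */
		if (compress2(out, &out_len, in, sizeof(in), 3) != Z_OK)
			return 1;
		/* same policy as the patch: give up unless output is smaller */
		printf("%s\n", out_len < sizeof(in) ? "store compressed"
						    : "store uncompressed");
		return 0;
	}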
Diffstat (limited to 'fs/btrfs/zlib.c')
-rw-r--r--  fs/btrfs/zlib.c  637
1 file changed, 637 insertions(+), 0 deletions(-)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..e99309180a11
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Based on jffs2 zlib code:
 * Copyright © 2001-2007 Red Hat, Inc.
 * Created by David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/zutil.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>

/* Plan: call deflate() with avail_in == *sourcelen,
   avail_out = *dstlen - 12 and flush == Z_FINISH.
   If it doesn't manage to finish, call it again with
   avail_in == 0 and avail_out set to the remaining 12
   bytes for it to clean up.
   Q: Is 12 bytes sufficient?
*/
#define STREAM_END_SPACE 12

struct workspace {
	z_stream inf_strm;
	z_stream def_strm;
	char *buf;
	struct list_head list;
};

static LIST_HEAD(idle_workspace);
static DEFINE_SPINLOCK(workspace_lock);
static unsigned long num_workspace;
static atomic_t alloc_workspace = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
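
/*
 * Pool bookkeeping: idle_workspace holds reusable workspaces,
 * num_workspace counts the entries on that list, and alloc_workspace
 * counts every workspace currently allocated.  find_zlib_workspace()
 * stops allocating new ones once roughly one workspace per online CPU
 * exists; additional callers sleep on workspace_wait until
 * free_workspace() returns one to the pool.
 */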

/*
 * this finds an available zlib workspace or allocates a new one.
 * An ERR_PTR is returned if things go bad.
 */
static struct workspace *find_zlib_workspace(void)
{
	struct workspace *workspace;
	int ret;
	int cpus = num_online_cpus();

again:
	spin_lock(&workspace_lock);
	if (!list_empty(&idle_workspace)) {
		workspace = list_entry(idle_workspace.next, struct workspace,
				       list);
		list_del(&workspace->list);
		num_workspace--;
		spin_unlock(&workspace_lock);
		return workspace;
	}
	spin_unlock(&workspace_lock);
	if (atomic_read(&alloc_workspace) > cpus) {
		DEFINE_WAIT(wait);
		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&alloc_workspace) > cpus)
			schedule();
		finish_wait(&workspace_wait, &wait);
		goto again;
	}
	atomic_inc(&alloc_workspace);
	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
	if (!workspace) {
		ret = -ENOMEM;
		goto fail;
	}

	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
	if (!workspace->def_strm.workspace) {
		ret = -ENOMEM;
		goto fail;
	}
	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
	if (!workspace->inf_strm.workspace) {
		ret = -ENOMEM;
		goto fail_inflate;
	}
	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
	if (!workspace->buf) {
		ret = -ENOMEM;
		goto fail_kmalloc;
	}
	return workspace;

fail_kmalloc:
	vfree(workspace->inf_strm.workspace);
fail_inflate:
	vfree(workspace->def_strm.workspace);
fail:
	kfree(workspace);
	atomic_dec(&alloc_workspace);
	wake_up(&workspace_wait);
	return ERR_PTR(ret);
}

/*
 * put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around
 */
static int free_workspace(struct workspace *workspace)
{
	spin_lock(&workspace_lock);
	if (num_workspace < num_online_cpus()) {
		list_add_tail(&workspace->list, &idle_workspace);
		num_workspace++;
		spin_unlock(&workspace_lock);
		if (waitqueue_active(&workspace_wait))
			wake_up(&workspace_wait);
		return 0;
	}
	spin_unlock(&workspace_lock);
	vfree(workspace->def_strm.workspace);
	vfree(workspace->inf_strm.workspace);
	kfree(workspace->buf);
	kfree(workspace);

	atomic_dec(&alloc_workspace);
	if (waitqueue_active(&workspace_wait))
		wake_up(&workspace_wait);
	return 0;
}

/*
 * cleanup function for module exit
 */
static void free_workspaces(void)
{
	struct workspace *workspace;

	while (!list_empty(&idle_workspace)) {
		workspace = list_entry(idle_workspace.next, struct workspace,
				       list);
		list_del(&workspace->list);
		vfree(workspace->def_strm.workspace);
		vfree(workspace->inf_strm.workspace);
		kfree(workspace->buf);
		kfree(workspace);
		atomic_dec(&alloc_workspace);
	}
}

/*
 * given an address space and start/len, compress the bytes.
 *
 * pages are allocated to hold the compressed result and stored
 * in 'pages'
 *
 * out_pages is used to return the number of pages allocated.  There
 * may be pages allocated even if we return an error
 *
 * total_in is used to return the number of bytes actually read.  It
 * may be smaller than len if we had to exit early because we
 * ran out of room in the pages array or because we crossed the
 * max_out threshold.
 *
 * total_out is used to return the total number of compressed bytes
 *
 * max_out tells us the max number of bytes that we're allowed to
 * stuff into pages
 */
int btrfs_zlib_compress_pages(struct address_space *mapping,
			      u64 start, unsigned long len,
			      struct page **pages,
			      unsigned long nr_dest_pages,
			      unsigned long *out_pages,
			      unsigned long *total_in,
			      unsigned long *total_out,
			      unsigned long max_out)
{
	int ret;
	struct workspace *workspace;
	char *data_in;
	char *cpage_out;
	int nr_pages = 0;
	struct page *in_page = NULL;
	struct page *out_page = NULL;
	int out_written = 0;
	int in_read = 0;
	unsigned long bytes_left;

	*out_pages = 0;
	*total_out = 0;
	*total_in = 0;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return -1;

	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
		printk(KERN_WARNING "deflateInit failed\n");
		ret = -1;
		goto out;
	}

	workspace->def_strm.total_in = 0;
	workspace->def_strm.total_out = 0;

	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
	data_in = kmap(in_page);

	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	cpage_out = kmap(out_page);
	pages[0] = out_page;
	nr_pages = 1;

	workspace->def_strm.next_in = data_in;
	workspace->def_strm.next_out = cpage_out;
	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);

	out_written = 0;
	in_read = 0;

	while (workspace->def_strm.total_in < len) {
		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
		if (ret != Z_OK) {
			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
			       ret);
			zlib_deflateEnd(&workspace->def_strm);
			ret = -1;
			goto out;
		}

		/* we're making it bigger, give up */
		if (workspace->def_strm.total_in > 8192 &&
		    workspace->def_strm.total_in <
		    workspace->def_strm.total_out) {
			ret = -1;
			goto out;
		}
		/* we need another page to write into.  Test this
		 * before checking total_in so we will pull in a new
		 * page for the stream end if required
		 */
		if (workspace->def_strm.avail_out == 0) {
			kunmap(out_page);
			if (nr_pages == nr_dest_pages) {
				out_page = NULL;
				ret = -1;
				goto out;
			}
			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
			cpage_out = kmap(out_page);
			pages[nr_pages] = out_page;
			nr_pages++;
			workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
			workspace->def_strm.next_out = cpage_out;
		}
		/* we're all done */
		if (workspace->def_strm.total_in >= len)
			break;

		/* we've read in a full page, get a new one */
		if (workspace->def_strm.avail_in == 0) {
			if (workspace->def_strm.total_out > max_out)
				break;

			bytes_left = len - workspace->def_strm.total_in;
			kunmap(in_page);
			page_cache_release(in_page);

			start += PAGE_CACHE_SIZE;
			in_page = find_get_page(mapping,
						start >> PAGE_CACHE_SHIFT);
			data_in = kmap(in_page);
			workspace->def_strm.avail_in = min(bytes_left,
							   PAGE_CACHE_SIZE);
			workspace->def_strm.next_in = data_in;
		}
	}
	workspace->def_strm.avail_in = 0;
	ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
	zlib_deflateEnd(&workspace->def_strm);

	if (ret != Z_STREAM_END) {
		ret = -1;
		goto out;
	}

	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
		ret = -1;
		goto out;
	}

	ret = 0;
	*total_out = workspace->def_strm.total_out;
	*total_in = workspace->def_strm.total_in;
out:
	*out_pages = nr_pages;
	if (out_page)
		kunmap(out_page);

	if (in_page) {
		kunmap(in_page);
		page_cache_release(in_page);
	}
	free_workspace(workspace);
	return ret;
}
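
/*
 * Illustrative only -- a hypothetical caller (not from this patch),
 * compressing one 128k chunk of a file into at most 32 destination pages:
 *
 *	struct page *cpages[32];
 *	unsigned long nr_pages_out, tot_in, tot_out;
 *	int err;
 *
 *	err = btrfs_zlib_compress_pages(inode->i_mapping, start, 128 * 1024,
 *					cpages, 32, &nr_pages_out,
 *					&tot_in, &tot_out, 128 * 1024);
 *
 * On success the compressed bytes live in cpages[0..nr_pages_out - 1];
 * on failure the caller must still put the nr_pages_out allocated pages.
 */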

/*
 * pages_in is an array of pages with compressed data.
 *
 * disk_start is the starting logical offset of this array in the file
 *
 * bvec is a bio_vec of pages from the file that we want to decompress into
 *
 * vcnt is the count of pages in the biovec
 *
 * srclen is the number of bytes in pages_in
 *
 * The basic idea is that we have a bio that was created by readpages.
 * The pages in the bio are for the uncompressed data, and they may not
 * be contiguous.  They all correspond to the range of bytes covered by
 * the compressed extent.
 */
int btrfs_zlib_decompress_biovec(struct page **pages_in,
				 u64 disk_start,
				 struct bio_vec *bvec,
				 int vcnt,
				 size_t srclen)
{
	int ret = 0;
	int wbits = MAX_WBITS;
	struct workspace *workspace;
	char *data_in;
	size_t total_out = 0;
	unsigned long page_bytes_left;
	unsigned long page_in_index = 0;
	unsigned long page_out_index = 0;
	struct page *page_out;
	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
				       PAGE_CACHE_SIZE;
	unsigned long buf_start;
	unsigned long buf_offset;
	unsigned long bytes;
	unsigned long working_bytes;
	unsigned long pg_offset;
	unsigned long start_byte;
	unsigned long current_buf_start;
	char *kaddr;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);

	data_in = kmap(pages_in[page_in_index]);
	workspace->inf_strm.next_in = data_in;
	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
	workspace->inf_strm.total_in = 0;

	workspace->inf_strm.total_out = 0;
	workspace->inf_strm.next_out = workspace->buf;
	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	page_out = bvec[page_out_index].bv_page;
	page_bytes_left = PAGE_CACHE_SIZE;
	pg_offset = 0;

	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
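	/*
	 * The test below is the standard zlib (RFC 1950) header check:
	 * byte 0 is CMF (low nibble 8 == deflate, high nibble ==
	 * log2(window size) - 8), byte 1 is FLG (bit 0x20 == preset
	 * dictionary), and the two bytes read as a big-endian value are
	 * always a multiple of 31.  Passing negative wbits to
	 * zlib_inflateInit2() asks for a raw deflate stream, which is
	 * why the two header bytes are skipped by hand here.
	 */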
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0] << 8) + data_in[1]) % 31)) {

		wbits = -((data_in[0] >> 4) + 8);
		workspace->inf_strm.next_in += 2;
		workspace->inf_strm.avail_in -= 2;
	}

	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
		printk(KERN_WARNING "inflateInit failed\n");
		ret = -1;
		goto out;
	}
	while (workspace->inf_strm.total_in < srclen) {
		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;

		/*
		 * buf_start is the offset of the first byte in our
		 * workspace buffer, relative to the start of this
		 * extent's uncompressed data
		 */
		buf_start = total_out;

		/* total_out is the running count of uncompressed bytes;
		 * the buffer now holds bytes buf_start..total_out - 1
		 */
		total_out = workspace->inf_strm.total_out;

		working_bytes = total_out - buf_start;

		/*
		 * start_byte is the offset of the page we're currently
		 * copying into, also relative to the start of the
		 * uncompressed data
		 */
		start_byte = page_offset(page_out) - disk_start;

		if (working_bytes == 0) {
			/* we didn't make progress in this inflate
			 * call, we're done
			 */
			if (ret != Z_STREAM_END)
				ret = -1;
			break;
		}

		/* we haven't yet hit data corresponding to this page */
		if (total_out <= start_byte)
			goto next;

		/*
		 * the start of the data we care about is offset into
		 * the middle of our working buffer
		 */
		if (total_out > start_byte && buf_start < start_byte) {
			buf_offset = start_byte - buf_start;
			working_bytes -= buf_offset;
		} else {
			buf_offset = 0;
		}
		current_buf_start = buf_start;

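		/*
		 * Worked example (illustrative, assuming 4k pages): if
		 * this inflate pass produced bytes 10240..14335 of the
		 * extent (buf_start 10240, total_out 14336) and page_out
		 * maps offset 12288, then buf_offset is 2048 and the
		 * loop below copies the remaining 2048 working bytes
		 * into the page.
		 */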
		/* copy bytes from the working buffer into the pages */
		while (working_bytes > 0) {
			bytes = min(PAGE_CACHE_SIZE - pg_offset,
				    PAGE_CACHE_SIZE - buf_offset);
			bytes = min(bytes, working_bytes);
			kaddr = kmap_atomic(page_out, KM_USER0);
			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
			       bytes);
			kunmap_atomic(kaddr, KM_USER0);
			flush_dcache_page(page_out);

			pg_offset += bytes;
			page_bytes_left -= bytes;
			buf_offset += bytes;
			working_bytes -= bytes;
			current_buf_start += bytes;

			/* check if we need to pick another page */
			if (page_bytes_left == 0) {
				page_out_index++;
				if (page_out_index >= vcnt) {
					ret = 0;
					goto done;
				}
				page_out = bvec[page_out_index].bv_page;
				pg_offset = 0;
				page_bytes_left = PAGE_CACHE_SIZE;
				start_byte = page_offset(page_out) -
					     disk_start;

				/*
				 * make sure our new page is covered by this
				 * working buffer
				 */
				if (total_out <= start_byte)
					goto next;

				/* the next page in the biovec might not
				 * be adjacent to the last page, but it
				 * might still be found inside this working
				 * buffer.  bump our offset pointer
				 */
				if (total_out > start_byte &&
				    current_buf_start < start_byte) {
					buf_offset = start_byte - buf_start;
					working_bytes = total_out - start_byte;
					current_buf_start = buf_start +
							    buf_offset;
				}
			}
		}
next:
		workspace->inf_strm.next_out = workspace->buf;
		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;

		if (workspace->inf_strm.avail_in == 0) {
			unsigned long tmp;

			kunmap(pages_in[page_in_index]);
			page_in_index++;
			if (page_in_index >= total_pages_in) {
				data_in = NULL;
				break;
			}
			data_in = kmap(pages_in[page_in_index]);
			workspace->inf_strm.next_in = data_in;
			tmp = srclen - workspace->inf_strm.total_in;
			workspace->inf_strm.avail_in = min(tmp,
							   PAGE_CACHE_SIZE);
		}
	}
	if (ret != Z_STREAM_END)
		ret = -1;
	else
		ret = 0;
done:
	zlib_inflateEnd(&workspace->inf_strm);
	if (data_in)
		kunmap(pages_in[page_in_index]);
out:
	free_workspace(workspace);
	return ret;
}

/*
 * a less complex decompression routine.  Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * start_byte tells us the offset into the uncompressed data we're
 * interested in.
 */
int btrfs_zlib_decompress(unsigned char *data_in,
			  struct page *dest_page,
			  unsigned long start_byte,
			  size_t srclen, size_t destlen)
{
	int ret = 0;
	int wbits = MAX_WBITS;
	struct workspace *workspace;
	unsigned long bytes_left = destlen;
	unsigned long total_out = 0;
	char *kaddr;

	if (destlen > PAGE_CACHE_SIZE)
		return -ENOMEM;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);

	workspace->inf_strm.next_in = data_in;
	workspace->inf_strm.avail_in = srclen;
	workspace->inf_strm.total_in = 0;

	workspace->inf_strm.next_out = workspace->buf;
	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	workspace->inf_strm.total_out = 0;
	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0] << 8) + data_in[1]) % 31)) {

		wbits = -((data_in[0] >> 4) + 8);
		workspace->inf_strm.next_in += 2;
		workspace->inf_strm.avail_in -= 2;
	}

	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
		printk(KERN_WARNING "inflateInit failed\n");
		ret = -1;
		goto out;
	}

	while (bytes_left > 0) {
		unsigned long buf_start;
		unsigned long buf_offset;
		unsigned long bytes;
		unsigned long pg_offset = 0;

		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;

		buf_start = total_out;
		total_out = workspace->inf_strm.total_out;

		if (total_out == buf_start) {
			ret = -1;
			break;
		}

		if (total_out <= start_byte)
			goto next;

		if (total_out > start_byte && buf_start < start_byte)
			buf_offset = start_byte - buf_start;
		else
			buf_offset = 0;

		bytes = min(PAGE_CACHE_SIZE - pg_offset,
			    PAGE_CACHE_SIZE - buf_offset);
		bytes = min(bytes, bytes_left);

		kaddr = kmap_atomic(dest_page, KM_USER0);
		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
		kunmap_atomic(kaddr, KM_USER0);

		pg_offset += bytes;
		bytes_left -= bytes;
next:
		workspace->inf_strm.next_out = workspace->buf;
		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	}
	if (ret != Z_STREAM_END && bytes_left != 0)
		ret = -1;
	else
		ret = 0;

	zlib_inflateEnd(&workspace->inf_strm);
out:
	free_workspace(workspace);
	return ret;
}
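
/*
 * Illustrative only -- a hypothetical caller (not from this patch) that
 * has the compressed bytes of a small extent in 'cbuf' and wants the
 * page at file offset 'page_start', where 'em_start' is the logical
 * start of the extent:
 *
 *	ret = btrfs_zlib_decompress(cbuf, page, page_start - em_start,
 *				    csize, PAGE_CACHE_SIZE);
 */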

void btrfs_zlib_exit(void)
{
	free_workspaces();
}