author    Chris Mason <chris.mason@oracle.com>  2008-10-29 14:49:59 -0400
committer Chris Mason <chris.mason@oracle.com>  2008-10-29 14:49:59 -0400
commit    c8b978188c9a0fd3d535c13debd19d522b726f1f
tree      873628723fb82fe2a7c77adc65fa93eca1d61c0c
parent    26ce34a9c47334ff7984769e4661b2f1883594ff
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large surgery
to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert an
inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c
etc) are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.  Space
in the disk format is allocated for compression encoding, as well as
encryption and a generic 'other' field.  Neither the encryption nor the
'other' field is currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a software
only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded
because it is usually done by a single pdflush thread.  This makes it tricky
to spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
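To make the bail-out policy above concrete, here is a minimal userspace sketch
(not part of the patch) of the same "keep the compressed copy only if it is
actually smaller" test, using zlib's one-shot compress2() at compression
level 3, the level this patch passes to zlib_deflateInit():

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	int main(void)
	{
		unsigned char in[8192], out[8192];
		uLongf out_len = sizeof(out);

		memset(in, 'a', sizeof(in));	/* highly compressible input */
		if (compress2(out, &out_len, in, sizeof(in), 3) != Z_OK)
			return 1;
		/* same policy as the patch: give up unless output is smaller */
		printf("%s\n", out_len < sizeof(in) ? "store compressed"
						    : "store uncompressed");
		return 0;
	}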
Diffstat (limited to 'fs/btrfs/zlib.c')
-rw-r--r--  fs/btrfs/zlib.c  637
1 file changed, 637 insertions(+), 0 deletions(-)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..e99309180a11
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Based on jffs2 zlib code:
 * Copyright © 2001-2007 Red Hat, Inc.
 * Created by David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/zutil.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>

/* Plan: call deflate() with avail_in == *sourcelen,
   avail_out = *dstlen - 12 and flush == Z_FINISH.
   If it doesn't manage to finish, call it again with
   avail_in == 0 and avail_out set to the remaining 12
   bytes for it to clean up.
   Q: Is 12 bytes sufficient?
*/
#define STREAM_END_SPACE 12

struct workspace {
	z_stream inf_strm;
	z_stream def_strm;
	char *buf;
	struct list_head list;
};

static LIST_HEAD(idle_workspace);
static DEFINE_SPINLOCK(workspace_lock);
static unsigned long num_workspace;
static atomic_t alloc_workspace = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
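
/*
 * Pool bookkeeping: idle_workspace holds reusable workspaces,
 * num_workspace counts the entries on that list, and alloc_workspace
 * counts every workspace currently allocated.  find_zlib_workspace()
 * stops allocating new ones once roughly one workspace per online CPU
 * exists; additional callers sleep on workspace_wait until
 * free_workspace() returns one to the pool.
 */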

/*
 * this finds an available zlib workspace or allocates a new one.
 * An ERR_PTR is returned if things go bad.
 */
static struct workspace *find_zlib_workspace(void)
{
	struct workspace *workspace;
	int ret;
	int cpus = num_online_cpus();

again:
	spin_lock(&workspace_lock);
	if (!list_empty(&idle_workspace)) {
		workspace = list_entry(idle_workspace.next, struct workspace,
				       list);
		list_del(&workspace->list);
		num_workspace--;
		spin_unlock(&workspace_lock);
		return workspace;
	}
	spin_unlock(&workspace_lock);
	if (atomic_read(&alloc_workspace) > cpus) {
		DEFINE_WAIT(wait);
		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&alloc_workspace) > cpus)
			schedule();
		finish_wait(&workspace_wait, &wait);
		goto again;
	}
	atomic_inc(&alloc_workspace);
	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
	if (!workspace) {
		ret = -ENOMEM;
		goto fail;
	}

	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
	if (!workspace->def_strm.workspace) {
		ret = -ENOMEM;
		goto fail;
	}
	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
	if (!workspace->inf_strm.workspace) {
		ret = -ENOMEM;
		goto fail_inflate;
	}
	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
	if (!workspace->buf) {
		ret = -ENOMEM;
		goto fail_kmalloc;
	}
	return workspace;

fail_kmalloc:
	vfree(workspace->inf_strm.workspace);
fail_inflate:
	vfree(workspace->def_strm.workspace);
fail:
	kfree(workspace);
	atomic_dec(&alloc_workspace);
	wake_up(&workspace_wait);
	return ERR_PTR(ret);
}

/*
 * put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around
 */
static int free_workspace(struct workspace *workspace)
{
	spin_lock(&workspace_lock);
	if (num_workspace < num_online_cpus()) {
		list_add_tail(&workspace->list, &idle_workspace);
		num_workspace++;
		spin_unlock(&workspace_lock);
		if (waitqueue_active(&workspace_wait))
			wake_up(&workspace_wait);
		return 0;
	}
	spin_unlock(&workspace_lock);
	vfree(workspace->def_strm.workspace);
	vfree(workspace->inf_strm.workspace);
	kfree(workspace->buf);
	kfree(workspace);

	atomic_dec(&alloc_workspace);
	if (waitqueue_active(&workspace_wait))
		wake_up(&workspace_wait);
	return 0;
}

/*
 * cleanup function for module exit
 */
static void free_workspaces(void)
{
	struct workspace *workspace;

	while (!list_empty(&idle_workspace)) {
		workspace = list_entry(idle_workspace.next, struct workspace,
				       list);
		list_del(&workspace->list);
		vfree(workspace->def_strm.workspace);
		vfree(workspace->inf_strm.workspace);
		kfree(workspace->buf);
		kfree(workspace);
		atomic_dec(&alloc_workspace);
	}
}

/*
 * given an address space and start/len, compress the bytes.
 *
 * pages are allocated to hold the compressed result and stored
 * in 'pages'
 *
 * out_pages is used to return the number of pages allocated.  There
 * may be pages allocated even if we return an error
 *
 * total_in is used to return the number of bytes actually read.  It
 * may be smaller than len if we had to exit early because we
 * ran out of room in the pages array or because we crossed the
 * max_out threshold.
 *
 * total_out is used to return the total number of compressed bytes
 *
 * max_out tells us the max number of bytes that we're allowed to
 * stuff into pages
 */
int btrfs_zlib_compress_pages(struct address_space *mapping,
			      u64 start, unsigned long len,
			      struct page **pages,
			      unsigned long nr_dest_pages,
			      unsigned long *out_pages,
			      unsigned long *total_in,
			      unsigned long *total_out,
			      unsigned long max_out)
{
	int ret;
	struct workspace *workspace;
	char *data_in;
	char *cpage_out;
	int nr_pages = 0;
	struct page *in_page = NULL;
	struct page *out_page = NULL;
	int out_written = 0;
	int in_read = 0;
	unsigned long bytes_left;

	*out_pages = 0;
	*total_out = 0;
	*total_in = 0;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return -1;

	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
		printk(KERN_WARNING "deflateInit failed\n");
		ret = -1;
		goto out;
	}

	workspace->def_strm.total_in = 0;
	workspace->def_strm.total_out = 0;

	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
	data_in = kmap(in_page);

	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	cpage_out = kmap(out_page);
	pages[0] = out_page;
	nr_pages = 1;

	workspace->def_strm.next_in = data_in;
	workspace->def_strm.next_out = cpage_out;
	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);

	out_written = 0;
	in_read = 0;

	while (workspace->def_strm.total_in < len) {
		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
		if (ret != Z_OK) {
			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
			       ret);
			zlib_deflateEnd(&workspace->def_strm);
			ret = -1;
			goto out;
		}

		/* we're making it bigger, give up */
		if (workspace->def_strm.total_in > 8192 &&
		    workspace->def_strm.total_in <
		    workspace->def_strm.total_out) {
			ret = -1;
			goto out;
		}
		/* we need another page to write into.  Test this
		 * before checking total_in so we will pull in a new
		 * page for the stream end if required
		 */
		if (workspace->def_strm.avail_out == 0) {
			kunmap(out_page);
			if (nr_pages == nr_dest_pages) {
				out_page = NULL;
				ret = -1;
				goto out;
			}
			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
			cpage_out = kmap(out_page);
			pages[nr_pages] = out_page;
			nr_pages++;
			workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
			workspace->def_strm.next_out = cpage_out;
		}
		/* we're all done */
		if (workspace->def_strm.total_in >= len)
			break;

		/* we've read in a full page, get a new one */
		if (workspace->def_strm.avail_in == 0) {
			if (workspace->def_strm.total_out > max_out)
				break;

			bytes_left = len - workspace->def_strm.total_in;
			kunmap(in_page);
			page_cache_release(in_page);

			start += PAGE_CACHE_SIZE;
			in_page = find_get_page(mapping,
						start >> PAGE_CACHE_SHIFT);
			data_in = kmap(in_page);
			workspace->def_strm.avail_in = min(bytes_left,
							   PAGE_CACHE_SIZE);
			workspace->def_strm.next_in = data_in;
		}
	}
	workspace->def_strm.avail_in = 0;
	ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
	zlib_deflateEnd(&workspace->def_strm);

	if (ret != Z_STREAM_END) {
		ret = -1;
		goto out;
	}

	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
		ret = -1;
		goto out;
	}

	ret = 0;
	*total_out = workspace->def_strm.total_out;
	*total_in = workspace->def_strm.total_in;
out:
	*out_pages = nr_pages;
	if (out_page)
		kunmap(out_page);

	if (in_page) {
		kunmap(in_page);
		page_cache_release(in_page);
	}
	free_workspace(workspace);
	return ret;
}
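
/*
 * Illustrative only -- a hypothetical caller (not from this patch),
 * compressing one 128k chunk of a file into at most 32 destination pages:
 *
 *	struct page *cpages[32];
 *	unsigned long nr_pages_out, tot_in, tot_out;
 *	int err;
 *
 *	err = btrfs_zlib_compress_pages(inode->i_mapping, start, 128 * 1024,
 *					cpages, 32, &nr_pages_out,
 *					&tot_in, &tot_out, 128 * 1024);
 *
 * On success the compressed bytes live in cpages[0..nr_pages_out - 1];
 * on failure the caller must still put the nr_pages_out allocated pages.
 */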

/*
 * pages_in is an array of pages with compressed data.
 *
 * disk_start is the starting logical offset of this array in the file
 *
 * bvec is a bio_vec of pages from the file that we want to decompress into
 *
 * vcnt is the count of pages in the biovec
 *
 * srclen is the number of bytes in pages_in
 *
 * The basic idea is that we have a bio that was created by readpages.
 * The pages in the bio are for the uncompressed data, and they may not
 * be contiguous.  They all correspond to the range of bytes covered by
 * the compressed extent.
 */
int btrfs_zlib_decompress_biovec(struct page **pages_in,
				 u64 disk_start,
				 struct bio_vec *bvec,
				 int vcnt,
				 size_t srclen)
{
	int ret = 0;
	int wbits = MAX_WBITS;
	struct workspace *workspace;
	char *data_in;
	size_t total_out = 0;
	unsigned long page_bytes_left;
	unsigned long page_in_index = 0;
	unsigned long page_out_index = 0;
	struct page *page_out;
	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
				       PAGE_CACHE_SIZE;
	unsigned long buf_start;
	unsigned long buf_offset;
	unsigned long bytes;
	unsigned long working_bytes;
	unsigned long pg_offset;
	unsigned long start_byte;
	unsigned long current_buf_start;
	char *kaddr;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);

	data_in = kmap(pages_in[page_in_index]);
	workspace->inf_strm.next_in = data_in;
	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
	workspace->inf_strm.total_in = 0;

	workspace->inf_strm.total_out = 0;
	workspace->inf_strm.next_out = workspace->buf;
	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	page_out = bvec[page_out_index].bv_page;
	page_bytes_left = PAGE_CACHE_SIZE;
	pg_offset = 0;

	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
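	/*
	 * The test below is the standard zlib (RFC 1950) header check:
	 * byte 0 is CMF (low nibble 8 == deflate, high nibble ==
	 * log2(window size) - 8), byte 1 is FLG (bit 0x20 == preset
	 * dictionary), and the two bytes read as a big-endian value are
	 * always a multiple of 31.  Passing negative wbits to
	 * zlib_inflateInit2() asks for a raw deflate stream, which is
	 * why the two header bytes are skipped by hand here.
	 */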
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0] << 8) + data_in[1]) % 31)) {

		wbits = -((data_in[0] >> 4) + 8);
		workspace->inf_strm.next_in += 2;
		workspace->inf_strm.avail_in -= 2;
	}

	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
		printk(KERN_WARNING "inflateInit failed\n");
		ret = -1;
		goto out;
	}
	while (workspace->inf_strm.total_in < srclen) {
		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;

		/*
		 * buf_start is the offset of the first byte in our
		 * workspace buffer, relative to the start of this
		 * extent's uncompressed data
		 */
		buf_start = total_out;

		/* total_out is the running count of uncompressed bytes;
		 * the buffer now holds bytes buf_start..total_out - 1
		 */
		total_out = workspace->inf_strm.total_out;

		working_bytes = total_out - buf_start;

		/*
		 * start_byte is the offset of the page we're currently
		 * copying into, also relative to the start of the
		 * uncompressed data
		 */
		start_byte = page_offset(page_out) - disk_start;

		if (working_bytes == 0) {
			/* we didn't make progress in this inflate
			 * call, we're done
			 */
			if (ret != Z_STREAM_END)
				ret = -1;
			break;
		}

		/* we haven't yet hit data corresponding to this page */
		if (total_out <= start_byte)
			goto next;

		/*
		 * the start of the data we care about is offset into
		 * the middle of our working buffer
		 */
		if (total_out > start_byte && buf_start < start_byte) {
			buf_offset = start_byte - buf_start;
			working_bytes -= buf_offset;
		} else {
			buf_offset = 0;
		}
		current_buf_start = buf_start;

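		/*
		 * Worked example (illustrative, assuming 4k pages): if
		 * this inflate pass produced bytes 10240..14335 of the
		 * extent (buf_start 10240, total_out 14336) and page_out
		 * maps offset 12288, then buf_offset is 2048 and the
		 * loop below copies the remaining 2048 working bytes
		 * into the page.
		 */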
		/* copy bytes from the working buffer into the pages */
		while (working_bytes > 0) {
			bytes = min(PAGE_CACHE_SIZE - pg_offset,
				    PAGE_CACHE_SIZE - buf_offset);
			bytes = min(bytes, working_bytes);
			kaddr = kmap_atomic(page_out, KM_USER0);
			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
			       bytes);
			kunmap_atomic(kaddr, KM_USER0);
			flush_dcache_page(page_out);

			pg_offset += bytes;
			page_bytes_left -= bytes;
			buf_offset += bytes;
			working_bytes -= bytes;
			current_buf_start += bytes;

			/* check if we need to pick another page */
			if (page_bytes_left == 0) {
				page_out_index++;
				if (page_out_index >= vcnt) {
					ret = 0;
					goto done;
				}
				page_out = bvec[page_out_index].bv_page;
				pg_offset = 0;
				page_bytes_left = PAGE_CACHE_SIZE;
				start_byte = page_offset(page_out) -
					     disk_start;

				/*
				 * make sure our new page is covered by this
				 * working buffer
				 */
				if (total_out <= start_byte)
					goto next;

				/* the next page in the biovec might not
				 * be adjacent to the last page, but it
				 * might still be found inside this working
				 * buffer.  bump our offset pointer
				 */
				if (total_out > start_byte &&
				    current_buf_start < start_byte) {
					buf_offset = start_byte - buf_start;
					working_bytes = total_out - start_byte;
					current_buf_start = buf_start +
							    buf_offset;
				}
			}
		}
next:
		workspace->inf_strm.next_out = workspace->buf;
		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;

		if (workspace->inf_strm.avail_in == 0) {
			unsigned long tmp;

			kunmap(pages_in[page_in_index]);
			page_in_index++;
			if (page_in_index >= total_pages_in) {
				data_in = NULL;
				break;
			}
			data_in = kmap(pages_in[page_in_index]);
			workspace->inf_strm.next_in = data_in;
			tmp = srclen - workspace->inf_strm.total_in;
			workspace->inf_strm.avail_in = min(tmp,
							   PAGE_CACHE_SIZE);
		}
	}
	if (ret != Z_STREAM_END)
		ret = -1;
	else
		ret = 0;
done:
	zlib_inflateEnd(&workspace->inf_strm);
	if (data_in)
		kunmap(pages_in[page_in_index]);
out:
	free_workspace(workspace);
	return ret;
}

/*
 * a less complex decompression routine.  Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * start_byte tells us the offset into the uncompressed data we're
 * interested in.
 */
int btrfs_zlib_decompress(unsigned char *data_in,
			  struct page *dest_page,
			  unsigned long start_byte,
			  size_t srclen, size_t destlen)
{
	int ret = 0;
	int wbits = MAX_WBITS;
	struct workspace *workspace;
	unsigned long bytes_left = destlen;
	unsigned long total_out = 0;
	char *kaddr;

	if (destlen > PAGE_CACHE_SIZE)
		return -ENOMEM;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);

	workspace->inf_strm.next_in = data_in;
	workspace->inf_strm.avail_in = srclen;
	workspace->inf_strm.total_in = 0;

	workspace->inf_strm.next_out = workspace->buf;
	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	workspace->inf_strm.total_out = 0;
	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0] << 8) + data_in[1]) % 31)) {

		wbits = -((data_in[0] >> 4) + 8);
		workspace->inf_strm.next_in += 2;
		workspace->inf_strm.avail_in -= 2;
	}

	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
		printk(KERN_WARNING "inflateInit failed\n");
		ret = -1;
		goto out;
	}

	while (bytes_left > 0) {
		unsigned long buf_start;
		unsigned long buf_offset;
		unsigned long bytes;
		unsigned long pg_offset = 0;

		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;

		buf_start = total_out;
		total_out = workspace->inf_strm.total_out;

		if (total_out == buf_start) {
			ret = -1;
			break;
		}

		if (total_out <= start_byte)
			goto next;

		if (total_out > start_byte && buf_start < start_byte)
			buf_offset = start_byte - buf_start;
		else
			buf_offset = 0;

		bytes = min(PAGE_CACHE_SIZE - pg_offset,
			    PAGE_CACHE_SIZE - buf_offset);
		bytes = min(bytes, bytes_left);

		kaddr = kmap_atomic(dest_page, KM_USER0);
		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
		kunmap_atomic(kaddr, KM_USER0);

		pg_offset += bytes;
		bytes_left -= bytes;
next:
		workspace->inf_strm.next_out = workspace->buf;
		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	}
	if (ret != Z_STREAM_END && bytes_left != 0)
		ret = -1;
	else
		ret = 0;

	zlib_inflateEnd(&workspace->inf_strm);
out:
	free_workspace(workspace);
	return ret;
}
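
/*
 * Illustrative only -- a hypothetical caller (not from this patch) that
 * has the compressed bytes of a small extent in 'cbuf' and wants the
 * page at file offset 'page_start', where 'em_start' is the logical
 * start of the extent:
 *
 *	ret = btrfs_zlib_decompress(cbuf, page, page_start - em_start,
 *				    csize, PAGE_CACHE_SIZE);
 */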

void btrfs_zlib_exit(void)
{
	free_workspaces();
}