Diffstat (limited to 'fs/btrfs')
-rw-r--r--   fs/btrfs/Makefile         |    3
-rw-r--r--   fs/btrfs/compression.c    |  454
-rw-r--r--   fs/btrfs/compression.h    |   47
-rw-r--r--   fs/btrfs/ctree.h          |   99
-rw-r--r--   fs/btrfs/disk-io.c        |   18
-rw-r--r--   fs/btrfs/disk-io.h        |    1
-rw-r--r--   fs/btrfs/extent-tree.c    |   27
-rw-r--r--   fs/btrfs/extent_io.c      |  411
-rw-r--r--   fs/btrfs/extent_io.h      |   17
-rw-r--r--   fs/btrfs/extent_map.c     |    9
-rw-r--r--   fs/btrfs/extent_map.h     |    6
-rw-r--r--   fs/btrfs/file-item.c      |   75
-rw-r--r--   fs/btrfs/file.c           |  263
-rw-r--r--   fs/btrfs/inode.c          |  584
-rw-r--r--   fs/btrfs/ordered-data.c   |    9
-rw-r--r--   fs/btrfs/ordered-data.h   |   10
-rw-r--r--   fs/btrfs/print-tree.c     |    7
-rw-r--r--   fs/btrfs/super.c          |   10
-rw-r--r--   fs/btrfs/tree-log.c       |    3
-rw-r--r--   fs/btrfs/volumes.c        |    2
-rw-r--r--   fs/btrfs/zlib.c           |  637
21 files changed, 2313 insertions(+), 379 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142b..d2cf5a54a4b8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..c5470367ca5c
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* number of bios pending for this compressed extent */
+	atomic_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* IO errors */
+	int errors;
+
+	/* for reads, this is the bio we are copying the data into */
+	struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	struct bio *bio;
+	int nr_vecs;
+
+	nr_vecs = bio_get_nr_vecs(bdev);
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_size = 0;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_byte >> 9;
+	}
+	return bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, let's start
+	 * the decompression.
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+					   cb->start,
+					   cb->orig_bio->bi_io_vec,
+					   cb->orig_bio->bi_vcnt,
+					   cb->compressed_len);
+	if (ret)
+		cb->errors = 1;
+
+	/* release the compressed pages */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors)
+		bio_io_error(cb->orig_bio);
+	else
+		bio_endio(cb->orig_bio, 0);
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+					     unsigned long ram_size)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int ret;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < ret; i++) {
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+	}
+	/* the inode may be gone now */
+	return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, step one is to
+	 * call back into the FS and do all the end_io operations
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					 cb->start,
+					 cb->start + cb->len - 1,
+					 NULL, 1);
+
+	end_compressed_writeback(inode, cb->start, cb->len);
+	/* note, our inode could be gone now */
+
+	/*
+	 * release the compressed pages, these came from alloc_page and
+	 * are not attached to the inode at all
+	 */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int page_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	ret = btrfs_csum_file_bytes(root, inode, start, len);
+	BUG_ON(ret);
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+
+	/* create and submit bios for the compressed pages */
+	bytes_left = compressed_len;
+	while (bytes_left > 0) {
+		page = compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (bio->bi_size)
+			ret = io_tree->ops->merge_bio_hook(page, 0,
+							   PAGE_CACHE_SIZE,
+							   bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret);
+
+			bio_put(bio);
+
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		page_index++;
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long page_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	struct extent_map *em;
+	int ret;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+
+	cb->start = em->start;
+	compressed_len = em->block_len;
+	free_extent_map(em);
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = bio;
+
+	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+		   PAGE_CACHE_SIZE;
+	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+				       GFP_NOFS);
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+	}
+	cb->nr_pages = nr_pages;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		page = cb->compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (comp_bio->bi_size)
+			ret = tree->ops->merge_bio_hook(page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			BUG_ON(ret);
+
+			bio_put(comp_bio);
+
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
441 }
442 cur_disk_byte += PAGE_CACHE_SIZE;
443 }
444 bio_get(comp_bio);
445
446 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
447 BUG_ON(ret);
448
449 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
450 BUG_ON(ret);
451
452 bio_put(comp_bio);
453 return 0;
454}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+				 u64 disk_start,
+				 struct bio_vec *bvec,
+				 int vcnt,
+				 size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags);
+#endif
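The zlib entry points declared here live in the new `zlib.c` (outside the range shown). `btrfs_zlib_compress_pages()` deflates a file range into an array of freshly allocated pages and reports bytes consumed and produced, so the caller can give up when compression is not paying for itself. A rough userspace analogue with stock zlib is sketched below; the 4 KiB page size, page count, and helper name are assumptions for illustration, not the btrfs API:

```c
/* build with: cc zlib_pages.c -lz */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

#define PAGE_SIZE 4096

/*
 * deflate src into up to nr_dest PAGE_SIZE buffers; returns the number
 * of buffers used, or -1 on error / when the output did not fit
 */
static int compress_pages(const unsigned char *src, size_t srclen,
			  unsigned char **pages, int nr_dest,
			  size_t *total_out)
{
	z_stream s = {0};
	int used = 0;
	int ret = Z_OK;

	if (deflateInit(&s, Z_DEFAULT_COMPRESSION) != Z_OK)
		return -1;
	s.next_in = (unsigned char *)src;
	s.avail_in = srclen;

	while (ret != Z_STREAM_END && used < nr_dest) {
		pages[used] = malloc(PAGE_SIZE);	/* one "page" at a time */
		s.next_out = pages[used];
		s.avail_out = PAGE_SIZE;
		used++;
		ret = deflate(&s, Z_FINISH);	/* Z_OK: more output pending */
		if (ret != Z_OK && ret != Z_STREAM_END) {
			deflateEnd(&s);
			return -1;
		}
	}
	*total_out = s.total_out;
	deflateEnd(&s);
	return ret == Z_STREAM_END ? used : -1;	/* -1: ran out of pages */
}

int main(void)
{
	unsigned char buf[32768];
	unsigned char *pages[8];
	size_t out = 0;

	memset(buf, 'a', sizeof(buf));	/* highly compressible input */
	int n = compress_pages(buf, sizeof(buf), pages, 8, &out);
	printf("%zu bytes in -> %zu bytes out across %d page(s)\n",
	       sizeof(buf), out, n);
	for (int i = 0; i < n; i++)
		free(pages[i]);
	return 0;
}
```

The `max_out`-style cutoff in the real API serves the same purpose as `nr_dest` here: if deflate cannot finish within the budget, the extent is written uncompressed instead.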
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47f..793d8fdda244 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-/*
- * there is no padding here on purpose. If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+	BTRFS_COMPRESS_NONE = 0,
+	BTRFS_COMPRESS_ZLIB = 1,
+	BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+	BTRFS_ENCRYPTION_NONE = 0,
+	BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
 struct btrfs_inode_item {
 	/* nfs style generation number */
 	__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
+
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
 #define BTRFS_FILE_EXTENT_INLINE 1
 
 struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
 	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be. So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption. If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
 	u8 type;
+
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
 	 */
 	__le64 offset;
 	/*
-	 * the logical number of file blocks (no csums included)
+	 * the logical number of file blocks (no csums included). This
+	 * always reflects the size uncompressed and without encoding.
 	 */
 	__le64 num_bytes;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
+#define BTRFS_MOUNT_COMPRESS		(1 << 5)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATASUM		(1 << 0)
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-					       struct btrfs_item *e)
-{
-	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return btrfs_item_size(eb, e) - offset;
-}
-
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
 		   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
 		   offset, 64);
 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
 		   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+		   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+		   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+		   other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       struct btrfs_file_extent_item *e)
+{
+	return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+						    struct btrfs_item *e)
+{
+	unsigned long offset;
+	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+	return btrfs_item_size(eb, e) - offset;
+}
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
-			     u64 objectid, u64 pos, u64 disk_offset,
-			     u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset);
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 			       int namelen);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio);
+			 size_t size, struct bio *bio, unsigned long bio_flags);
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			     struct file_ra_state *ra, struct file *file,
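Because `btrfs_file_extent_item` is packed and stored on disk byte-for-byte, inserting `ram_bytes` and the encoding fields ahead of `type` moves every later field: this is a disk-format change, and it is why `btrfs_file_extent_inline_item_len()` subtracts `offsetof(..., disk_bytenr)` — inline extents store their data where `disk_bytenr` would otherwise begin. A quick offset check of the new layout, using plain fixed-width integers as stand-ins for the little-endian `__le64`/`__le16` on-disk types:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * stand-in for the on-disk btrfs_file_extent_item; plain fixed-width
 * integers replace __le64/__le16 (which are little-endian on disk)
 */
struct file_extent_item {
	uint64_t generation;
	uint64_t ram_bytes;
	uint8_t  compression;
	uint8_t  encryption;
	uint16_t other_encoding;
	uint8_t  type;
	uint64_t disk_bytenr;
	uint64_t disk_num_bytes;
	uint64_t offset;
	uint64_t num_bytes;
} __attribute__((__packed__));

int main(void)
{
	/* inline extents store their data where disk_bytenr would begin */
	printf("ram_bytes   at byte %zu\n",
	       offsetof(struct file_extent_item, ram_bytes));	/* 8  */
	printf("type        at byte %zu\n",
	       offsetof(struct file_extent_item, type));	/* 20 */
	printf("disk_bytenr at byte %zu\n",
	       offsetof(struct file_extent_item, disk_bytenr));	/* 21 */
	printf("item size   %zu bytes\n",
	       sizeof(struct file_extent_item));		/* 53 */
	return 0;
}
```

On this layout the inline-extent header is 21 bytes, matching `btrfs_file_extent_calc_inline_size()`, which returns that offset plus the inline data size.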
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb6194..dc95f636a11b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	unsigned long bio_flags;
 	struct btrfs_work work;
 };
 
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	}
 	em->start = 0;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 	wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
-			       async->mirror_num);
+			       async->mirror_num, async->bio_flags);
 	kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+	async->bio_flags = bio_flags;
 
 	while(atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
 }
 
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				   int mirror_num)
+				   int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
 	 */
 	if (!(rw & (1 << BIO_RW))) {
-		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+		return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num,
+				   inode, rw, bio, mirror_num, 0,
 				   __btree_submit_bio_hook);
 }
 
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbbb..4eb1f1408d21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
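The disk-io changes are plumbing for the new flag: a bio's `bio_flags` must survive the detour through the async submit workers, so it is captured in `struct async_submit_bio` when the work item is queued and handed back to the hook when `run_one_async_submit()` fires. A toy capture-and-replay version of that pattern — a plain array instead of the btrfs worker threads, and all names illustrative:

```c
#include <stdio.h>

/* hook signature mirrors extent_submit_bio_hook_t gaining bio_flags */
typedef int (*submit_hook_t)(int rw, int bio_id, unsigned long bio_flags);

/* everything the worker will need is captured at queue time */
struct async_submit {
	int rw;
	int bio_id;
	unsigned long bio_flags;
	submit_hook_t hook;
};

static int my_hook(int rw, int bio_id, unsigned long bio_flags)
{
	printf("bio %d rw=%d flags=%#lx\n", bio_id, rw, bio_flags);
	return 0;
}

/* plays the role of run_one_async_submit() */
static void run_worker(struct async_submit *a)
{
	a->hook(a->rw, a->bio_id, a->bio_flags);
}

int main(void)
{
	struct async_submit queue[2] = {
		{ .rw = 1, .bio_id = 0, .bio_flags = 0x0, .hook = my_hook },
		{ .rw = 1, .bio_id = 1, .bio_flags = 0x1 /* "compressed" */,
		  .hook = my_hook },
	};
	for (int i = 0; i < 2; i++)
		run_worker(&queue[i]);
	return 0;
}
```

Without the captured field, the flag would be lost between submission context and worker context — which is exactly why the struct grows a member here rather than the hook alone growing a parameter.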
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6d..bbf04e80a1a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 
 	em->start = extent_key->objectid - offset;
 	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
 	em->block_start = extent_key->objectid;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
 };
 
 struct disk_extent {
+	u64 ram_bytes;
 	u64 disk_bytenr;
 	u64 disk_num_bytes;
 	u64 offset;
 	u64 num_bytes;
+	u8 compression;
+	u8 encryption;
+	u16 other_encoding;
 };
 
 static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+									   fi);
 		WARN_ON(exts[nr].offset > 0);
 		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
@@ -3846,6 +3856,8 @@ next:
 					new_extents[0].disk_bytenr);
 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 					new_extents[0].disk_num_bytes);
+				btrfs_set_file_extent_ram_bytes(leaf, fi,
+					new_extents[0].ram_bytes);
 				ext_offset += new_extents[0].offset;
 				btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 				btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
 					new_extents[i].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 					new_extents[i].disk_num_bytes);
+			btrfs_set_file_extent_ram_bytes(leaf, fi,
+					new_extents[i].ram_bytes);
+
+			btrfs_set_file_extent_compression(leaf, fi,
+					new_extents[i].compression);
+			btrfs_set_file_extent_encryption(leaf, fi,
+					new_extents[i].encryption);
+			btrfs_set_file_extent_other_encoding(leaf, fi,
+					new_extents[i].other_encoding);
+
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 					extent_len);
 			ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 	ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+	btrfs_set_file_extent_ram_bytes(leaf, fi,
+					new_extent->ram_bytes);
 	btrfs_set_file_extent_disk_bytenr(leaf, fi,
 					  new_extent->disk_bytenr);
 	btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	BUG_ON(err);
 
 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0);
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
 	BUG_ON(err);
 
 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f2..314041fdfa43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-					     u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+					u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	u64 total_bytes = 0;
 
 	spin_lock_irq(&tree->lock);
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
 		if (!found)
@@ -1100,40 +1101,6 @@ search_again:
 		*end = state->end;
 		goto out;
 	}
-	if (!found && !(state->state & EXTENT_BOUNDARY)) {
-		struct extent_state *prev_state;
-		struct rb_node *prev_node = node;
-		while(1) {
-			prev_node = rb_prev(prev_node);
-			if (!prev_node)
-				break;
-			prev_state = rb_entry(prev_node,
-					      struct extent_state,
-					      rb_node);
-			if ((prev_state->end + 1 != state->start) ||
-			    !(prev_state->state & EXTENT_DELALLOC))
-				break;
-			if ((cur_start - prev_state->start) * 2 >
-			     max_bytes)
-				break;
-			state = prev_state;
-			node = prev_node;
-		}
-	}
-	if (state->state & EXTENT_LOCKED) {
-		DEFINE_WAIT(wait);
-		atomic_inc(&state->refs);
-		prepare_to_wait(&state->wq, &wait,
-				TASK_UNINTERRUPTIBLE);
-		spin_unlock_irq(&tree->lock);
-		schedule();
-		spin_lock_irq(&tree->lock);
-		finish_wait(&state->wq, &wait);
-		free_extent_state(state);
-		goto search_again;
-	}
-	set_state_cb(tree, state, EXTENT_LOCKED);
-	state->state |= EXTENT_LOCKED;
 	if (!found)
 		*start = state->start;
 	found++;
@@ -1151,6 +1118,208 @@ out:
 	return found;
 }
 
+static noinline int __unlock_for_delalloc(struct inode *inode,
+					  struct page *locked_page,
+					  u64 start, u64 end)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+
+	if (index == locked_page->index && end_index == index)
+		return 0;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long start_index = index;
+	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long pages_locked = 0;
+	struct page *pages[16];
+	unsigned long nrpages;
+	int ret;
+	int i;
+
+	/* the caller is responsible for locking the start index */
+	if (index == locked_page->index && index == end_index)
+		return 0;
+
+	/* skip the page at the start index */
+	nrpages = end_index - index + 1;
+	while (nrpages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nrpages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			ret = -EAGAIN;
+			goto done;
+		}
+		/* now we have an array of pages, lock them all */
+		for (i = 0; i < ret; i++) {
+			/*
+			 * the caller is taking responsibility for
+			 * locked_page
+			 */
+			if (pages[i] != locked_page)
+				lock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		pages_locked += ret;
+		nrpages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	ret = 0;
+done:
+	if (ret && pages_locked) {
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start,
+			      ((u64)(start_index + pages_locked - 1)) <<
+			      PAGE_CACHE_SHIFT);
+	}
+	return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+					     struct extent_io_tree *tree,
+					     struct page *locked_page,
+					     u64 *start, u64 *end,
+					     u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes);
+	if (!found) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		return found;
+	}
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 * if we're looping.
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+		delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
+			~((u64)PAGE_CACHE_SIZE - 1);
+	}
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, let's avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		if (!loops) {
+			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+			max_bytes = PAGE_CACHE_SIZE - offset;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret);
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1);
+	if (!ret) {
+		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+				      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode,
+				 struct extent_io_tree *tree,
+				 u64 start, u64 end, struct page *locked_page,
+				 int clear_dirty, int set_writeback,
+				 int end_writeback)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+	if (clear_dirty)
+		clear_bits |= EXTENT_DIRTY;
+
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] == locked_page) {
+				page_cache_release(pages[i]);
+				continue;
+			}
+			if (clear_dirty)
+				clear_page_dirty_for_io(pages[i]);
+			if (set_writeback)
+				set_page_writeback(pages[i]);
+			if (end_writeback)
+				end_page_writeback(pages[i]);
+			unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set. This can be fairly slow, except for EXTENT_DIRTY which is
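The delalloc helpers added above are deliberately failure-tolerant: `lock_delalloc_pages()` grabs pages in batches of up to 16 with `find_get_pages_contig()`, and when a page has vanished it unlocks everything it already locked and returns `-EAGAIN`, after which `find_lock_delalloc_range()` retries once with the range clamped down to a single page. A compact sketch of that batch-with-rollback shape over a plain array — array indices stand in for page-cache lookups, none of this is the kernel API:

```c
#include <stdbool.h>
#include <stdio.h>

static bool page_present[64];	/* stand-in for the page cache */
static bool page_locked[64];

/* lock [start, end]; on a missing page, undo the partial work like
 * __unlock_for_delalloc() and report failure (-EAGAIN in the kernel) */
static int lock_range(int start, int end)
{
	int locked = 0;

	for (int i = start; i <= end; i++) {	/* the kernel walks batches of 16 */
		if (!page_present[i]) {
			for (int j = start; j < start + locked; j++)
				page_locked[j] = false;
			return -1;
		}
		page_locked[i] = true;
		locked++;
	}
	return 0;
}

int main(void)
{
	for (int i = 0; i < 64; i++)
		page_present[i] = (i != 40);	/* page 40 was reclaimed */

	int start = 32, end = 47;
	if (lock_range(start, end)) {
		/* retry shrunk to a single page, as find_lock_delalloc_range does */
		end = start;
		printf("retry on [%d,%d]: %s\n", start, end,
		       lock_range(start, end) ? "failed" : "ok");
	}
	return 0;
}
```

The final `test_range_bit()` pass in the real function closes the remaining race: if the range stopped being delalloc while the pages were being locked, everything is unlocked and the search starts over.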
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1631 return bio; 1800 return bio;
1632} 1801}
1633 1802
1634static int submit_one_bio(int rw, struct bio *bio, int mirror_num) 1803static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1804 unsigned long bio_flags)
1635{ 1805{
1636 int ret = 0; 1806 int ret = 0;
1637 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1807 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1638 struct page *page = bvec->bv_page; 1808 struct page *page = bvec->bv_page;
1639 struct extent_io_tree *tree = bio->bi_private; 1809 struct extent_io_tree *tree = bio->bi_private;
1640 struct rb_node *node;
1641 struct extent_state *state;
1642 u64 start; 1810 u64 start;
1643 u64 end; 1811 u64 end;
1644 1812
1645 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1813 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1646 end = start + bvec->bv_len - 1; 1814 end = start + bvec->bv_len - 1;
1647 1815
1648 spin_lock_irq(&tree->lock);
1649 node = __etree_search(tree, start, NULL, NULL);
1650 BUG_ON(!node);
1651 state = rb_entry(node, struct extent_state, rb_node);
1652 while(state->end < end) {
1653 node = rb_next(node);
1654 state = rb_entry(node, struct extent_state, rb_node);
1655 }
1656 BUG_ON(state->end != end);
1657 spin_unlock_irq(&tree->lock);
1658
1659 bio->bi_private = NULL; 1816 bio->bi_private = NULL;
1660 1817
1661 bio_get(bio); 1818 bio_get(bio);
1662 1819
1663 if (tree->ops && tree->ops->submit_bio_hook) 1820 if (tree->ops && tree->ops->submit_bio_hook)
1664 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1821 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1665 mirror_num); 1822 mirror_num, bio_flags);
1666 else 1823 else
1667 submit_bio(rw, bio); 1824 submit_bio(rw, bio);
1668 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1825 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1678 struct bio **bio_ret, 1835 struct bio **bio_ret,
1679 unsigned long max_pages, 1836 unsigned long max_pages,
1680 bio_end_io_t end_io_func, 1837 bio_end_io_t end_io_func,
1681 int mirror_num) 1838 int mirror_num,
1839 unsigned long prev_bio_flags,
1840 unsigned long bio_flags)
1682{ 1841{
1683 int ret = 0; 1842 int ret = 0;
1684 struct bio *bio; 1843 struct bio *bio;
1685 int nr; 1844 int nr;
1845 int contig = 0;
1846 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1847 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1848 size_t page_size = min(size, PAGE_CACHE_SIZE);
1686 1849
1687 if (bio_ret && *bio_ret) { 1850 if (bio_ret && *bio_ret) {
1688 bio = *bio_ret; 1851 bio = *bio_ret;
1689 if (bio->bi_sector + (bio->bi_size >> 9) != sector || 1852 if (old_compressed)
1853 contig = bio->bi_sector == sector;
1854 else
1855 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1856 sector;
1857
1858 if (prev_bio_flags != bio_flags || !contig ||
1690 (tree->ops && tree->ops->merge_bio_hook && 1859 (tree->ops && tree->ops->merge_bio_hook &&
1691 tree->ops->merge_bio_hook(page, offset, size, bio)) || 1860 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1692 bio_add_page(bio, page, size, offset) < size) { 1861 bio_flags)) ||
1693 ret = submit_one_bio(rw, bio, mirror_num); 1862 bio_add_page(bio, page, page_size, offset) < page_size) {
1863 ret = submit_one_bio(rw, bio, mirror_num,
1864 prev_bio_flags);
1694 bio = NULL; 1865 bio = NULL;
1695 } else { 1866 } else {
1696 return 0; 1867 return 0;
1697 } 1868 }
1698 } 1869 }
1699 nr = bio_get_nr_vecs(bdev); 1870 if (this_compressed)
1871 nr = BIO_MAX_PAGES;
1872 else
1873 nr = bio_get_nr_vecs(bdev);
1874
1700 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1875 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1701 if (!bio) { 1876 if (!bio) {
1702 printk("failed to allocate bio nr %d\n", nr); 1877 printk("failed to allocate bio nr %d\n", nr);
1703 } 1878 }
1704 1879
1705 1880 bio_add_page(bio, page, page_size, offset);
1706 bio_add_page(bio, page, size, offset);
1707 bio->bi_end_io = end_io_func; 1881 bio->bi_end_io = end_io_func;
1708 bio->bi_private = tree; 1882 bio->bi_private = tree;
1709 1883
1710 if (bio_ret) { 1884 if (bio_ret) {
1711 *bio_ret = bio; 1885 *bio_ret = bio;
1712 } else { 1886 } else {
1713 ret = submit_one_bio(rw, bio, mirror_num); 1887 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1714 } 1888 }
1715 1889
1716 return ret; 1890 return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
1738static int __extent_read_full_page(struct extent_io_tree *tree, 1912static int __extent_read_full_page(struct extent_io_tree *tree,
1739 struct page *page, 1913 struct page *page,
1740 get_extent_t *get_extent, 1914 get_extent_t *get_extent,
1741 struct bio **bio, int mirror_num) 1915 struct bio **bio, int mirror_num,
1916 unsigned long *bio_flags)
1742{ 1917{
1743 struct inode *inode = page->mapping->host; 1918 struct inode *inode = page->mapping->host;
1744 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1919 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
1756 int nr = 0; 1931 int nr = 0;
1757 size_t page_offset = 0; 1932 size_t page_offset = 0;
1758 size_t iosize; 1933 size_t iosize;
1934 size_t disk_io_size;
1759 size_t blocksize = inode->i_sb->s_blocksize; 1935 size_t blocksize = inode->i_sb->s_blocksize;
1936 unsigned long this_bio_flag = 0;
1760 1937
1761 set_page_extent_mapped(page); 1938 set_page_extent_mapped(page);
1762 1939
1763 end = page_end; 1940 end = page_end;
1764 lock_extent(tree, start, end, GFP_NOFS); 1941 lock_extent(tree, start, end, GFP_NOFS);
1765 1942
1943 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1944 char *userpage;
1945 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1946
1947 if (zero_offset) {
1948 iosize = PAGE_CACHE_SIZE - zero_offset;
1949 userpage = kmap_atomic(page, KM_USER0);
1950 memset(userpage + zero_offset, 0, iosize);
1951 flush_dcache_page(page);
1952 kunmap_atomic(userpage, KM_USER0);
1953 }
1954 }
1766 while (cur <= end) { 1955 while (cur <= end) {
1767 if (cur >= last_byte) { 1956 if (cur >= last_byte) {
1768 char *userpage; 1957 char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1793 } 1982 }
1794 BUG_ON(end < cur); 1983 BUG_ON(end < cur);
1795 1984
1985 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1986 this_bio_flag = EXTENT_BIO_COMPRESSED;
1987
1796 iosize = min(extent_map_end(em) - cur, end - cur + 1); 1988 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1797 cur_end = min(extent_map_end(em) - 1, end); 1989 cur_end = min(extent_map_end(em) - 1, end);
1798 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1990 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1799 sector = (em->block_start + extent_offset) >> 9; 1991 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
1992 disk_io_size = em->block_len;
1993 sector = em->block_start >> 9;
1994 } else {
1995 sector = (em->block_start + extent_offset) >> 9;
1996 disk_io_size = iosize;
1997 }
1800 bdev = em->bdev; 1998 bdev = em->bdev;
1801 block_start = em->block_start; 1999 block_start = em->block_start;
1802 free_extent_map(em); 2000 free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1845 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2043 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1846 pnr -= page->index; 2044 pnr -= page->index;
1847 ret = submit_extent_page(READ, tree, page, 2045 ret = submit_extent_page(READ, tree, page,
1848 sector, iosize, page_offset, 2046 sector, disk_io_size, page_offset,
1849 bdev, bio, pnr, 2047 bdev, bio, pnr,
1850 end_bio_extent_readpage, mirror_num); 2048 end_bio_extent_readpage, mirror_num,
2049 *bio_flags,
2050 this_bio_flag);
1851 nr++; 2051 nr++;
2052 *bio_flags = this_bio_flag;
1852 } 2053 }
1853 if (ret) 2054 if (ret)
1854 SetPageError(page); 2055 SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1867 get_extent_t *get_extent) 2068 get_extent_t *get_extent)
1868{ 2069{
1869 struct bio *bio = NULL; 2070 struct bio *bio = NULL;
2071 unsigned long bio_flags = 0;
1870 int ret; 2072 int ret;
1871 2073
1872 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0); 2074 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2075 &bio_flags);
1873 if (bio) 2076 if (bio)
1874 submit_one_bio(READ, bio, 0); 2077 submit_one_bio(READ, bio, 0, bio_flags);
1875 return ret; 2078 return ret;
1876} 2079}
1877EXPORT_SYMBOL(extent_read_full_page); 2080EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1909 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2112 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1910 u64 nr_delalloc; 2113 u64 nr_delalloc;
1911 u64 delalloc_end; 2114 u64 delalloc_end;
2115 int page_started;
2116 int compressed;
1912 2117
1913 WARN_ON(!PageLocked(page)); 2118 WARN_ON(!PageLocked(page));
1914 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2119 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1934 2139
1935 delalloc_start = start; 2140 delalloc_start = start;
1936 delalloc_end = 0; 2141 delalloc_end = 0;
2142 page_started = 0;
1937 while(delalloc_end < page_end) { 2143 while(delalloc_end < page_end) {
1938 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, 2144 nr_delalloc = find_lock_delalloc_range(inode, tree,
2145 page,
2146 &delalloc_start,
1939 &delalloc_end, 2147 &delalloc_end,
1940 128 * 1024 * 1024); 2148 128 * 1024 * 1024);
1941 if (nr_delalloc == 0) { 2149 if (nr_delalloc == 0) {
1942 delalloc_start = delalloc_end + 1; 2150 delalloc_start = delalloc_end + 1;
1943 continue; 2151 continue;
1944 } 2152 }
1945 tree->ops->fill_delalloc(inode, delalloc_start, 2153 tree->ops->fill_delalloc(inode, page, delalloc_start,
1946 delalloc_end); 2154 delalloc_end, &page_started);
1947 clear_extent_bit(tree, delalloc_start,
1948 delalloc_end,
1949 EXTENT_LOCKED | EXTENT_DELALLOC,
1950 1, 0, GFP_NOFS);
1951 delalloc_start = delalloc_end + 1; 2155 delalloc_start = delalloc_end + 1;
1952 } 2156 }
2157
2158 /* did the fill delalloc function already unlock and start the IO? */
2159 if (page_started) {
2160 return 0;
2161 }
2162
1953 lock_extent(tree, start, page_end, GFP_NOFS); 2163 lock_extent(tree, start, page_end, GFP_NOFS);
1954 unlock_start = start; 2164 unlock_start = start;
1955 2165
1956 if (tree->ops && tree->ops->writepage_start_hook) { 2166 if (tree->ops && tree->ops->writepage_start_hook) {
1957 ret = tree->ops->writepage_start_hook(page, start, page_end); 2167 ret = tree->ops->writepage_start_hook(page, start,
2168 page_end);
1958 if (ret == -EAGAIN) { 2169 if (ret == -EAGAIN) {
1959 unlock_extent(tree, start, page_end, GFP_NOFS); 2170 unlock_extent(tree, start, page_end, GFP_NOFS);
1960 redirty_page_for_writepage(wbc, page); 2171 redirty_page_for_writepage(wbc, page);
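The fill_delalloc hook can now consume locked_page itself: with compression the pages go to disk through btrfs_submit_compressed_write() long before __extent_writepage() reaches its own IO loop, and *page_started is how the hook reports that. A sketch of the contract, assuming the cow_file_range() signature added later in this patch:

/*
 * Sketch (not part of the patch): a fill_delalloc implementation
 * honoring the new contract. When the hook unlocks locked_page and
 * starts IO on it, it sets *page_started so the caller returns
 * without touching the page again.
 */
static int example_fill_delalloc(struct inode *inode,
				 struct page *locked_page,
				 u64 start, u64 end, int *page_started)
{
	*page_started = 0;
	/* cow_file_range() sets *page_started = 1 when it takes over */
	return cow_file_range(inode, locked_page, start, end, page_started);
}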
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2006 sector = (em->block_start + extent_offset) >> 9; 2217 sector = (em->block_start + extent_offset) >> 9;
2007 bdev = em->bdev; 2218 bdev = em->bdev;
2008 block_start = em->block_start; 2219 block_start = em->block_start;
2220 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2009 free_extent_map(em); 2221 free_extent_map(em);
2010 em = NULL; 2222 em = NULL;
2011 2223
2012 if (block_start == EXTENT_MAP_HOLE || 2224 /*
2225 * compressed and inline extents are written through other
2226 * paths in the FS
2227 */
2228 if (compressed || block_start == EXTENT_MAP_HOLE ||
2013 block_start == EXTENT_MAP_INLINE) { 2229 block_start == EXTENT_MAP_INLINE) {
2014 clear_extent_dirty(tree, cur, 2230 clear_extent_dirty(tree, cur,
2015 cur + iosize - 1, GFP_NOFS); 2231 cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2017 unlock_extent(tree, unlock_start, cur + iosize -1, 2233 unlock_extent(tree, unlock_start, cur + iosize -1,
2018 GFP_NOFS); 2234 GFP_NOFS);
2019 2235
2020 if (tree->ops && tree->ops->writepage_end_io_hook) 2236 /*
2237 * end_io notification does not happen here for
2238 * compressed extents
2239 */
2240 if (!compressed && tree->ops &&
2241 tree->ops->writepage_end_io_hook)
2021 tree->ops->writepage_end_io_hook(page, cur, 2242 tree->ops->writepage_end_io_hook(page, cur,
2022 cur + iosize - 1, 2243 cur + iosize - 1,
2023 NULL, 1); 2244 NULL, 1);
2024 cur = cur + iosize; 2245 else if (compressed) {
2246 /* we don't want to end_page_writeback on
2247 * a compressed extent. this happens
2248 * elsewhere
2249 */
2250 nr++;
2251 }
2252
2253 cur += iosize;
2025 pg_offset += iosize; 2254 pg_offset += iosize;
2026 unlock_start = cur; 2255 unlock_start = cur;
2027 continue; 2256 continue;
2028 } 2257 }
2029
2030 /* leave this out until we have a page_mkwrite call */ 2258 /* leave this out until we have a page_mkwrite call */
2031 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2259 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2032 EXTENT_DIRTY, 0)) { 2260 EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2034 pg_offset += iosize; 2262 pg_offset += iosize;
2035 continue; 2263 continue;
2036 } 2264 }
2265
2037 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); 2266 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2038 if (tree->ops && tree->ops->writepage_io_hook) { 2267 if (tree->ops && tree->ops->writepage_io_hook) {
2039 ret = tree->ops->writepage_io_hook(page, cur, 2268 ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2057 ret = submit_extent_page(WRITE, tree, page, sector, 2286 ret = submit_extent_page(WRITE, tree, page, sector,
2058 iosize, pg_offset, bdev, 2287 iosize, pg_offset, bdev,
2059 &epd->bio, max_nr, 2288 &epd->bio, max_nr,
2060 end_bio_extent_writepage, 0); 2289 end_bio_extent_writepage,
2290 0, 0, 0);
2061 if (ret) 2291 if (ret)
2062 SetPageError(page); 2292 SetPageError(page);
2063 } 2293 }
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2226 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2456 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2227 __extent_writepage, &epd); 2457 __extent_writepage, &epd);
2228 if (epd.bio) { 2458 if (epd.bio) {
2229 submit_one_bio(WRITE, epd.bio, 0); 2459 submit_one_bio(WRITE, epd.bio, 0, 0);
2230 } 2460 }
2231 return ret; 2461 return ret;
2232} 2462}
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
2248 ret = extent_write_cache_pages(tree, mapping, wbc, 2478 ret = extent_write_cache_pages(tree, mapping, wbc,
2249 __extent_writepage, &epd); 2479 __extent_writepage, &epd);
2250 if (epd.bio) { 2480 if (epd.bio) {
2251 submit_one_bio(WRITE, epd.bio, 0); 2481 submit_one_bio(WRITE, epd.bio, 0, 0);
2252 } 2482 }
2253 return ret; 2483 return ret;
2254} 2484}
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
2262 struct bio *bio = NULL; 2492 struct bio *bio = NULL;
2263 unsigned page_idx; 2493 unsigned page_idx;
2264 struct pagevec pvec; 2494 struct pagevec pvec;
2495 unsigned long bio_flags = 0;
2265 2496
2266 pagevec_init(&pvec, 0); 2497 pagevec_init(&pvec, 0);
2267 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2498 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
2281 if (!pagevec_add(&pvec, page)) 2512 if (!pagevec_add(&pvec, page))
2282 __pagevec_lru_add(&pvec); 2513 __pagevec_lru_add(&pvec);
2283 __extent_read_full_page(tree, page, get_extent, 2514 __extent_read_full_page(tree, page, get_extent,
2284 &bio, 0); 2515 &bio, 0, &bio_flags);
2285 } 2516 }
2286 page_cache_release(page); 2517 page_cache_release(page);
2287 } 2518 }
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
2289 __pagevec_lru_add(&pvec); 2520 __pagevec_lru_add(&pvec);
2290 BUG_ON(!list_empty(pages)); 2521 BUG_ON(!list_empty(pages));
2291 if (bio) 2522 if (bio)
2292 submit_one_bio(READ, bio, 0); 2523 submit_one_bio(READ, bio, 0, bio_flags);
2293 return 0; 2524 return 0;
2294} 2525}
2295EXPORT_SYMBOL(extent_readpages); 2526EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
2414 ret = submit_extent_page(READ, tree, page, 2645 ret = submit_extent_page(READ, tree, page,
2415 sector, iosize, page_offset, em->bdev, 2646 sector, iosize, page_offset, em->bdev,
2416 NULL, 1, 2647 NULL, 1,
2417 end_bio_extent_preparewrite, 0); 2648 end_bio_extent_preparewrite, 0,
2649 0, 0);
2418 iocount++; 2650 iocount++;
2419 block_start = block_start + iosize; 2651 block_start = block_start + iosize;
2420 } else { 2652 } else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2495 } 2727 }
2496 if (!test_range_bit(tree, em->start, 2728 if (!test_range_bit(tree, em->start,
2497 extent_map_end(em) - 1, 2729 extent_map_end(em) - 1,
2498 EXTENT_LOCKED, 0)) { 2730 EXTENT_LOCKED | EXTENT_WRITEBACK |
2731 EXTENT_ORDERED,
2732 0)) {
2499 remove_extent_mapping(map, em); 2733 remove_extent_mapping(map, em);
2500 /* once for the rb tree */ 2734 /* once for the rb tree */
2501 free_extent_map(em); 2735 free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2923 int inc_all_pages = 0; 3157 int inc_all_pages = 0;
2924 unsigned long num_pages; 3158 unsigned long num_pages;
2925 struct bio *bio = NULL; 3159 struct bio *bio = NULL;
3160 unsigned long bio_flags = 0;
2926 3161
2927 if (eb->flags & EXTENT_UPTODATE) 3162 if (eb->flags & EXTENT_UPTODATE)
2928 return 0; 3163 return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2973 ClearPageError(page); 3208 ClearPageError(page);
2974 err = __extent_read_full_page(tree, page, 3209 err = __extent_read_full_page(tree, page,
2975 get_extent, &bio, 3210 get_extent, &bio,
2976 mirror_num); 3211 mirror_num, &bio_flags);
2977 if (err) { 3212 if (err) {
2978 ret = err; 3213 ret = err;
2979 printk("err %d from __extent_read_full_page\n", ret); 3214 printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
2984 } 3219 }
2985 3220
2986 if (bio) 3221 if (bio)
2987 submit_one_bio(READ, bio, mirror_num); 3222 submit_one_bio(READ, bio, mirror_num, bio_flags);
2988 3223
2989 if (ret || !wait) { 3224 if (ret || !wait) {
2990 if (ret) 3225 if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae3..86f859b87a6e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
18#define EXTENT_BOUNDARY (1 << 11) 18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
20 20
21/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1
23
21/* 24/*
22 * page->private values. Every page that is controlled by the extent 25 * page->private values. Every page that is controlled by the extent
23 * map has page->private set to one. 26 * map has page->private set to one.
@@ -28,14 +31,17 @@
28struct extent_state; 31struct extent_state;
29 32
30typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 33typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
31 struct bio *bio, int mirror_num); 34 struct bio *bio, int mirror_num,
35 unsigned long bio_flags);
32struct extent_io_ops { 36struct extent_io_ops {
33 int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); 37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
38 u64 start, u64 end, int *page_started);
34 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 39 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
35 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 40 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
36 extent_submit_bio_hook_t *submit_bio_hook; 41 extent_submit_bio_hook_t *submit_bio_hook;
37 int (*merge_bio_hook)(struct page *page, unsigned long offset, 42 int (*merge_bio_hook)(struct page *page, unsigned long offset,
38 size_t size, struct bio *bio); 43 size_t size, struct bio *bio,
44 unsigned long bio_flags);
39 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 45 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
40 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 46 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
41 u64 start, u64 end, 47 u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
245int release_extent_buffer_tail_pages(struct extent_buffer *eb); 251int release_extent_buffer_tail_pages(struct extent_buffer *eb);
246int extent_range_uptodate(struct extent_io_tree *tree, 252int extent_range_uptodate(struct extent_io_tree *tree,
247 u64 start, u64 end); 253 u64 start, u64 end);
254int extent_clear_unlock_delalloc(struct inode *inode,
255 struct extent_io_tree *tree,
256 u64 start, u64 end, struct page *locked_page,
257 int clear_dirty, int set_writeback,
258 int clear_writeback);
248#endif 259#endif
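For reference, the widened hooks plug into the same ops table as before. A sketch of a btrfs-style initializer using the functions touched later in this patch (field order and the subset shown are illustrative):

static struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc		= run_delalloc_range,
	.submit_bio_hook	= btrfs_submit_bio_hook,
	.merge_bio_hook		= btrfs_merge_bio_hook,
	/* remaining hooks are unchanged by this patch */
};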
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d3..fd3ebfb8c3c5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) 184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0; 185 return 0;
186 186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
187 if (extent_map_end(prev) == next->start && 194 if (extent_map_end(prev) == next->start &&
188 prev->flags == next->flags && 195 prev->flags == next->flags &&
189 prev->bdev == next->bdev && 196 prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
239 if (rb && mergable_maps(merge, em)) { 246 if (rb && mergable_maps(merge, em)) {
240 em->start = merge->start; 247 em->start = merge->start;
241 em->len += merge->len; 248 em->len += merge->len;
249 em->block_len += merge->block_len;
242 em->block_start = merge->block_start; 250 em->block_start = merge->block_start;
243 merge->in_tree = 0; 251 merge->in_tree = 0;
244 rb_erase(&merge->rb_node, &tree->map); 252 rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
250 merge = rb_entry(rb, struct extent_map, rb_node); 258 merge = rb_entry(rb, struct extent_map, rb_node);
251 if (rb && mergable_maps(em, merge)) { 259 if (rb && mergable_maps(em, merge)) {
252 em->len += merge->len; 260 em->len += merge->len;
261 em->block_len += merge->len;
253 rb_erase(&merge->rb_node, &tree->map); 262 rb_erase(&merge->rb_node, &tree->map);
254 merge->in_tree = 0; 263 merge->in_tree = 0;
255 free_extent_map(merge); 264 free_extent_map(merge);
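Note the asymmetry above: the backward merge adds merge->block_len while the forward merge adds merge->len. Since mergable_maps() now rejects compressed extents, the uncompressed maps that reach this code have matching logical and on-disk lengths, making the two interchangeable. Stated as a sketch (not in the patch, and assuming that invariant holds for every map the merge code sees):

static void assert_mergeable(struct extent_map *em)
{
	BUG_ON(test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
	BUG_ON(em->block_len != em->len);	/* len == block_len here */
}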
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b268..abbcbeb28c79 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
10 10
11/* bits for the flags field */ 11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ 12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
13 14
14struct extent_map { 15struct extent_map {
15 struct rb_node rb_node; 16 struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
18 u64 start; 19 u64 start;
19 u64 len; 20 u64 len;
20 u64 block_start; 21 u64 block_start;
22 u64 block_len;
21 unsigned long flags; 23 unsigned long flags;
22 struct block_device *bdev; 24 struct block_device *bdev;
23 atomic_t refs; 25 atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
38 40
39static inline u64 extent_map_block_end(struct extent_map *em) 41static inline u64 extent_map_block_end(struct extent_map *em)
40{ 42{
41 if (em->block_start + em->len < em->block_start) 43 if (em->block_start + em->block_len < em->block_start)
42 return (u64)-1; 44 return (u64)-1;
43 return em->block_start + em->len; 45 return em->block_start + em->block_len;
44} 46}
45 47
46void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); 48void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
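With compression, em->len (logical bytes in the file) and em->block_len (bytes on disk) diverge, which is why extent_map_block_end() must switch to block_len. A worked example with hypothetical numbers:

/* a 128K logical extent compressed into 20K of disk blocks */
struct extent_map em = {
	.start       = 1048576,		/* file offset 1MB      */
	.len         = 131072,		/* 128K of file data    */
	.block_start = 8388608,		/* disk byte offset 8MB */
	.block_len   = 20480,		/* 20K on disk          */
};

/*
 * extent_map_end(&em)       == 1048576 + 131072 (logical range)
 * extent_map_block_end(&em) == 8388608 + 20480; the old em->len
 * based math would have claimed 128K of disk blocks were in use.
 */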
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d4..f4d3fa71bc41 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, 31 struct btrfs_root *root,
32 u64 objectid, u64 pos, 32 u64 objectid, u64 pos,
33 u64 disk_offset, u64 disk_num_bytes, 33 u64 disk_offset, u64 disk_num_bytes,
34 u64 num_bytes, u64 offset) 34 u64 num_bytes, u64 offset, u64 ram_bytes,
35 u8 compression, u8 encryption, u16 other_encoding)
35{ 36{
36 int ret = 0; 37 int ret = 0;
37 struct btrfs_file_extent_item *item; 38 struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
57 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); 58 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
58 btrfs_set_file_extent_offset(leaf, item, offset); 59 btrfs_set_file_extent_offset(leaf, item, offset);
59 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); 60 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
61 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
60 btrfs_set_file_extent_generation(leaf, item, trans->transid); 62 btrfs_set_file_extent_generation(leaf, item, trans->transid);
61 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 63 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
64 btrfs_set_file_extent_compression(leaf, item, compression);
65 btrfs_set_file_extent_encryption(leaf, item, encryption);
66 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
67
62 btrfs_mark_buffer_dirty(leaf); 68 btrfs_mark_buffer_dirty(leaf);
63out: 69out:
64 btrfs_free_path(path); 70 btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
213 return 0; 219 return 0;
214} 220}
215 221
222int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
223 u64 start, unsigned long len)
224{
225 struct btrfs_ordered_sum *sums;
226 struct btrfs_sector_sum *sector_sum;
227 struct btrfs_ordered_extent *ordered;
228 char *data;
229 struct page *page;
230 unsigned long total_bytes = 0;
231 unsigned long this_sum_bytes = 0;
232
233 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
234 if (!sums)
235 return -ENOMEM;
236
237 sector_sum = sums->sums;
238 sums->file_offset = start;
239 sums->len = len;
240 INIT_LIST_HEAD(&sums->list);
241 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
242 BUG_ON(!ordered);
243
244 while(len > 0) {
245 if (start >= ordered->file_offset + ordered->len ||
246 start < ordered->file_offset) {
247 sums->len = this_sum_bytes;
248 this_sum_bytes = 0;
249 btrfs_add_ordered_sum(inode, ordered, sums);
250 btrfs_put_ordered_extent(ordered);
251
252 sums = kzalloc(btrfs_ordered_sum_size(root, len),
253 GFP_NOFS);
254 BUG_ON(!sums);
255 sector_sum = sums->sums;
256 sums->len = len;
257 sums->file_offset = start;
258 ordered = btrfs_lookup_ordered_extent(inode,
259 sums->file_offset);
260 BUG_ON(!ordered);
261 }
262
263 page = find_get_page(inode->i_mapping,
264 start >> PAGE_CACHE_SHIFT);
265
266 data = kmap_atomic(page, KM_USER0);
267 sector_sum->sum = ~(u32)0;
268 sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
269 PAGE_CACHE_SIZE);
270 kunmap_atomic(data, KM_USER0);
271 btrfs_csum_final(sector_sum->sum,
272 (char *)&sector_sum->sum);
273 sector_sum->offset = page_offset(page);
274 page_cache_release(page);
275
276 sector_sum++;
277 total_bytes += PAGE_CACHE_SIZE;
278 this_sum_bytes += PAGE_CACHE_SIZE;
279 start += PAGE_CACHE_SIZE;
280
281 WARN_ON(len < PAGE_CACHE_SIZE);
282 len -= PAGE_CACHE_SIZE;
283 }
284 btrfs_add_ordered_sum(inode, ordered, sums);
285 btrfs_put_ordered_extent(ordered);
286 return 0;
287}
288
216int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 289int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
217 struct bio *bio) 290 struct bio *bio)
218{ 291{
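btrfs_csum_file_bytes() above walks the written range a page at a time and attaches the sums to the ordered extent covering each offset. The per-page step, pulled out as a sketch that reuses only the calls shown above:

/*
 * Sketch: checksum one page cache page the way the loop above does,
 * seeding with ~0 and folding the result with btrfs_csum_final().
 */
static u32 csum_one_page(struct btrfs_root *root, struct page *page)
{
	char *data;
	u32 sum = ~(u32)0;

	data = kmap_atomic(page, KM_USER0);
	sum = btrfs_csum_data(root, data, sum, PAGE_CACHE_SIZE);
	kunmap_atomic(data, KM_USER0);
	btrfs_csum_final(sum, (char *)&sum);
	return sum;
}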
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add2..0aa15436590e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
95 } 95 }
96} 96}
97 97
98/* this does all the hard work for inserting an inline extent into
99 * the btree. Any existing inline extent is extended as required to make room,
100 * otherwise things are inserted as required into the btree
101 */
102static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root, struct inode *inode,
104 u64 offset, size_t size,
105 struct page **pages, size_t page_offset,
106 int num_pages)
107{
108 struct btrfs_key key;
109 struct btrfs_path *path;
110 struct extent_buffer *leaf;
111 char *kaddr;
112 unsigned long ptr;
113 struct btrfs_file_extent_item *ei;
114 struct page *page;
115 u32 datasize;
116 int err = 0;
117 int ret;
118 int i;
119 ssize_t cur_size;
120
121 path = btrfs_alloc_path();
122 if (!path)
123 return -ENOMEM;
124
125 btrfs_set_trans_block_group(trans, inode);
126
127 key.objectid = inode->i_ino;
128 key.offset = offset;
129 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
130
131 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
132 if (ret < 0) {
133 err = ret;
134 goto fail;
135 }
136 if (ret == 1) {
137 struct btrfs_key found_key;
138
139 if (path->slots[0] == 0)
140 goto insert;
141
142 path->slots[0]--;
143 leaf = path->nodes[0];
144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
145
146 if (found_key.objectid != inode->i_ino)
147 goto insert;
148
149 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
150 goto insert;
151 ei = btrfs_item_ptr(leaf, path->slots[0],
152 struct btrfs_file_extent_item);
153
154 if (btrfs_file_extent_type(leaf, ei) !=
155 BTRFS_FILE_EXTENT_INLINE) {
156 goto insert;
157 }
158 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
159 ret = 0;
160 }
161 if (ret == 0) {
162 u32 found_size;
163 u64 found_end;
164
165 leaf = path->nodes[0];
166 ei = btrfs_item_ptr(leaf, path->slots[0],
167 struct btrfs_file_extent_item);
168
169 if (btrfs_file_extent_type(leaf, ei) !=
170 BTRFS_FILE_EXTENT_INLINE) {
171 err = ret;
172 btrfs_print_leaf(root, leaf);
173 printk("found wasn't inline offset %Lu inode %lu\n",
174 offset, inode->i_ino);
175 goto fail;
176 }
177 found_size = btrfs_file_extent_inline_len(leaf,
178 btrfs_item_nr(leaf, path->slots[0]));
179 found_end = key.offset + found_size;
180
181 if (found_end < offset + size) {
182 btrfs_release_path(root, path);
183 ret = btrfs_search_slot(trans, root, &key, path,
184 offset + size - found_end, 1);
185 BUG_ON(ret != 0);
186
187 ret = btrfs_extend_item(trans, root, path,
188 offset + size - found_end);
189 if (ret) {
190 err = ret;
191 goto fail;
192 }
193 leaf = path->nodes[0];
194 ei = btrfs_item_ptr(leaf, path->slots[0],
195 struct btrfs_file_extent_item);
196 inode_add_bytes(inode, offset + size - found_end);
197 }
198 if (found_end < offset) {
199 ptr = btrfs_file_extent_inline_start(ei) + found_size;
200 memset_extent_buffer(leaf, 0, ptr, offset - found_end);
201 }
202 } else {
203insert:
204 btrfs_release_path(root, path);
205 datasize = offset + size - key.offset;
206 inode_add_bytes(inode, datasize);
207 datasize = btrfs_file_extent_calc_inline_size(datasize);
208 ret = btrfs_insert_empty_item(trans, root, path, &key,
209 datasize);
210 if (ret) {
211 err = ret;
212 printk("got bad ret %d\n", ret);
213 goto fail;
214 }
215 leaf = path->nodes[0];
216 ei = btrfs_item_ptr(leaf, path->slots[0],
217 struct btrfs_file_extent_item);
218 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
219 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
220 }
221 ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
222
223 cur_size = size;
224 i = 0;
225 while (size > 0) {
226 page = pages[i];
227 kaddr = kmap_atomic(page, KM_USER0);
228 cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
229 write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
230 kunmap_atomic(kaddr, KM_USER0);
231 page_offset = 0;
232 ptr += cur_size;
233 size -= cur_size;
234 if (i >= num_pages) {
235 printk("i %d num_pages %d\n", i, num_pages);
236 }
237 i++;
238 }
239 btrfs_mark_buffer_dirty(leaf);
240fail:
241 btrfs_free_path(path);
242 return err;
243}
244
245/* 98/*
246 * after copy_from_user, pages need to be dirtied and we need to make 99 * after copy_from_user, pages need to be dirtied and we need to make
247 * sure holes are created between the current EOF and the start of 100 * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
267 u64 start_pos; 120 u64 start_pos;
268 u64 end_of_last_block; 121 u64 end_of_last_block;
269 u64 end_pos = pos + write_bytes; 122 u64 end_pos = pos + write_bytes;
270 u64 inline_size;
271 int did_inline = 0;
272 loff_t isize = i_size_read(inode); 123 loff_t isize = i_size_read(inode);
273 124
274 start_pos = pos & ~((u64)root->sectorsize - 1); 125 start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
314 err = btrfs_insert_file_extent(trans, root, 165 err = btrfs_insert_file_extent(trans, root,
315 inode->i_ino, 166 inode->i_ino,
316 last_pos_in_file, 167 last_pos_in_file,
317 0, 0, hole_size, 0); 168 0, 0, hole_size, 0,
169 hole_size, 0, 0, 0);
318 btrfs_drop_extent_cache(inode, last_pos_in_file, 170 btrfs_drop_extent_cache(inode, last_pos_in_file,
319 last_pos_in_file + hole_size - 1, 0); 171 last_pos_in_file + hole_size - 1, 0);
320 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 172 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
324 goto failed; 176 goto failed;
325 } 177 }
326 178
327 /* 179 /* check for reserved extents on each page, we don't want
328 * either allocate an extent for the new bytes or setup the key 180 * to reset the delalloc bit on things that already have
329 * to show we are doing inline data in the extent 181 * extents reserved.
330 */ 182 */
331 inline_size = end_pos; 183 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
332 if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 184 for (i = 0; i < num_pages; i++) {
333 inline_size > root->fs_info->max_inline || 185 struct page *p = pages[i];
334 (inline_size & (root->sectorsize -1)) == 0 || 186 SetPageUptodate(p);
335 inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { 187 ClearPageChecked(p);
336 /* check for reserved extents on each page, we don't want 188 set_page_dirty(p);
337 * to reset the delalloc bit on things that already have
338 * extents reserved.
339 */
340 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
341 for (i = 0; i < num_pages; i++) {
342 struct page *p = pages[i];
343 SetPageUptodate(p);
344 ClearPageChecked(p);
345 set_page_dirty(p);
346 }
347 } else {
348 u64 aligned_end;
349 /* step one, delete the existing extents in this range */
350 aligned_end = (pos + write_bytes + root->sectorsize - 1) &
351 ~((u64)root->sectorsize - 1);
352 mutex_lock(&BTRFS_I(inode)->extent_mutex);
353 err = btrfs_drop_extents(trans, root, inode, start_pos,
354 aligned_end, aligned_end, &hint_byte);
355 if (err)
356 goto failed;
357 if (isize > inline_size)
358 inline_size = min_t(u64, isize, aligned_end);
359 inline_size -= start_pos;
360 err = insert_inline_extent(trans, root, inode, start_pos,
361 inline_size, pages, 0, num_pages);
362 btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
363 BUG_ON(err);
364 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
365
366 /*
367 * an ugly way to do all the prop accounting around
368 * the page bits and mapping tags
369 */
370 set_page_writeback(pages[0]);
371 end_page_writeback(pages[0]);
372 did_inline = 1;
373 } 189 }
374 if (end_pos > isize) { 190 if (end_pos > isize) {
375 i_size_write(inode, end_pos); 191 i_size_write(inode, end_pos);
376 if (did_inline)
377 BTRFS_I(inode)->disk_i_size = end_pos;
378 btrfs_update_inode(trans, root, inode); 192 btrfs_update_inode(trans, root, inode);
379 } 193 }
380failed: 194failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
399 int ret; 213 int ret;
400 int testend = 1; 214 int testend = 1;
401 unsigned long flags; 215 unsigned long flags;
216 int compressed = 0;
402 217
403 WARN_ON(end < start); 218 WARN_ON(end < start);
404 if (end == (u64)-1) { 219 if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
434 free_extent_map(em); 249 free_extent_map(em);
435 continue; 250 continue;
436 } 251 }
252 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
437 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 253 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
438 remove_extent_mapping(em_tree, em); 254 remove_extent_mapping(em_tree, em);
439 255
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
442 split->start = em->start; 258 split->start = em->start;
443 split->len = start - em->start; 259 split->len = start - em->start;
444 split->block_start = em->block_start; 260 split->block_start = em->block_start;
261
262 if (compressed)
263 split->block_len = em->block_len;
264 else
265 split->block_len = split->len;
266
445 split->bdev = em->bdev; 267 split->bdev = em->bdev;
446 split->flags = flags; 268 split->flags = flags;
447 ret = add_extent_mapping(em_tree, split); 269 ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
459 split->bdev = em->bdev; 281 split->bdev = em->bdev;
460 split->flags = flags; 282 split->flags = flags;
461 283
462 split->block_start = em->block_start + diff; 284 if (compressed) {
285 split->block_len = em->block_len;
286 split->block_start = em->block_start;
287 } else {
288 split->block_len = split->len;
289 split->block_start = em->block_start + diff;
290 }
463 291
464 ret = add_extent_mapping(em_tree, split); 292 ret = add_extent_mapping(em_tree, split);
465 BUG_ON(ret); 293 BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
533 struct btrfs_item *item; 361 struct btrfs_item *item;
534 item = btrfs_item_nr(leaf, slot); 362 item = btrfs_item_nr(leaf, slot);
535 extent_end = found_key.offset + 363 extent_end = found_key.offset +
536 btrfs_file_extent_inline_len(leaf, item); 364 btrfs_file_extent_inline_len(leaf, extent);
537 extent_end = (extent_end + root->sectorsize - 1) & 365 extent_end = (extent_end + root->sectorsize - 1) &
538 ~((u64)root->sectorsize -1 ); 366 ~((u64)root->sectorsize -1 );
539 } 367 }
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
573 u64 extent_end = 0; 401 u64 extent_end = 0;
574 u64 search_start = start; 402 u64 search_start = start;
575 u64 leaf_start; 403 u64 leaf_start;
404 u64 ram_bytes = 0;
405 u8 compression = 0;
406 u8 encryption = 0;
407 u16 other_encoding = 0;
576 u64 root_gen; 408 u64 root_gen;
577 u64 root_owner; 409 u64 root_owner;
578 struct extent_buffer *leaf; 410 struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
589 int recow; 421 int recow;
590 int ret; 422 int ret;
591 423
424 inline_limit = 0;
592 btrfs_drop_extent_cache(inode, start, end - 1, 0); 425 btrfs_drop_extent_cache(inode, start, end - 1, 0);
593 426
594 path = btrfs_alloc_path(); 427 path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
637 extent = btrfs_item_ptr(leaf, slot, 470 extent = btrfs_item_ptr(leaf, slot,
638 struct btrfs_file_extent_item); 471 struct btrfs_file_extent_item);
639 found_type = btrfs_file_extent_type(leaf, extent); 472 found_type = btrfs_file_extent_type(leaf, extent);
473 compression = btrfs_file_extent_compression(leaf,
474 extent);
475 encryption = btrfs_file_extent_encryption(leaf,
476 extent);
477 other_encoding = btrfs_file_extent_other_encoding(leaf,
478 extent);
640 if (found_type == BTRFS_FILE_EXTENT_REG) { 479 if (found_type == BTRFS_FILE_EXTENT_REG) {
641 extent_end = 480 extent_end =
642 btrfs_file_extent_disk_bytenr(leaf, 481 btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
646 485
647 extent_end = key.offset + 486 extent_end = key.offset +
648 btrfs_file_extent_num_bytes(leaf, extent); 487 btrfs_file_extent_num_bytes(leaf, extent);
488 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
489 extent);
649 found_extent = 1; 490 found_extent = 1;
650 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 491 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
651 struct btrfs_item *item;
652 item = btrfs_item_nr(leaf, slot);
653 found_inline = 1; 492 found_inline = 1;
654 extent_end = key.offset + 493 extent_end = key.offset +
655 btrfs_file_extent_inline_len(leaf, item); 494 btrfs_file_extent_inline_len(leaf, extent);
656 } 495 }
657 } else { 496 } else {
658 extent_end = search_start; 497 extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
680 search_start = (extent_end + mask) & ~mask; 519 search_start = (extent_end + mask) & ~mask;
681 } else 520 } else
682 search_start = extent_end; 521 search_start = extent_end;
683 if (end <= extent_end && start >= key.offset && found_inline) { 522
523 if (end <= extent_end && start >= key.offset && found_inline)
684 *hint_byte = EXTENT_MAP_INLINE; 524 *hint_byte = EXTENT_MAP_INLINE;
685 goto out;
686 }
687 525
688 if (found_extent) { 526 if (found_extent) {
689 read_extent_buffer(leaf, &old, (unsigned long)extent, 527 read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
770 write_extent_buffer(leaf, &old, 608 write_extent_buffer(leaf, &old,
771 (unsigned long)extent, sizeof(old)); 609 (unsigned long)extent, sizeof(old));
772 610
611 btrfs_set_file_extent_compression(leaf, extent,
612 compression);
613 btrfs_set_file_extent_encryption(leaf, extent,
614 encryption);
615 btrfs_set_file_extent_other_encoding(leaf, extent,
616 other_encoding);
773 btrfs_set_file_extent_offset(leaf, extent, 617 btrfs_set_file_extent_offset(leaf, extent,
774 le64_to_cpu(old.offset) + end - key.offset); 618 le64_to_cpu(old.offset) + end - key.offset);
775 WARN_ON(le64_to_cpu(old.num_bytes) < 619 WARN_ON(le64_to_cpu(old.num_bytes) <
776 (extent_end - end)); 620 (extent_end - end));
777 btrfs_set_file_extent_num_bytes(leaf, extent, 621 btrfs_set_file_extent_num_bytes(leaf, extent,
778 extent_end - end); 622 extent_end - end);
623
624 /*
625 * set the ram bytes to the size of the full extent
626 * before splitting. This is a worst case flag,
627 * but it's the best we can do because we don't know
628 * how splitting affects compression
629 */
630 btrfs_set_file_extent_ram_bytes(leaf, extent,
631 ram_bytes);
779 btrfs_set_file_extent_type(leaf, extent, 632 btrfs_set_file_extent_type(leaf, extent,
780 BTRFS_FILE_EXTENT_REG); 633 BTRFS_FILE_EXTENT_REG);
781 634
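The worst-case ram_bytes rule is easiest to see with numbers (all hypothetical): a 128K extent compressed to 32K on disk and split at 64K leaves two items pointing at the same 32K of disk blocks, and each keeps ram_bytes = 128K because reading either half still means inflating the whole compressed blob:

	before: offset=0    num_bytes=128K  ram_bytes=128K  disk_num_bytes=32K
	after:  offset=0    num_bytes=64K   ram_bytes=128K  disk_num_bytes=32K
	        offset=64K  num_bytes=64K   ram_bytes=128K  disk_num_bytes=32K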
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d6..9797592dc86b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
49#include "compat.h" 49#include "compat.h"
50#include "tree-log.h" 50#include "tree-log.h"
51#include "ref-cache.h" 51#include "ref-cache.h"
52#include "compression.h"
52 53
53struct btrfs_iget_args { 54struct btrfs_iget_args {
54 u64 ino; 55 u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
83}; 84};
84 85
85static void btrfs_truncate(struct inode *inode); 86static void btrfs_truncate(struct inode *inode);
87static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
86 88
87/* 89/*
88 * a very lame attempt at stopping writes when the FS is 85% full. There 90 * a very lame attempt at stopping writes when the FS is 85% full. There
@@ -114,57 +116,374 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
114} 116}
115 117
116/* 118/*
119 * this does all the hard work for inserting an inline extent into
120 * the btree. The caller should have done a btrfs_drop_extents so that
121 * no overlapping inline items exist in the btree
122 */
123static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
124 struct btrfs_root *root, struct inode *inode,
125 u64 start, size_t size, size_t compressed_size,
126 struct page **compressed_pages)
127{
128 struct btrfs_key key;
129 struct btrfs_path *path;
130 struct extent_buffer *leaf;
131 struct page *page = NULL;
132 char *kaddr;
133 unsigned long ptr;
134 struct btrfs_file_extent_item *ei;
135 int err = 0;
136 int ret;
137 size_t cur_size = size;
138 size_t datasize;
139 unsigned long offset;
140 int use_compress = 0;
141
142 if (compressed_size && compressed_pages) {
143 use_compress = 1;
144 cur_size = compressed_size;
145 }
146
147 path = btrfs_alloc_path();
148 if (!path)
149 return -ENOMEM;
149
150 btrfs_set_trans_block_group(trans, inode);
151
152 key.objectid = inode->i_ino;
153 key.offset = start;
154 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
155 inode_add_bytes(inode, size);
156 datasize = btrfs_file_extent_calc_inline_size(cur_size);
157
159 ret = btrfs_insert_empty_item(trans, root, path, &key,
160 datasize);
162 if (ret) {
163 err = ret;
164 printk("got bad ret %d\n", ret);
165 goto fail;
166 }
167 leaf = path->nodes[0];
168 ei = btrfs_item_ptr(leaf, path->slots[0],
169 struct btrfs_file_extent_item);
170 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
171 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
172 btrfs_set_file_extent_encryption(leaf, ei, 0);
173 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
174 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
175 ptr = btrfs_file_extent_inline_start(ei);
176
177 if (use_compress) {
178 struct page *cpage;
179 int i = 0;
180 while(compressed_size > 0) {
181 cpage = compressed_pages[i];
182 cur_size = min(compressed_size,
183 PAGE_CACHE_SIZE);
184
185 kaddr = kmap(cpage);
186 write_extent_buffer(leaf, kaddr, ptr, cur_size);
187 kunmap(cpage);
188
189 i++;
190 ptr += cur_size;
191 compressed_size -= cur_size;
192 }
193 btrfs_set_file_extent_compression(leaf, ei,
194 BTRFS_COMPRESS_ZLIB);
195 } else {
196 page = find_get_page(inode->i_mapping,
197 start >> PAGE_CACHE_SHIFT);
198 btrfs_set_file_extent_compression(leaf, ei, 0);
199 kaddr = kmap_atomic(page, KM_USER0);
200 offset = start & (PAGE_CACHE_SIZE - 1);
201 write_extent_buffer(leaf, kaddr + offset, ptr, size);
202 kunmap_atomic(kaddr, KM_USER0);
203 page_cache_release(page);
204 }
205 btrfs_mark_buffer_dirty(leaf);
206 btrfs_free_path(path);
207
208 BTRFS_I(inode)->disk_i_size = inode->i_size;
209 btrfs_update_inode(trans, root, inode);
210 return 0;
211fail:
212 btrfs_free_path(path);
213 return err;
214}
215
216
217/*
218 * conditionally insert an inline extent into the file. This
219 * does the checks required to make sure the data is small enough
220 * to fit as an inline extent.
221 */
222static int cow_file_range_inline(struct btrfs_trans_handle *trans,
223 struct btrfs_root *root,
224 struct inode *inode, u64 start, u64 end,
225 size_t compressed_size,
226 struct page **compressed_pages)
227{
228 u64 isize = i_size_read(inode);
229 u64 actual_end = min(end + 1, isize);
230 u64 inline_len = actual_end - start;
231 u64 aligned_end = (end + root->sectorsize - 1) &
232 ~((u64)root->sectorsize - 1);
233 u64 hint_byte;
234 u64 data_len = inline_len;
235 int ret;
236
237 if (compressed_size)
238 data_len = compressed_size;
239
240 if (start > 0 ||
241 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
242 (!compressed_size &&
243 (actual_end & (root->sectorsize - 1)) == 0) ||
244 end + 1 < isize ||
245 data_len > root->fs_info->max_inline) {
246 return 1;
247 }
248
249 mutex_lock(&BTRFS_I(inode)->extent_mutex);
250 ret = btrfs_drop_extents(trans, root, inode, start,
251 aligned_end, aligned_end, &hint_byte);
252 BUG_ON(ret);
253
254 if (isize > actual_end)
255 inline_len = min_t(u64, isize, actual_end);
256 ret = insert_inline_extent(trans, root, inode, start,
257 inline_len, compressed_size,
258 compressed_pages);
259 BUG_ON(ret);
260 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
261 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
262 return 0;
263}
264
265/*
117 * when extent_io.c finds a delayed allocation range in the file, 266 * when extent_io.c finds a delayed allocation range in the file,
118 * the call backs end up in this code. The basic idea is to 267 * the call backs end up in this code. The basic idea is to
119 * allocate extents on disk for the range, and create ordered data structs 268 * allocate extents on disk for the range, and create ordered data structs
120 * in ram to track those extents. 269 * in ram to track those extents.
270 *
271 * locked_page is the page that writepage had locked already. We use
272 * it to make sure we don't do extra locks or unlocks.
273 *
274 * *page_started is set to one if we unlock locked_page and do everything
275 * required to start IO on it. It may be clean and already done with
276 * IO when we return.
121 */ 277 */
122static int cow_file_range(struct inode *inode, u64 start, u64 end) 278static int cow_file_range(struct inode *inode, struct page *locked_page,
279 u64 start, u64 end, int *page_started)
123{ 280{
124 struct btrfs_root *root = BTRFS_I(inode)->root; 281 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct btrfs_trans_handle *trans; 282 struct btrfs_trans_handle *trans;
126 u64 alloc_hint = 0; 283 u64 alloc_hint = 0;
127 u64 num_bytes; 284 u64 num_bytes;
285 unsigned long ram_size;
286 u64 orig_start;
287 u64 disk_num_bytes;
128 u64 cur_alloc_size; 288 u64 cur_alloc_size;
129 u64 blocksize = root->sectorsize; 289 u64 blocksize = root->sectorsize;
130 u64 orig_num_bytes; 290 u64 actual_end;
131 struct btrfs_key ins; 291 struct btrfs_key ins;
132 struct extent_map *em; 292 struct extent_map *em;
133 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 293 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
134 int ret = 0; 294 int ret = 0;
295 struct page **pages = NULL;
296 unsigned long nr_pages;
297 unsigned long nr_pages_ret = 0;
298 unsigned long total_compressed = 0;
299 unsigned long total_in = 0;
300 unsigned long max_compressed = 128 * 1024;
301 unsigned long max_uncompressed = 256 * 1024;
302 int i;
303 int will_compress;
135 304
136 trans = btrfs_join_transaction(root, 1); 305 trans = btrfs_join_transaction(root, 1);
137 BUG_ON(!trans); 306 BUG_ON(!trans);
138 btrfs_set_trans_block_group(trans, inode); 307 btrfs_set_trans_block_group(trans, inode);
308 orig_start = start;
309
310 /*
311 * compression made this loop a bit ugly, but the basic idea is to
312 * compress some pages but keep the total size of the compressed
313 * extent relatively small. If compression is off, this goto target
314 * is never used.
315 */
316again:
317 will_compress = 0;
318 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
319 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
139 320
321 actual_end = min_t(u64, i_size_read(inode), end + 1);
322 total_compressed = actual_end - start;
323
324 /* we want to make sure the amount of ram required to uncompress
325 * an extent is reasonable, so we limit the total size in ram
326 * of a compressed extent to 256k
327 */
328 total_compressed = min(total_compressed, max_uncompressed);
140 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 329 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
141 num_bytes = max(blocksize, num_bytes); 330 num_bytes = max(blocksize, num_bytes);
142 orig_num_bytes = num_bytes; 331 disk_num_bytes = num_bytes;
332 total_in = 0;
333 ret = 0;
143 334
144 if (alloc_hint == EXTENT_MAP_INLINE) 335 /* we do compression for mount -o compress and when the
145 goto out; 336 * inode has not been flagged as nocompress
337 */
338 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
339 btrfs_test_opt(root, COMPRESS)) {
340 WARN_ON(pages);
341 pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
342
343 /* we want to make sure the amount of IO required to satisfy
344 * a random read is reasonably small, so we limit the size
345 * of a compressed extent to 128k
346 */
347 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
348 total_compressed, pages,
349 nr_pages, &nr_pages_ret,
350 &total_in,
351 &total_compressed,
352 max_compressed);
353
354 if (!ret) {
355 unsigned long offset = total_compressed &
356 (PAGE_CACHE_SIZE - 1);
357 struct page *page = pages[nr_pages_ret - 1];
358 char *kaddr;
359
360 /* zero the tail end of the last page, we might be
361 * sending it down to disk
362 */
363 if (offset) {
364 kaddr = kmap_atomic(page, KM_USER0);
365 memset(kaddr + offset, 0,
366 PAGE_CACHE_SIZE - offset);
367 kunmap_atomic(kaddr, KM_USER0);
368 }
369 will_compress = 1;
370 }
371 }
372 if (start == 0) {
373 /* lets try to make an inline extent */
374 if (ret || total_in < (end - start + 1)) {
375 /* we didn't compress the entire range, try
376 * to make an uncompressed inline extent. This
377 * is almost sure to fail, but maybe inline sizes
378 * will get bigger later
379 */
380 ret = cow_file_range_inline(trans, root, inode,
381 start, end, 0, NULL);
382 } else {
383 ret = cow_file_range_inline(trans, root, inode,
384 start, end,
385 total_compressed, pages);
386 }
387 if (ret == 0) {
388 extent_clear_unlock_delalloc(inode,
389 &BTRFS_I(inode)->io_tree,
390 start, end, NULL,
391 1, 1, 1);
392 *page_started = 1;
393 ret = 0;
394 goto free_pages_out;
395 }
396 }
397
398 if (will_compress) {
399 /*
400 * we aren't doing an inline extent round the compressed size
401 * up to a block size boundary so the allocator does sane
402 * things
403 */
404 total_compressed = (total_compressed + blocksize - 1) &
405 ~(blocksize - 1);
406
407 /*
408 * one last check to make sure the compression is really a
409 * win, compare the page count read with the blocks on disk
410 */
411 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
412 ~(PAGE_CACHE_SIZE - 1);
413 if (total_compressed >= total_in) {
414 will_compress = 0;
415 } else {
416 disk_num_bytes = total_compressed;
417 num_bytes = total_in;
418 }
419 }
420 if (!will_compress && pages) {
421 /*
422 * the compression code ran but failed to make things smaller,
423 * free any pages it allocated and our page pointer array
424 */
425 for (i = 0; i < nr_pages_ret; i++) {
426 page_cache_release(pages[i]);
427 }
428 kfree(pages);
429 pages = NULL;
430 total_compressed = 0;
431 nr_pages_ret = 0;
432
433 /* flag the file so we don't compress in the future */
434 btrfs_set_flag(inode, NOCOMPRESS);
435 }
436
437 BUG_ON(disk_num_bytes >
438 btrfs_super_total_bytes(&root->fs_info->super_copy));
146 439
147 BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
148 mutex_lock(&BTRFS_I(inode)->extent_mutex); 440 mutex_lock(&BTRFS_I(inode)->extent_mutex);
149 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 441 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
150 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 442 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
151 443
152 while(num_bytes > 0) { 444 while(disk_num_bytes > 0) {
153 cur_alloc_size = min(num_bytes, root->fs_info->max_extent); 445 unsigned long min_bytes;
446
447 /*
448 * the max size of a compressed extent is pretty small,
449 * make the code a little less complex by forcing
450 * the allocator to find a whole compressed extent at once
451 */
452 if (will_compress)
453 min_bytes = disk_num_bytes;
454 else
455 min_bytes = root->sectorsize;
456
457 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
154 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 458 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
155 root->sectorsize, 0, alloc_hint, 459 min_bytes, 0, alloc_hint,
156 (u64)-1, &ins, 1); 460 (u64)-1, &ins, 1);
157 if (ret) { 461 if (ret) {
158 WARN_ON(1); 462 WARN_ON(1);
159 goto out; 463 goto free_pages_out_fail;
160 } 464 }
161 em = alloc_extent_map(GFP_NOFS); 465 em = alloc_extent_map(GFP_NOFS);
162 em->start = start; 466 em->start = start;
163 em->len = ins.offset; 467
468 if (will_compress) {
469 ram_size = num_bytes;
470 em->len = num_bytes;
471 } else {
472 /* ramsize == disk size */
473 ram_size = ins.offset;
474 em->len = ins.offset;
475 }
476
164 em->block_start = ins.objectid; 477 em->block_start = ins.objectid;
478 em->block_len = ins.offset;
165 em->bdev = root->fs_info->fs_devices->latest_bdev; 479 em->bdev = root->fs_info->fs_devices->latest_bdev;
480
166 mutex_lock(&BTRFS_I(inode)->extent_mutex); 481 mutex_lock(&BTRFS_I(inode)->extent_mutex);
167 set_bit(EXTENT_FLAG_PINNED, &em->flags); 482 set_bit(EXTENT_FLAG_PINNED, &em->flags);
483
484 if (will_compress)
485 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
486
168 while(1) { 487 while(1) {
169 spin_lock(&em_tree->lock); 488 spin_lock(&em_tree->lock);
170 ret = add_extent_mapping(em_tree, em); 489 ret = add_extent_mapping(em_tree, em);
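The eligibility tests in cow_file_range_inline() condense to a small predicate; restated as one function (the name is invented for illustration):

/*
 * Illustrative restatement of the checks in cow_file_range_inline():
 * only a small extent that starts at file offset 0 and runs to i_size
 * may be stored inline.
 */
static int can_inline(struct btrfs_root *root, u64 start, u64 end,
		      u64 isize, u64 actual_end, u64 data_len,
		      int compressed)
{
	if (start > 0)
		return 0;	/* must begin at file offset 0 */
	if (data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root))
		return 0;	/* must fit inside one leaf item */
	if (!compressed && (actual_end & (root->sectorsize - 1)) == 0)
		return 0;	/* full uncompressed sectors gain nothing */
	if (end + 1 < isize)
		return 0;	/* data after this range rules it out */
	if (data_len > root->fs_info->max_inline)
		return 0;	/* honor the max_inline mount option */
	return 1;
}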
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
174 break; 493 break;
175 } 494 }
176 btrfs_drop_extent_cache(inode, start, 495 btrfs_drop_extent_cache(inode, start,
177 start + ins.offset - 1, 0); 496 start + ram_size - 1, 0);
178 } 497 }
179 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 498 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
180 499
181 cur_alloc_size = ins.offset; 500 cur_alloc_size = ins.offset;
182 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 501 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
183 ins.offset, 0); 502 ram_size, cur_alloc_size, 0,
503 will_compress);
184 BUG_ON(ret); 504 BUG_ON(ret);
185 if (num_bytes < cur_alloc_size) { 505
186 printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes, 506 if (disk_num_bytes < cur_alloc_size) {
507 printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
187 cur_alloc_size); 508 cur_alloc_size);
188 break; 509 break;
189 } 510 }
511
512 if (will_compress) {
513 /*
514 * we're doing compression, and we need to
515 * submit the compressed extents down to the device.
516 *
517 * We lock down all the file pages, clearing their
518 * dirty bits and setting them writeback. Everyone
519 * that wants to modify the page will wait on the
520 * ordered extent above.
521 *
522 * The writeback bits on the file pages are
523 * cleared when the compressed pages are on disk
524 */
525 btrfs_end_transaction(trans, root);
526
527 if (start <= page_offset(locked_page) &&
528 page_offset(locked_page) < start + ram_size) {
529 *page_started = 1;
530 }
531
532 extent_clear_unlock_delalloc(inode,
533 &BTRFS_I(inode)->io_tree,
534 start,
535 start + ram_size - 1,
536 NULL, 1, 1, 0);
537
538 ret = btrfs_submit_compressed_write(inode, start,
539 ram_size, ins.objectid,
540 cur_alloc_size, pages,
541 nr_pages_ret);
542
543 BUG_ON(ret);
544 trans = btrfs_join_transaction(root, 1);
545 if (start + ram_size < end) {
546 start += ram_size;
547 alloc_hint = ins.objectid + ins.offset;
548 /* pages will be freed at end_bio time */
549 pages = NULL;
550 goto again;
551 } else {
552 /* we've written everything, time to go */
553 break;
554 }
555 }
556 /* we're not doing compressed IO, don't unlock the first
557 * page (which the caller expects to stay locked), don't
558 * clear any dirty bits and don't set any writeback bits
559 */
560 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
561 start, start + ram_size - 1,
562 locked_page, 0, 0, 0);
563 disk_num_bytes -= cur_alloc_size;
190 num_bytes -= cur_alloc_size; 564 num_bytes -= cur_alloc_size;
191 alloc_hint = ins.objectid + ins.offset; 565 alloc_hint = ins.objectid + ins.offset;
192 start += cur_alloc_size; 566 start += cur_alloc_size;
193 } 567 }
568
569 ret = 0;
194out: 570out:
195 btrfs_end_transaction(trans, root); 571 btrfs_end_transaction(trans, root);
572
196 return ret; 573 return ret;
574
575free_pages_out_fail:
576 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
577 start, end, locked_page, 0, 0, 0);
578free_pages_out:
579 for (i = 0; i < nr_pages_ret; i++)
580 page_cache_release(pages[i]);
581 if (pages)
582 kfree(pages);
583
584 goto out;
197} 585}
198 586
199/* 587/*
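The "one last check" above compares like with like by rounding both sides: compressed bytes up to a disk block, uncompressed bytes up to whole pages. Pulled out as a sketch:

/*
 * Sketch of the win test in cow_file_range(): compression is kept
 * only if the rounded compressed size beats the rounded input size.
 */
static int compression_wins(u64 blocksize, unsigned long total_in,
			    unsigned long total_compressed)
{
	total_compressed = (total_compressed + blocksize - 1) &
			   ~(blocksize - 1);
	total_in = (total_in + PAGE_CACHE_SIZE - 1) &
		   ~(PAGE_CACHE_SIZE - 1);
	return total_compressed < total_in;
}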
@@ -203,7 +591,8 @@ out:
203 * If no cow copies or snapshots exist, we write directly to the existing 591 * If no cow copies or snapshots exist, we write directly to the existing
204 * blocks on disk 592 * blocks on disk
205 */ 593 */
206static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) 594static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
595 u64 start, u64 end, int *page_started)
207{ 596{
208 u64 extent_start; 597 u64 extent_start;
209 u64 extent_end; 598 u64 extent_end;
@@ -260,6 +649,11 @@ again:
260 extent_end = extent_start + extent_num_bytes; 649 extent_end = extent_start + extent_num_bytes;
261 err = 0; 650 err = 0;
262 651
652 if (btrfs_file_extent_compression(leaf, item) ||
653 btrfs_file_extent_encryption(leaf,item) ||
654 btrfs_file_extent_other_encoding(leaf, item))
655 goto not_found;
656
263 if (loops && start != extent_start) 657 if (loops && start != extent_start)
264 goto not_found; 658 goto not_found;
265 659
@@ -284,7 +678,8 @@ again:
284 bytenr += btrfs_file_extent_offset(leaf, item); 678 bytenr += btrfs_file_extent_offset(leaf, item);
285 extent_num_bytes = min(end + 1, extent_end) - start; 679 extent_num_bytes = min(end + 1, extent_end) - start;
286 ret = btrfs_add_ordered_extent(inode, start, bytenr, 680 ret = btrfs_add_ordered_extent(inode, start, bytenr,
287 extent_num_bytes, 1); 681 extent_num_bytes,
682 extent_num_bytes, 1, 0);
288 if (ret) { 683 if (ret) {
289 err = ret; 684 err = ret;
290 goto out; 685 goto out;
@@ -300,7 +695,8 @@ again:
300not_found: 695not_found:
301 btrfs_end_transaction(trans, root); 696 btrfs_end_transaction(trans, root);
302 btrfs_free_path(path); 697 btrfs_free_path(path);
303 return cow_file_range(inode, start, end); 698 return cow_file_range(inode, locked_page, start, end,
699 page_started);
304 } 700 }
305out: 701out:
306 WARN_ON(err); 702 WARN_ON(err);
@@ -312,16 +708,19 @@ out:
312/* 708/*
313 * extent_io.c call back to do delayed allocation processing 709 * extent_io.c call back to do delayed allocation processing
314 */ 710 */
315static int run_delalloc_range(struct inode *inode, u64 start, u64 end) 711static int run_delalloc_range(struct inode *inode, struct page *locked_page,
712 u64 start, u64 end, int *page_started)
316{ 713{
317 struct btrfs_root *root = BTRFS_I(inode)->root; 714 struct btrfs_root *root = BTRFS_I(inode)->root;
318 int ret; 715 int ret;
319 716
320 if (btrfs_test_opt(root, NODATACOW) || 717 if (btrfs_test_opt(root, NODATACOW) ||
321 btrfs_test_flag(inode, NODATACOW)) 718 btrfs_test_flag(inode, NODATACOW))
322 ret = run_delalloc_nocow(inode, start, end); 719 ret = run_delalloc_nocow(inode, locked_page, start, end,
720 page_started);
323 else 721 else
324 ret = cow_file_range(inode, start, end); 722 ret = cow_file_range(inode, locked_page, start, end,
723 page_started);
325 724
326 return ret; 725 return ret;
327} 726}
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
383 * we don't create bios that span stripes or chunks 782 * we don't create bios that span stripes or chunks
384 */ 783 */
385int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 784int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
386 size_t size, struct bio *bio) 785 size_t size, struct bio *bio,
786 unsigned long bio_flags)
387{ 787{
388 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 788 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
389 struct btrfs_mapping_tree *map_tree; 789 struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
413 * are inserted into the btree 813 * are inserted into the btree
414 */ 814 */
415int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 815int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
416 int mirror_num) 816 int mirror_num, unsigned long bio_flags)
417{ 817{
418 struct btrfs_root *root = BTRFS_I(inode)->root; 818 struct btrfs_root *root = BTRFS_I(inode)->root;
419 int ret = 0; 819 int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
429 * or reading the csums from the tree before a read 829 * or reading the csums from the tree before a read
430 */ 830 */
431int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 831int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
432 int mirror_num) 832 int mirror_num, unsigned long bio_flags)
433{ 833{
434 struct btrfs_root *root = BTRFS_I(inode)->root; 834 struct btrfs_root *root = BTRFS_I(inode)->root;
435 int ret = 0; 835 int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
444 844
445 if (!(rw & (1 << BIO_RW))) { 845 if (!(rw & (1 << BIO_RW))) {
446 btrfs_lookup_bio_sums(root, inode, bio); 846 btrfs_lookup_bio_sums(root, inode, bio);
847
848 if (bio_flags & EXTENT_BIO_COMPRESSED) {
849 return btrfs_submit_compressed_read(inode, bio,
850 mirror_num, bio_flags);
851 }
852
447 goto mapit; 853 goto mapit;
448 } 854 }
449 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 855 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
450 inode, rw, bio, mirror_num, 856 inode, rw, bio, mirror_num,
451 __btrfs_submit_bio_hook); 857 bio_flags, __btrfs_submit_bio_hook);
452mapit: 858mapit:
453 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 859 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
454} 860}
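On the read side the routing is now: look up checksums first, then hand compressed bios to btrfs_submit_compressed_read() while everything else goes straight to the device. Condensed into a sketch (the wrapper name is invented; the calls mirror the hunk above):

static int route_read_bio(struct btrfs_root *root, struct inode *inode,
			  int rw, struct bio *bio, int mirror_num,
			  unsigned long bio_flags)
{
	btrfs_lookup_bio_sums(root, inode, bio);
	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}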
@@ -539,7 +945,7 @@ out_page:
539 * good idea. This causes problems because we want to make sure COW 945 * good idea. This causes problems because we want to make sure COW
540 * properly happens and the data=ordered rules are followed. 946 * properly happens and the data=ordered rules are followed.
541 * 947 *
542 * In our case any range that doesn't have the EXTENT_ORDERED bit set 948 * In our case any range that doesn't have the ORDERED bit set
543 * hasn't been properly setup for IO. We kick off an async process 949 * hasn't been properly setup for IO. We kick off an async process
544 * to fix it up. The async helper will wait for ordered extents, set 950 * to fix it up. The async helper will wait for ordered extents, set
545 * the delalloc bit and make it safe to write the page. 951 * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
632 btrfs_set_file_extent_disk_bytenr(leaf, extent_item, 1038 btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
633 ordered_extent->start); 1039 ordered_extent->start);
634 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item, 1040 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
635 ordered_extent->len); 1041 ordered_extent->disk_len);
636 btrfs_set_file_extent_offset(leaf, extent_item, 0); 1042 btrfs_set_file_extent_offset(leaf, extent_item, 0);
1043
1044 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1045 btrfs_set_file_extent_compression(leaf, extent_item, 1);
1046 else
1047 btrfs_set_file_extent_compression(leaf, extent_item, 0);
1048 btrfs_set_file_extent_encryption(leaf, extent_item, 0);
1049 btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
1050
1051 /* ram bytes = extent_num_bytes for now */
637 btrfs_set_file_extent_num_bytes(leaf, extent_item, 1052 btrfs_set_file_extent_num_bytes(leaf, extent_item,
638 ordered_extent->len); 1053 ordered_extent->len);
1054 btrfs_set_file_extent_ram_bytes(leaf, extent_item,
1055 ordered_extent->len);
639 btrfs_mark_buffer_dirty(leaf); 1056 btrfs_mark_buffer_dirty(leaf);
640 1057
641 btrfs_drop_extent_cache(inode, ordered_extent->file_offset, 1058 btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
644 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 1061 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
645 1062
646 ins.objectid = ordered_extent->start; 1063 ins.objectid = ordered_extent->start;
647 ins.offset = ordered_extent->len; 1064 ins.offset = ordered_extent->disk_len;
648 ins.type = BTRFS_EXTENT_ITEM_KEY; 1065 ins.type = BTRFS_EXTENT_ITEM_KEY;
649 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, 1066 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
650 root->root_key.objectid, 1067 root->root_key.objectid,
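
With compression, an ordered extent tracks two lengths: len is the uncompressed (ram) byte count, stored into both num_bytes and ram_bytes of the file extent item, while disk_len is what was actually allocated and goes into disk_num_bytes (and into ins.offset when the extent is recorded). A hedged illustration, not code from the patch:

/* illustrative only: how the sizes line up once the I/O completes */
static void ordered_extent_sizes(struct btrfs_ordered_extent *oe,
				 u64 *on_disk, u64 *in_ram)
{
	*in_ram = oe->len;	/* uncompressed length of the range */
	*on_disk = oe->disk_len;	/* compressed size when the
					 * BTRFS_ORDERED_COMPRESSED bit is
					 * set, otherwise equal to len */
}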
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
714 int ret; 1131 int ret;
715 int rw; 1132 int rw;
716 u64 logical; 1133 u64 logical;
1134 unsigned long bio_flags = 0;
717 1135
718 ret = get_state_private(failure_tree, start, &private); 1136 ret = get_state_private(failure_tree, start, &private);
719 if (ret) { 1137 if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
738 } 1156 }
739 logical = start - em->start; 1157 logical = start - em->start;
740 logical = em->block_start + logical; 1158 logical = em->block_start + logical;
1159 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1160 bio_flags = EXTENT_BIO_COMPRESSED;
741 failrec->logical = logical; 1161 failrec->logical = logical;
742 free_extent_map(em); 1162 free_extent_map(em);
743 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | 1163 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
781 rw = READ; 1201 rw = READ;
782 1202
783 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1203 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
784 failrec->last_mirror); 1204 failrec->last_mirror,
1205 bio_flags);
785 return 0; 1206 return 0;
786} 1207}
787 1208
@@ -1644,10 +2065,8 @@ search_again:
1644 item_end += 2065 item_end +=
1645 btrfs_file_extent_num_bytes(leaf, fi); 2066 btrfs_file_extent_num_bytes(leaf, fi);
1646 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2067 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1647 struct btrfs_item *item = btrfs_item_nr(leaf,
1648 path->slots[0]);
1649 item_end += btrfs_file_extent_inline_len(leaf, 2068 item_end += btrfs_file_extent_inline_len(leaf,
1650 item); 2069 fi);
1651 } 2070 }
1652 item_end--; 2071 item_end--;
1653 } 2072 }
@@ -1715,7 +2134,14 @@ search_again:
1715 root_owner = btrfs_header_owner(leaf); 2134 root_owner = btrfs_header_owner(leaf);
1716 } 2135 }
1717 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2136 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1718 if (!del_item) { 2137 /*
2138 * we can't truncate inline items that have had
2139 * special encodings
2140 */
2141 if (!del_item &&
2142 btrfs_file_extent_compression(leaf, fi) == 0 &&
2143 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2144 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
1719 u32 size = new_size - found_key.offset; 2145 u32 size = new_size - found_key.offset;
1720 2146
1721 if (root->ref_cows) { 2147 if (root->ref_cows) {
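
The truncate path can shorten an inline extent in place only when the payload is stored verbatim; the three-field test above reads naturally as one predicate (a sketch using the same getters as the patch):

/* sketch: true if the inline payload is raw file data that can be
 * truncated byte-for-byte */
static int inline_extent_is_plain(struct extent_buffer *leaf,
				  struct btrfs_file_extent_item *fi)
{
	return btrfs_file_extent_compression(leaf, fi) == 0 &&
	       btrfs_file_extent_encryption(leaf, fi) == 0 &&
	       btrfs_file_extent_other_encoding(leaf, fi) == 0;
}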
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
1926 err = btrfs_insert_file_extent(trans, root, 2352 err = btrfs_insert_file_extent(trans, root,
1927 inode->i_ino, 2353 inode->i_ino,
1928 hole_start, 0, 0, 2354 hole_start, 0, 0,
1929 hole_size, 0); 2355 hole_size, 0, hole_size,
2356 0, 0, 0);
1930 btrfs_drop_extent_cache(inode, hole_start, 2357 btrfs_drop_extent_cache(inode, hole_start,
1931 (u64)-1, 0); 2358 (u64)-1, 0);
1932 btrfs_check_file(root, inode); 2359 btrfs_check_file(root, inode);
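
btrfs_insert_file_extent grows four trailing arguments in this commit; the widened hole insertion above presumably decodes as follows (parameter names are assumed from the call site, since file-item.c is not shown here):

/* hedged wrapper for the call above; the argument names are guesses */
static int insert_hole_extent(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 objectid,
			      u64 pos, u64 hole_size)
{
	return btrfs_insert_file_extent(trans, root, objectid, pos,
					0,		/* disk_bytenr: hole */
					0,		/* disk_num_bytes */
					hole_size,	/* num_bytes */
					0,		/* offset */
					hole_size,	/* ram_bytes */
					0, 0, 0);	/* compression,
							 * encryption,
							 * other_encoding */
}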
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
2894 start_diff = map_start - em->start; 3321 start_diff = map_start - em->start;
2895 em->start = map_start; 3322 em->start = map_start;
2896 em->len = map_len; 3323 em->len = map_len;
2897 if (em->block_start < EXTENT_MAP_LAST_BYTE) 3324 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3325 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2898 em->block_start += start_diff; 3326 em->block_start += start_diff;
3327 em->block_len -= start_diff;
3328 }
2899 return add_extent_mapping(em_tree, em); 3329 return add_extent_mapping(em_tree, em);
2900} 3330}
2901 3331
3332static noinline int uncompress_inline(struct btrfs_path *path,
3333 struct inode *inode, struct page *page,
3334 size_t pg_offset, u64 extent_offset,
3335 struct btrfs_file_extent_item *item)
3336{
3337 int ret;
3338 struct extent_buffer *leaf = path->nodes[0];
3339 char *tmp;
3340 size_t max_size;
3341 unsigned long inline_size;
3342 unsigned long ptr;
3343
3344 WARN_ON(pg_offset != 0);
3345 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3346 inline_size = btrfs_file_extent_inline_item_len(leaf,
3347 btrfs_item_nr(leaf, path->slots[0]));
3348 tmp = kmalloc(inline_size, GFP_NOFS);
3349 ptr = btrfs_file_extent_inline_start(item);
3350
3351 read_extent_buffer(leaf, tmp, ptr, inline_size);
3352
3353 max_size = min(PAGE_CACHE_SIZE, max_size);
3354 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3355 inline_size, max_size);
3356 if (ret) {
3357 char *kaddr = kmap_atomic(page, KM_USER0);
3358 unsigned long copy_size = min_t(u64,
3359 PAGE_CACHE_SIZE - pg_offset,
3360 max_size - extent_offset);
3361 memset(kaddr + pg_offset, 0, copy_size);
3362 kunmap_atomic(kaddr, KM_USER0);
3363 }
3364 kfree(tmp);
3365 return 0;
3366}
3367
2902/* 3368/*
2903 * a bit scary, this does extent mapping from logical file offset to the disk. 3369 * a bit scary, this does extent mapping from logical file offset to the disk.
2904 * the ugly parts come from merging extents from the disk with the 3370 * the ugly parts come from merging extents from the disk with the
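
The guard added to merge_extent_mapping encodes the central invariant for cached compressed mappings: the logical window may be trimmed, but the disk window must stay put, because the compressed bytes are only meaningful as a whole. A hedged restatement in isolation:

/* sketch: trim 'diff' bytes off the front of a cached mapping */
static void trim_map_front(struct extent_map *em, u64 diff)
{
	em->start += diff;
	em->len -= diff;
	/* only a plain extent can slide its disk window with the trim;
	 * a compressed blob keeps block_start/block_len untouched */
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += diff;
		em->block_len -= diff;
	}
}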
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2927 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3393 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2928 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3394 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2929 struct btrfs_trans_handle *trans = NULL; 3395 struct btrfs_trans_handle *trans = NULL;
3396 int compressed;
2930 3397
2931again: 3398again:
2932 spin_lock(&em_tree->lock); 3399 spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
2951 em->bdev = root->fs_info->fs_devices->latest_bdev; 3418 em->bdev = root->fs_info->fs_devices->latest_bdev;
2952 em->start = EXTENT_MAP_HOLE; 3419 em->start = EXTENT_MAP_HOLE;
2953 em->len = (u64)-1; 3420 em->len = (u64)-1;
3421 em->block_len = (u64)-1;
2954 3422
2955 if (!path) { 3423 if (!path) {
2956 path = btrfs_alloc_path(); 3424 path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
2983 3451
2984 found_type = btrfs_file_extent_type(leaf, item); 3452 found_type = btrfs_file_extent_type(leaf, item);
2985 extent_start = found_key.offset; 3453 extent_start = found_key.offset;
3454 compressed = btrfs_file_extent_compression(leaf, item);
2986 if (found_type == BTRFS_FILE_EXTENT_REG) { 3455 if (found_type == BTRFS_FILE_EXTENT_REG) {
2987 extent_end = extent_start + 3456 extent_end = extent_start +
2988 btrfs_file_extent_num_bytes(leaf, item); 3457 btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
3005 em->block_start = EXTENT_MAP_HOLE; 3474 em->block_start = EXTENT_MAP_HOLE;
3006 goto insert; 3475 goto insert;
3007 } 3476 }
3008 bytenr += btrfs_file_extent_offset(leaf, item);
3009 em->block_start = bytenr;
3010 em->start = extent_start; 3477 em->start = extent_start;
3011 em->len = extent_end - extent_start; 3478 em->len = extent_end - extent_start;
3479 if (compressed) {
3480 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3481 em->block_start = bytenr;
3482 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
3483 item);
3484 } else {
3485 bytenr += btrfs_file_extent_offset(leaf, item);
3486 em->block_start = bytenr;
3487 em->block_len = em->len;
3488 }
3012 goto insert; 3489 goto insert;
3013 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 3490 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3014 u64 page_start; 3491 u64 page_start;
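
For regular extents, btrfs_get_extent now fills block_start/block_len differently depending on compression, which changes what a file offset means on disk. A hedged sketch of the consequence:

/* sketch: turn a file offset inside 'em' into a disk byte */
static u64 map_offset_to_disk(struct extent_map *em, u64 file_offset)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		/* no per-offset disk byte exists: the whole blob at
		 * block_start .. block_start + block_len must be read
		 * and inflated first */
		return em->block_start;
	return em->block_start + (file_offset - em->start);
}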
@@ -3018,8 +3495,7 @@ again:
3018 size_t extent_offset; 3495 size_t extent_offset;
3019 size_t copy_size; 3496 size_t copy_size;
3020 3497
3021 size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, 3498 size = btrfs_file_extent_inline_len(leaf, item);
3022 path->slots[0]));
3023 extent_end = (extent_start + size + root->sectorsize - 1) & 3499 extent_end = (extent_start + size + root->sectorsize - 1) &
3024 ~((u64)root->sectorsize - 1); 3500 ~((u64)root->sectorsize - 1);
3025 if (start < extent_start || start >= extent_end) { 3501 if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
3035 } 3511 }
3036 em->block_start = EXTENT_MAP_INLINE; 3512 em->block_start = EXTENT_MAP_INLINE;
3037 3513
3038 if (!page) { 3514 if (!page || create) {
3039 em->start = extent_start; 3515 em->start = extent_start;
3040 em->len = size; 3516 em->len = (size + root->sectorsize - 1) &
3517 ~((u64)root->sectorsize - 1);
3041 goto out; 3518 goto out;
3042 } 3519 }
3043 3520
@@ -3048,11 +3525,22 @@ again:
3048 em->start = extent_start + extent_offset; 3525 em->start = extent_start + extent_offset;
3049 em->len = (copy_size + root->sectorsize - 1) & 3526 em->len = (copy_size + root->sectorsize - 1) &
3050 ~((u64)root->sectorsize - 1); 3527 ~((u64)root->sectorsize - 1);
3051 map = kmap(page); 3528 if (compressed)
3529 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3052 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 3530 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
3053 if (create == 0 && !PageUptodate(page)) { 3531 if (create == 0 && !PageUptodate(page)) {
3054 read_extent_buffer(leaf, map + pg_offset, ptr, 3532 if (btrfs_file_extent_compression(leaf, item) ==
3055 copy_size); 3533 BTRFS_COMPRESS_ZLIB) {
3534 ret = uncompress_inline(path, inode, page,
3535 pg_offset,
3536 extent_offset, item);
3537 BUG_ON(ret);
3538 } else {
3539 map = kmap(page);
3540 read_extent_buffer(leaf, map + pg_offset, ptr,
3541 copy_size);
3542 kunmap(page);
3543 }
3056 flush_dcache_page(page); 3544 flush_dcache_page(page);
3057 } else if (create && PageUptodate(page)) { 3545 } else if (create && PageUptodate(page)) {
3058 if (!trans) { 3546 if (!trans) {
@@ -3063,11 +3551,12 @@ again:
3063 trans = btrfs_join_transaction(root, 1); 3551 trans = btrfs_join_transaction(root, 1);
3064 goto again; 3552 goto again;
3065 } 3553 }
3554 map = kmap(page);
3066 write_extent_buffer(leaf, map + pg_offset, ptr, 3555 write_extent_buffer(leaf, map + pg_offset, ptr,
3067 copy_size); 3556 copy_size);
3557 kunmap(page);
3068 btrfs_mark_buffer_dirty(leaf); 3558 btrfs_mark_buffer_dirty(leaf);
3069 } 3559 }
3070 kunmap(page);
3071 set_extent_uptodate(io_tree, em->start, 3560 set_extent_uptodate(io_tree, em->start,
3072 extent_map_end(em) - 1, GFP_NOFS); 3561 extent_map_end(em) - 1, GFP_NOFS);
3073 goto insert; 3562 goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
3779 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 4268 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
3780 btrfs_set_file_extent_type(leaf, ei, 4269 btrfs_set_file_extent_type(leaf, ei,
3781 BTRFS_FILE_EXTENT_INLINE); 4270 BTRFS_FILE_EXTENT_INLINE);
4271 btrfs_set_file_extent_encryption(leaf, ei, 0);
4272 btrfs_set_file_extent_compression(leaf, ei, 0);
4273 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4274 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4275
3782 ptr = btrfs_file_extent_inline_start(ei); 4276 ptr = btrfs_file_extent_inline_start(ei);
3783 write_extent_buffer(leaf, symname, ptr, name_len); 4277 write_extent_buffer(leaf, symname, ptr, name_len);
3784 btrfs_mark_buffer_dirty(leaf); 4278 btrfs_mark_buffer_dirty(leaf);
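
The symlink path shows the boilerplate every inline extent now needs: the encoding fields are set explicitly and ram_bytes records the uncompressed length even when the payload is stored raw. A hedged sketch of reading those sizes back:

/* sketch: the two sizes of an inline extent after this commit */
static void inline_extent_sizes(struct extent_buffer *leaf,
				struct btrfs_path *path,
				struct btrfs_file_extent_item *fi,
				u32 *stored, u64 *uncompressed)
{
	/* bytes physically present in the leaf (compressed bytes when
	 * the compression field is set) */
	*stored = btrfs_file_extent_inline_item_len(leaf,
			btrfs_item_nr(leaf, path->slots[0]));
	/* logical length the extent represents */
	*uncompressed = btrfs_file_extent_ram_bytes(leaf, fi);
}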
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2eb6caba57c2..b5745bb96d40 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
165 * inserted. 165 * inserted.
166 */ 166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, int nocow) 168 u64 start, u64 len, u64 disk_len, int nocow,
169 int compressed)
169{ 170{
170 struct btrfs_ordered_inode_tree *tree; 171 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node; 172 struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
180 entry->file_offset = file_offset; 181 entry->file_offset = file_offset;
181 entry->start = start; 182 entry->start = start;
182 entry->len = len; 183 entry->len = len;
184 entry->disk_len = disk_len;
183 entry->inode = inode; 185 entry->inode = inode;
184 if (nocow) 186 if (nocow)
185 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags); 187 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
188 if (compressed)
189 set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
186 190
187 /* one ref for the tree */ 191 /* one ref for the tree */
188 atomic_set(&entry->refs, 1); 192 atomic_set(&entry->refs, 1);
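
A hedged caller sketch for the widened signature above (variable names are illustrative):

static int record_compressed_write(struct inode *inode, u64 file_offset,
				   u64 disk_start, u64 ram_len,
				   u64 compressed_len)
{
	return btrfs_add_ordered_extent(inode, file_offset, disk_start,
					ram_len,	/* uncompressed len */
					compressed_len,	/* disk_len */
					0,		/* nocow */
					1);		/* compressed */
}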
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
389 * for pdflush to find them 393 * for pdflush to find them
390 */ 394 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); 395 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
392 if (wait) 396 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 397 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags)); 398 &entry->flags));
399 }
395} 400}
396 401
397/* 402/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a144..1ef464145d22 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
66 66
67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
68 68
69#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
70
69struct btrfs_ordered_extent { 71struct btrfs_ordered_extent {
70 /* logical offset in the file */ 72 /* logical offset in the file */
71 u64 file_offset; 73 u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
73 /* disk byte number */ 75 /* disk byte number */
74 u64 start; 76 u64 start;
75 77
76 /* length of the extent in bytes */ 78 /* ram length of the extent in bytes */
77 u64 len; 79 u64 len;
78 80
81 /* extent length on disk */
82 u64 disk_len;
83
79 /* flags (described above) */ 84 /* flags (described above) */
80 unsigned long flags; 85 unsigned long flags;
81 86
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
127int btrfs_dec_test_ordered_pending(struct inode *inode, 132int btrfs_dec_test_ordered_pending(struct inode *inode,
128 u64 file_offset, u64 io_size); 133 u64 file_offset, u64 io_size);
129int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 134int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
130 u64 start, u64 len, int nocow); 135 u64 start, u64 len, u64 disk_len, int nocow,
136 int compressed);
131int btrfs_add_ordered_sum(struct inode *inode, 137int btrfs_add_ordered_sum(struct inode *inode,
132 struct btrfs_ordered_extent *entry, 138 struct btrfs_ordered_extent *entry,
133 struct btrfs_ordered_sum *sum); 139 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bd9ab3e9a7f2..64725c13aa11 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
115 if (btrfs_file_extent_type(l, fi) == 115 if (btrfs_file_extent_type(l, fi) ==
116 BTRFS_FILE_EXTENT_INLINE) { 116 BTRFS_FILE_EXTENT_INLINE) {
117 printk("\t\tinline extent data size %u\n", 117 printk("\t\tinline extent data size %u\n",
118 btrfs_file_extent_inline_len(l, item)); 118 btrfs_file_extent_inline_len(l, fi));
119 break; 119 break;
120 } 120 }
121 printk("\t\textent data disk bytenr %llu nr %llu\n", 121 printk("\t\textent data disk bytenr %llu nr %llu\n",
122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi), 122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi)); 123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
124 printk("\t\textent data offset %llu nr %llu\n", 124 printk("\t\textent data offset %llu nr %llu ram %llu\n",
125 (unsigned long long)btrfs_file_extent_offset(l, fi), 125 (unsigned long long)btrfs_file_extent_offset(l, fi),
126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi)); 126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
127 (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
127 break; 128 break;
128 case BTRFS_BLOCK_GROUP_ITEM_KEY: 129 case BTRFS_BLOCK_GROUP_ITEM_KEY:
129 bi = btrfs_item_ptr(l, i, 130 bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e6039825b7b..431fdf144b58 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
47#include "volumes.h" 47#include "volumes.h"
48#include "version.h" 48#include "version.h"
49#include "export.h" 49#include "export.h"
50#include "compression.h"
50 51
51#define BTRFS_SUPER_MAGIC 0x9123683E 52#define BTRFS_SUPER_MAGIC 0x9123683E
52 53
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
69enum { 70enum {
70 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 71 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
71 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 72 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
72 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err, 73 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
73}; 74};
74 75
75static match_table_t tokens = { 76static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
83 {Opt_max_inline, "max_inline=%s"}, 84 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"}, 85 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"}, 86 {Opt_thread_pool, "thread_pool=%d"},
87 {Opt_compress, "compress"},
86 {Opt_ssd, "ssd"}, 88 {Opt_ssd, "ssd"},
87 {Opt_noacl, "noacl"}, 89 {Opt_noacl, "noacl"},
88 {Opt_err, NULL}, 90 {Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
163 btrfs_set_opt(info->mount_opt, NODATACOW); 165 btrfs_set_opt(info->mount_opt, NODATACOW);
164 btrfs_set_opt(info->mount_opt, NODATASUM); 166 btrfs_set_opt(info->mount_opt, NODATASUM);
165 break; 167 break;
168 case Opt_compress:
169 printk(KERN_INFO "btrfs: use compression\n");
170 btrfs_set_opt(info->mount_opt, COMPRESS);
171 break;
166 case Opt_ssd: 172 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 173 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
168 btrfs_set_opt(info->mount_opt, SSD); 174 btrfs_set_opt(info->mount_opt, SSD);
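
The flag is only set here; the decision to actually compress is made in the delalloc path changed earlier in this commit. A hedged sketch of the kind of test involved, using the existing option/flag macros (the exact per-inode conditions in cow_file_range are assumptions here):

/* sketch: gate compression on the new mount option */
static int should_try_compress(struct btrfs_root *root, struct inode *inode)
{
	if (!btrfs_test_opt(root, COMPRESS))
		return 0;
	/* in-place (nocow) writes are never compressed */
	if (btrfs_test_flag(inode, NODATACOW))
		return 0;
	return 1;
}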
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
622 err = btrfs_interface_init(); 628 err = btrfs_interface_init();
623 if (err) 629 if (err)
624 goto free_extent_map; 630 goto free_extent_map;
631
625 err = register_filesystem(&btrfs_fs_type); 632 err = register_filesystem(&btrfs_fs_type);
626 if (err) 633 if (err)
627 goto unregister_ioctl; 634 goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
651 unregister_filesystem(&btrfs_fs_type); 658 unregister_filesystem(&btrfs_fs_type);
652 btrfs_exit_sysfs(); 659 btrfs_exit_sysfs();
653 btrfs_cleanup_fs_uuids(); 660 btrfs_cleanup_fs_uuids();
661 btrfs_zlib_exit();
654} 662}
655 663
656module_init(init_btrfs_fs) 664module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf618cc8b34a..e6d579053a47 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
540 if (found_type == BTRFS_FILE_EXTENT_REG) 540 if (found_type == BTRFS_FILE_EXTENT_REG)
541 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 541 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
542 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 542 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
543 size = btrfs_file_extent_inline_len(eb, 543 size = btrfs_file_extent_inline_len(eb, item);
544 btrfs_item_nr(eb, slot));
545 extent_end = (start + size + mask) & ~mask; 544 extent_end = (start + size + mask) & ~mask;
546 } else { 545 } else {
547 ret = 0; 546 ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51a..7db4cfd03a98 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
1816 em->start = key.offset; 1816 em->start = key.offset;
1817 em->len = *num_bytes; 1817 em->len = *num_bytes;
1818 em->block_start = 0; 1818 em->block_start = 0;
1819 em->block_len = em->len;
1819 1820
1820 if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 1821 if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1821 ret = btrfs_add_system_chunk(trans, chunk_root, &key, 1822 ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2323 em->start = logical; 2324 em->start = logical;
2324 em->len = length; 2325 em->len = length;
2325 em->block_start = 0; 2326 em->block_start = 0;
2327 em->block_len = em->len;
2326 2328
2327 map->num_stripes = num_stripes; 2329 map->num_stripes = num_stripes;
2328 map->io_width = btrfs_chunk_io_width(leaf, chunk); 2330 map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..e99309180a11
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33
34/* Plan: call deflate() with avail_in == *sourcelen,
35 avail_out = *dstlen - 12 and flush == Z_FINISH.
36 If it doesn't manage to finish, call it again with
37 avail_in == 0 and avail_out set to the remaining 12
38 bytes for it to clean up.
39 Q: Is 12 bytes sufficient?
40*/
41#define STREAM_END_SPACE 12
42
43struct workspace {
44 z_stream inf_strm;
45 z_stream def_strm;
46 char *buf;
47 struct list_head list;
48};
49
50static LIST_HEAD(idle_workspace);
51static DEFINE_SPINLOCK(workspace_lock);
52static unsigned long num_workspace;
53static atomic_t alloc_workspace = ATOMIC_INIT(0);
54static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
55
56/*
57 * this finds an available zlib workspace or allocates a new one
58 * an ERR_PTR is returned if things go bad.
59 */
60static struct workspace *find_zlib_workspace(void)
61{
62 struct workspace *workspace;
63 int ret;
64 int cpus = num_online_cpus();
65
66again:
67 spin_lock(&workspace_lock);
68 if (!list_empty(&idle_workspace)) {
69 workspace = list_entry(idle_workspace.next, struct workspace,
70 list);
71 list_del(&workspace->list);
72 num_workspace--;
73 spin_unlock(&workspace_lock);
74 return workspace;
75
76 }
77 spin_unlock(&workspace_lock);
78 if (atomic_read(&alloc_workspace) > cpus) {
79 DEFINE_WAIT(wait);
80 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
81 if (atomic_read(&alloc_workspace) > cpus)
82 schedule();
83 finish_wait(&workspace_wait, &wait);
84 goto again;
85 }
86 atomic_inc(&alloc_workspace);
87 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
88 if (!workspace) {
89 ret = -ENOMEM;
90 goto fail;
91 }
92
93 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
94 if (!workspace->def_strm.workspace) {
95 ret = -ENOMEM;
96 goto fail;
97 }
98 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
99 if (!workspace->inf_strm.workspace) {
100 ret = -ENOMEM;
101 goto fail_inflate;
102 }
103 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
104 if (!workspace->buf) {
105 ret = -ENOMEM;
106 goto fail_kmalloc;
107 }
108 return workspace;
109
110fail_kmalloc:
111 vfree(workspace->inf_strm.workspace);
112fail_inflate:
113 vfree(workspace->def_strm.workspace);
114fail:
115 kfree(workspace);
116 atomic_dec(&alloc_workspace);
117 wake_up(&workspace_wait);
118 return ERR_PTR(ret);
119}
120
121/*
122 * put a workspace struct back on the list or free it if we have enough
123 * idle ones sitting around
124 */
125static int free_workspace(struct workspace *workspace)
126{
127 spin_lock(&workspace_lock);
128 if (num_workspace < num_online_cpus()) {
129 list_add_tail(&workspace->list, &idle_workspace);
130 num_workspace++;
131 spin_unlock(&workspace_lock);
132 if (waitqueue_active(&workspace_wait))
133 wake_up(&workspace_wait);
134 return 0;
135 }
136 spin_unlock(&workspace_lock);
137 vfree(workspace->def_strm.workspace);
138 vfree(workspace->inf_strm.workspace);
139 kfree(workspace->buf);
140 kfree(workspace);
141
142 atomic_dec(&alloc_workspace);
143 if (waitqueue_active(&workspace_wait))
144 wake_up(&workspace_wait);
145 return 0;
146}
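
Every compress or decompress call brackets its work with this pair; a minimal usage sketch:

/* sketch: the expected workspace lifecycle inside zlib.c */
static int with_workspace_sketch(void)
{
	struct workspace *workspace;

	workspace = find_zlib_workspace();	/* may sleep for a slot */
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);

	/* ... run def_strm/inf_strm against workspace->buf ... */

	free_workspace(workspace);	/* recycled, up to one per CPU */
	return 0;
}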
147
148/*
149 * cleanup function for module exit
150 */
151static void free_workspaces(void)
152{
153 struct workspace *workspace;
154	while (!list_empty(&idle_workspace)) {
155 workspace = list_entry(idle_workspace.next, struct workspace,
156 list);
157 list_del(&workspace->list);
158 vfree(workspace->def_strm.workspace);
159 vfree(workspace->inf_strm.workspace);
160 kfree(workspace->buf);
161 kfree(workspace);
162 atomic_dec(&alloc_workspace);
163 }
164}
165
166/*
167 * given an address space and start/len, compress the bytes.
168 *
169 * pages are allocated to hold the compressed result and stored
170 * in 'pages'
171 *
172 * out_pages is used to return the number of pages allocated. There
173 * may be pages allocated even if we return an error
174 *
175 * total_in is used to return the number of bytes actually read. It
176 * may be smaller than len if we had to exit early because we
177 * ran out of room in the pages array or because we crossed the
178 * max_out threshold.
179 *
180 * total_out is used to return the total number of compressed bytes
181 *
182 * max_out tells us the max number of bytes that we're allowed to
183 * stuff into pages
184 */
185int btrfs_zlib_compress_pages(struct address_space *mapping,
186 u64 start, unsigned long len,
187 struct page **pages,
188 unsigned long nr_dest_pages,
189 unsigned long *out_pages,
190 unsigned long *total_in,
191 unsigned long *total_out,
192 unsigned long max_out)
193{
194 int ret;
195 struct workspace *workspace;
196 char *data_in;
197 char *cpage_out;
198 int nr_pages = 0;
199 struct page *in_page = NULL;
200 struct page *out_page = NULL;
201 int out_written = 0;
202 int in_read = 0;
203 unsigned long bytes_left;
204
205 *out_pages = 0;
206 *total_out = 0;
207 *total_in = 0;
208
209 workspace = find_zlib_workspace();
210	if (IS_ERR(workspace))
211 return -1;
212
213 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
214 printk(KERN_WARNING "deflateInit failed\n");
215 ret = -1;
216 goto out;
217 }
218
219 workspace->def_strm.total_in = 0;
220 workspace->def_strm.total_out = 0;
221
222 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
223 data_in = kmap(in_page);
224
225 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
226 cpage_out = kmap(out_page);
227 pages[0] = out_page;
228 nr_pages = 1;
229
230 workspace->def_strm.next_in = data_in;
231 workspace->def_strm.next_out = cpage_out;
232 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
233 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
234
235 out_written = 0;
236 in_read = 0;
237
238 while (workspace->def_strm.total_in < len) {
239 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
240 if (ret != Z_OK) {
241 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
242 ret);
243 zlib_deflateEnd(&workspace->def_strm);
244 ret = -1;
245 goto out;
246 }
247
248 /* we're making it bigger, give up */
249 if (workspace->def_strm.total_in > 8192 &&
250 workspace->def_strm.total_in <
251 workspace->def_strm.total_out) {
252 ret = -1;
253 goto out;
254 }
255 /* we need another page for writing out. Test this
256	 * before the total_in check so we will pull in a new page for
257 * the stream end if required
258 */
259 if (workspace->def_strm.avail_out == 0) {
260 kunmap(out_page);
261 if (nr_pages == nr_dest_pages) {
262 out_page = NULL;
263 ret = -1;
264 goto out;
265 }
266 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
267 cpage_out = kmap(out_page);
268 pages[nr_pages] = out_page;
269 nr_pages++;
270 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
271 workspace->def_strm.next_out = cpage_out;
272 }
273 /* we're all done */
274 if (workspace->def_strm.total_in >= len)
275 break;
276
277 /* we've read in a full page, get a new one */
278 if (workspace->def_strm.avail_in == 0) {
279 if (workspace->def_strm.total_out > max_out)
280 break;
281
282 bytes_left = len - workspace->def_strm.total_in;
283 kunmap(in_page);
284 page_cache_release(in_page);
285
286 start += PAGE_CACHE_SIZE;
287 in_page = find_get_page(mapping,
288 start >> PAGE_CACHE_SHIFT);
289 data_in = kmap(in_page);
290 workspace->def_strm.avail_in = min(bytes_left,
291 PAGE_CACHE_SIZE);
292 workspace->def_strm.next_in = data_in;
293 }
294 }
295 workspace->def_strm.avail_in = 0;
296 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
297 zlib_deflateEnd(&workspace->def_strm);
298
299 if (ret != Z_STREAM_END) {
300 ret = -1;
301 goto out;
302 }
303
304 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
305 ret = -1;
306 goto out;
307 }
308
309 ret = 0;
310 *total_out = workspace->def_strm.total_out;
311 *total_in = workspace->def_strm.total_in;
312out:
313 *out_pages = nr_pages;
314 if (out_page)
315 kunmap(out_page);
316
317 if (in_page) {
318 kunmap(in_page);
319 page_cache_release(in_page);
320 }
321 free_workspace(workspace);
322 return ret;
323}
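
A hedged caller sketch: any nonzero return (including the "output grew" bail-outs above) simply means the range should be written uncompressed, and the caller must still release the pages reported in *out_pages:

/* sketch: drive the compressor and fall back on failure */
static int compress_range_sketch(struct address_space *mapping,
				 u64 start, unsigned long len,
				 struct page **pages,
				 unsigned long nr_dest_pages)
{
	unsigned long nr_pages_out = 0;
	unsigned long total_in = 0;
	unsigned long total_out = 0;
	int ret;

	ret = btrfs_zlib_compress_pages(mapping, start, len, pages,
					nr_dest_pages, &nr_pages_out,
					&total_in, &total_out,
					len);	/* max_out: cap the output */
	if (ret)
		return ret;	/* write uncompressed; free the pages */
	/* pages[0 .. nr_pages_out) now hold total_out compressed bytes */
	return 0;
}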
324
325/*
326 * pages_in is an array of pages with compressed data.
327 *
328 * disk_start is the starting logical offset of this array in the file
329 *
330 * bvec is a bio_vec of pages from the file that we want to decompress into
331 *
332 * vcnt is the count of pages in the biovec
333 *
334 * srclen is the number of bytes in pages_in
335 *
336 * The basic idea is that we have a bio that was created by readpages.
337 * The pages in the bio are for the uncompressed data, and they may not
338 * be contiguous. They all correspond to the range of bytes covered by
339 * the compressed extent.
340 */
341int btrfs_zlib_decompress_biovec(struct page **pages_in,
342 u64 disk_start,
343 struct bio_vec *bvec,
344 int vcnt,
345 size_t srclen)
346{
347 int ret = 0;
348 int wbits = MAX_WBITS;
349 struct workspace *workspace;
350 char *data_in;
351 size_t total_out = 0;
352 unsigned long page_bytes_left;
353 unsigned long page_in_index = 0;
354 unsigned long page_out_index = 0;
355 struct page *page_out;
356 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
357 PAGE_CACHE_SIZE;
358 unsigned long buf_start;
359 unsigned long buf_offset;
360 unsigned long bytes;
361 unsigned long working_bytes;
362 unsigned long pg_offset;
363 unsigned long start_byte;
364 unsigned long current_buf_start;
365 char *kaddr;
366
367 workspace = find_zlib_workspace();
368	if (IS_ERR(workspace))
369 return -ENOMEM;
370
371 data_in = kmap(pages_in[page_in_index]);
372 workspace->inf_strm.next_in = data_in;
373 workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
374 workspace->inf_strm.total_in = 0;
375
376 workspace->inf_strm.total_out = 0;
377 workspace->inf_strm.next_out = workspace->buf;
378 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
379 page_out = bvec[page_out_index].bv_page;
380 page_bytes_left = PAGE_CACHE_SIZE;
381 pg_offset = 0;
382
383 /* If it's deflate, and it's got no preset dictionary, then
384 we can tell zlib to skip the adler32 check. */
385 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
386 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
387 !(((data_in[0]<<8) + data_in[1]) % 31)) {
388
389 wbits = -((data_in[0] >> 4) + 8);
390 workspace->inf_strm.next_in += 2;
391 workspace->inf_strm.avail_in -= 2;
392 }
393
394 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
395 printk(KERN_WARNING "inflateInit failed\n");
396 ret = -1;
397 goto out;
398 }
399	while (workspace->inf_strm.total_in < srclen) {
400 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
401 if (ret != Z_OK && ret != Z_STREAM_END) {
402 break;
403 }
404
405 /*
406	 * buf_start is the offset of the start of our workspace
407	 * buffer within the uncompressed data
408 */
409 buf_start = total_out;
410
411	/* total_out points one past the last byte in the workspace buffer */
412 total_out = workspace->inf_strm.total_out;
413
414 working_bytes = total_out - buf_start;
415
416 /*
417	 * start_byte is the offset of the page we're copying into,
418	 * relative to the start of the extent's uncompressed data.
419 */
420 start_byte = page_offset(page_out) - disk_start;
421
422 if (working_bytes == 0) {
423 /* we didn't make progress in this inflate
424 * call, we're done
425 */
426 if (ret != Z_STREAM_END)
427 ret = -1;
428 break;
429 }
430
431 /* we haven't yet hit data corresponding to this page */
432 if (total_out <= start_byte) {
433 goto next;
434 }
435
436 /*
437 * the start of the data we care about is offset into
438 * the middle of our working buffer
439 */
440 if (total_out > start_byte && buf_start < start_byte) {
441 buf_offset = start_byte - buf_start;
442 working_bytes -= buf_offset;
443 } else {
444 buf_offset = 0;
445 }
446 current_buf_start = buf_start;
447
448 /* copy bytes from the working buffer into the pages */
449		while (working_bytes > 0) {
450 bytes = min(PAGE_CACHE_SIZE - pg_offset,
451 PAGE_CACHE_SIZE - buf_offset);
452 bytes = min(bytes, working_bytes);
453 kaddr = kmap_atomic(page_out, KM_USER0);
454 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
455 bytes);
456 kunmap_atomic(kaddr, KM_USER0);
457 flush_dcache_page(page_out);
458
459 pg_offset += bytes;
460 page_bytes_left -= bytes;
461 buf_offset += bytes;
462 working_bytes -= bytes;
463 current_buf_start += bytes;
464
465 /* check if we need to pick another page */
466 if (page_bytes_left == 0) {
467 page_out_index++;
468 if (page_out_index >= vcnt) {
469 ret = 0;
470 goto done;
471 }
472 page_out = bvec[page_out_index].bv_page;
473 pg_offset = 0;
474 page_bytes_left = PAGE_CACHE_SIZE;
475 start_byte = page_offset(page_out) - disk_start;
476
477 /*
478 * make sure our new page is covered by this
479 * working buffer
480 */
481 if (total_out <= start_byte) {
482 goto next;
483 }
484
485 /* the next page in the biovec might not
486 * be adjacent to the last page, but it
487 * might still be found inside this working
488 * buffer. bump our offset pointer
489 */
490 if (total_out > start_byte &&
491 current_buf_start < start_byte) {
492 buf_offset = start_byte - buf_start;
493 working_bytes = total_out - start_byte;
494 current_buf_start = buf_start +
495 buf_offset;
496 }
497 }
498 }
499next:
500 workspace->inf_strm.next_out = workspace->buf;
501 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
502
503 if (workspace->inf_strm.avail_in == 0) {
504 unsigned long tmp;
505 kunmap(pages_in[page_in_index]);
506 page_in_index++;
507 if (page_in_index >= total_pages_in) {
508 data_in = NULL;
509 break;
510 }
511 data_in = kmap(pages_in[page_in_index]);
512 workspace->inf_strm.next_in = data_in;
513 tmp = srclen - workspace->inf_strm.total_in;
514 workspace->inf_strm.avail_in = min(tmp,
515 PAGE_CACHE_SIZE);
516 }
517 }
518 if (ret != Z_STREAM_END) {
519 ret = -1;
520 } else {
521 ret = 0;
522 }
523done:
524 zlib_inflateEnd(&workspace->inf_strm);
525 if (data_in)
526 kunmap(pages_in[page_in_index]);
527out:
528 free_workspace(workspace);
529 return ret;
530}
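
The offset bookkeeping in the loop above boils down to a window computation; a hedged restatement with a worked example: if an inflate call grew total_out from buf_start = 8192 to 12288 and the destination page sits at start_byte = 8192, then buf_offset is 0 and 4096 bytes can be copied.

/* sketch: how many workspace-buffer bytes belong to this page */
static unsigned long copy_window(unsigned long buf_start,
				 unsigned long total_out,
				 unsigned long start_byte,
				 unsigned long *buf_offset)
{
	/* inflate hasn't reached this page's data yet */
	if (total_out <= start_byte)
		return 0;
	/* the page may begin partway into the fresh bytes */
	*buf_offset = buf_start < start_byte ? start_byte - buf_start : 0;
	return (total_out - buf_start) - *buf_offset;
}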
531
532/*
533 * a less complex decompression routine. Our compressed data fits in a
534 * single page, and we want to read a single page out of it.
535 * start_byte tells us the offset into the compressed data we're interested in
536 */
537int btrfs_zlib_decompress(unsigned char *data_in,
538 struct page *dest_page,
539 unsigned long start_byte,
540 size_t srclen, size_t destlen)
541{
542 int ret = 0;
543 int wbits = MAX_WBITS;
544 struct workspace *workspace;
545 unsigned long bytes_left = destlen;
546 unsigned long total_out = 0;
547 char *kaddr;
548
549 if (destlen > PAGE_CACHE_SIZE)
550 return -ENOMEM;
551
552 workspace = find_zlib_workspace();
553	if (IS_ERR(workspace))
554 return -ENOMEM;
555
556 workspace->inf_strm.next_in = data_in;
557 workspace->inf_strm.avail_in = srclen;
558 workspace->inf_strm.total_in = 0;
559
560 workspace->inf_strm.next_out = workspace->buf;
561 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
562 workspace->inf_strm.total_out = 0;
563 /* If it's deflate, and it's got no preset dictionary, then
564 we can tell zlib to skip the adler32 check. */
565 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
566 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
567 !(((data_in[0]<<8) + data_in[1]) % 31)) {
568
569 wbits = -((data_in[0] >> 4) + 8);
570 workspace->inf_strm.next_in += 2;
571 workspace->inf_strm.avail_in -= 2;
572 }
573
574 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
575 printk(KERN_WARNING "inflateInit failed\n");
576 ret = -1;
577 goto out;
578 }
579
580	while (bytes_left > 0) {
581 unsigned long buf_start;
582 unsigned long buf_offset;
583 unsigned long bytes;
584 unsigned long pg_offset = 0;
585
586 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
587 if (ret != Z_OK && ret != Z_STREAM_END) {
588 break;
589 }
590
591 buf_start = total_out;
592 total_out = workspace->inf_strm.total_out;
593
594 if (total_out == buf_start) {
595 ret = -1;
596 break;
597 }
598
599 if (total_out <= start_byte) {
600 goto next;
601 }
602
603 if (total_out > start_byte && buf_start < start_byte) {
604 buf_offset = start_byte - buf_start;
605 } else {
606 buf_offset = 0;
607 }
608
609 bytes = min(PAGE_CACHE_SIZE - pg_offset,
610 PAGE_CACHE_SIZE - buf_offset);
611 bytes = min(bytes, bytes_left);
612
613 kaddr = kmap_atomic(dest_page, KM_USER0);
614 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
615 kunmap_atomic(kaddr, KM_USER0);
616
617 pg_offset += bytes;
618 bytes_left -= bytes;
619next:
620 workspace->inf_strm.next_out = workspace->buf;
621 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
622 }
623 if (ret != Z_STREAM_END && bytes_left != 0) {
624 ret = -1;
625 } else {
626 ret = 0;
627 }
628 zlib_inflateEnd(&workspace->inf_strm);
629out:
630 free_workspace(workspace);
631 return ret;
632}
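
Both inflate paths use the same header trick: a zlib stream opens with a CMF/FLG byte pair whose big-endian value is a multiple of 31, so when no preset dictionary is flagged the two bytes can be skipped and zlib run in raw-deflate mode via negative window bits, avoiding the adler32 check. A self-contained restatement (0x20 is PRESET_DICT, 8 is Z_DEFLATED):

/* sketch: window bits for zlib_inflateInit2 given a stream header */
static int raw_deflate_wbits(const unsigned char *d, size_t srclen)
{
	if (srclen > 2 &&
	    !(d[1] & 0x20) &&			/* no preset dictionary */
	    (d[0] & 0x0f) == 8 &&		/* method is deflate */
	    ((d[0] << 8) + d[1]) % 31 == 0)	/* valid zlib header */
		return -((d[0] >> 4) + 8);	/* e.g. 0x78 0x9c -> -15 */
	return 15;				/* MAX_WBITS: full zlib */
}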
633
634void btrfs_zlib_exit(void)
635{
636 free_workspaces();
637}