-rw-r--r--  Documentation/ABI/removed/o2cb (renamed from Documentation/ABI/obsolete/o2cb) |    9
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-cleancache |   11
-rw-r--r--  Documentation/feature-removal-schedule.txt |   10
-rw-r--r--  Documentation/filesystems/ext4.txt |    4
-rw-r--r--  Documentation/filesystems/ocfs2.txt |    8
-rw-r--r--  Documentation/filesystems/xfs.txt |    6
-rw-r--r--  Documentation/vm/cleancache.txt |  278
-rw-r--r--  MAINTAINERS |   13
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h |    7
-rw-r--r--  drivers/video/mb862xx/mb862xx-i2c.c |    1
-rw-r--r--  drivers/xen/Makefile |    1
-rw-r--r--  drivers/xen/tmem.c |  264
-rw-r--r--  fs/9p/vfs_inode.c |    4
-rw-r--r--  fs/Kconfig |   31
-rw-r--r--  fs/affs/namei.c |    5
-rw-r--r--  fs/afs/dir.c |    5
-rw-r--r--  fs/autofs4/root.c |    2
-rw-r--r--  fs/bfs/dir.c |    3
-rw-r--r--  fs/btrfs/extent_io.c |    9
-rw-r--r--  fs/btrfs/super.c |    2
-rw-r--r--  fs/buffer.c |   64
-rw-r--r--  fs/coda/dir.c |    5
-rw-r--r--  fs/configfs/dir.c |    2
-rw-r--r--  fs/ecryptfs/inode.c |    5
-rw-r--r--  fs/ext3/super.c |    2
-rw-r--r--  fs/ext4/Makefile |    3
-rw-r--r--  fs/ext4/balloc.c |  146
-rw-r--r--  fs/ext4/ext4.h |  127
-rw-r--r--  fs/ext4/ext4_jbd2.c |   14
-rw-r--r--  fs/ext4/ext4_jbd2.h |    5
-rw-r--r--  fs/ext4/extents.c | 1410
-rw-r--r--  fs/ext4/file.c |    1
-rw-r--r--  fs/ext4/fsync.c |   25
-rw-r--r--  fs/ext4/inode.c |  114
-rw-r--r--  fs/ext4/mballoc.c |  459
-rw-r--r--  fs/ext4/mballoc.h |    6
-rw-r--r--  fs/ext4/migrate.c |    2
-rw-r--r--  fs/ext4/mmp.c |  351
-rw-r--r--  fs/ext4/move_extent.c |    3
-rw-r--r--  fs/ext4/namei.c |   82
-rw-r--r--  fs/ext4/page-io.c |   39
-rw-r--r--  fs/ext4/super.c |  206
-rw-r--r--  fs/ext4/xattr.c |    4
-rw-r--r--  fs/fat/namei_msdos.c |    5
-rw-r--r--  fs/fat/namei_vfat.c |    5
-rw-r--r--  fs/fuse/dir.c |    6
-rw-r--r--  fs/hfs/dir.c |    6
-rw-r--r--  fs/hfsplus/dir.c |    8
-rw-r--r--  fs/hostfs/hostfs_kern.c |    5
-rw-r--r--  fs/hpfs/namei.c |    9
-rw-r--r--  fs/hugetlbfs/inode.c |    3
-rw-r--r--  fs/jbd2/commit.c |   22
-rw-r--r--  fs/jbd2/journal.c |   58
-rw-r--r--  fs/jbd2/transaction.c |   22
-rw-r--r--  fs/jffs2/dir.c |    5
-rw-r--r--  fs/jfs/namei.c |    5
-rw-r--r--  fs/logfs/dir.c |    5
-rw-r--r--  fs/minix/namei.c |    5
-rw-r--r--  fs/mpage.c |    7
-rw-r--r--  fs/namei.c |  380
-rw-r--r--  fs/namespace.c |    2
-rw-r--r--  fs/ncpfs/dir.c |    5
-rw-r--r--  fs/nilfs2/namei.c |    5
-rw-r--r--  fs/ocfs2/Makefile |    1
-rw-r--r--  fs/ocfs2/alloc.c |  166
-rw-r--r--  fs/ocfs2/alloc.h |    1
-rw-r--r--  fs/ocfs2/cluster/sys.c |    9
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h |   14
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c |    6
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c |   94
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c |  255
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c |    1
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c |    2
-rw-r--r--  fs/ocfs2/file.c |    1
-rw-r--r--  fs/ocfs2/ioctl.c |  492
-rw-r--r--  fs/ocfs2/move_extents.c | 1153
-rw-r--r--  fs/ocfs2/move_extents.h |   22
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h |   68
-rw-r--r--  fs/ocfs2/ocfs2_trace.h |   25
-rw-r--r--  fs/ocfs2/refcounttree.c |   58
-rw-r--r--  fs/ocfs2/refcounttree.h |   11
-rw-r--r--  fs/ocfs2/super.c |    4
-rw-r--r--  fs/omfs/dir.c |   11
-rw-r--r--  fs/proc/Makefile |    1
-rw-r--r--  fs/proc/base.c |   20
-rw-r--r--  fs/proc/inode.c |    7
-rw-r--r--  fs/proc/internal.h |   18
-rw-r--r--  fs/proc/namespaces.c |  198
-rw-r--r--  fs/proc/task_mmu.c |    2
-rw-r--r--  fs/reiserfs/namei.c |    5
-rw-r--r--  fs/reiserfs/xattr.c |    1
-rw-r--r--  fs/super.c |    3
-rw-r--r--  fs/sysv/namei.c |    5
-rw-r--r--  fs/ubifs/dir.c |    5
-rw-r--r--  fs/udf/namei.c |    5
-rw-r--r--  fs/ufs/namei.c |    5
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c |   29
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h |    2
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c |   18
-rw-r--r--  fs/xfs/xfs_ag.h |    3
-rw-r--r--  fs/xfs/xfs_alloc.c |   35
-rw-r--r--  fs/xfs/xfs_alloc.h |    5
-rw-r--r--  fs/xfs/xfs_alloc_btree.c |    3
-rw-r--r--  fs/xfs/xfs_bmap.c |  549
-rw-r--r--  fs/xfs/xfs_bmap.h |    2
-rw-r--r--  fs/xfs/xfs_inode.c |   15
-rw-r--r--  fs/xfs/xfs_inode.h |    1
-rw-r--r--  fs/xfs/xfs_log_cil.c |   13
-rw-r--r--  fs/xfs/xfs_mount.h |    1
-rw-r--r--  fs/xfs/xfs_trans.c |    2
-rw-r--r--  include/linux/buffer_head.h |   16
-rw-r--r--  include/linux/cleancache.h |  122
-rw-r--r--  include/linux/fs.h |    5
-rw-r--r--  include/linux/hugetlb.h |    7
-rw-r--r--  include/linux/hugetlb_inline.h |    2
-rw-r--r--  include/linux/if_link.h |    1
-rw-r--r--  include/linux/jbd2.h |    8
-rw-r--r--  include/linux/mm.h |    6
-rw-r--r--  include/linux/mm_types.h |    4
-rw-r--r--  include/linux/proc_fs.h |   21
-rw-r--r--  include/linux/syscalls.h |    1
-rw-r--r--  include/net/net_namespace.h |    1
-rw-r--r--  include/xen/interface/xen.h |   22
-rw-r--r--  ipc/namespace.c |   37
-rw-r--r--  ipc/shm.c |    2
-rw-r--r--  kernel/nsproxy.c |   42
-rw-r--r--  kernel/utsname.c |   39
-rw-r--r--  mm/Kconfig |   23
-rw-r--r--  mm/Makefile |    1
-rw-r--r--  mm/cleancache.c |  244
-rw-r--r--  mm/filemap.c |   11
-rw-r--r--  mm/fremap.c |    2
-rw-r--r--  mm/hugetlb.c |    4
-rw-r--r--  mm/memory.c |    2
-rw-r--r--  mm/mlock.c |    8
-rw-r--r--  mm/mmap.c |    8
-rw-r--r--  mm/slub.c |    1
-rw-r--r--  mm/truncate.c |    6
-rw-r--r--  net/core/net_namespace.c |   65
-rw-r--r--  net/core/rtnetlink.c |    5
 140 files changed, 6377 insertions(+), 2002 deletions(-)
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/removed/o2cb
index 9c49d8e6c0cc..7f5daa465093 100644
--- a/Documentation/ABI/obsolete/o2cb
+++ b/Documentation/ABI/removed/o2cb
@@ -1,11 +1,10 @@
 What:		/sys/o2cb symlink
-Date:		Dec 2005
-KernelVersion:	2.6.16
+Date:		May 2011
+KernelVersion:	2.6.40
 Contact:	ocfs2-devel@oss.oracle.com
-Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
-		be removed when new versions of ocfs2-tools which know to look
+Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
+		removed when new versions of ocfs2-tools which know to look
 		in /sys/fs/o2cb are sufficiently prevalent. Don't code new
 		software to look here, it should try /sys/fs/o2cb instead.
-		See Documentation/ABI/stable/o2cb for more information on usage.
 Users:		ocfs2-tools. It's sufficient to mail proposed changes to
 		ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
new file mode 100644
index 000000000000..662ae646ea12
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
@@ -0,0 +1,11 @@
+What:		/sys/kernel/mm/cleancache/
+Date:		April 2011
+Contact:	Dan Magenheimer <dan.magenheimer@oracle.com>
+Description:
+		/sys/kernel/mm/cleancache/ contains a number of files which
+		record a count of various cleancache operations
+		(sum across all filesystems):
+			succ_gets
+			failed_gets
+			puts
+			flushes
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 95788ad2506c..ff31b1cc50aa 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -262,16 +262,6 @@ Who: Michael Buesch <mb@bu3sch.de>
 
 ---------------------------
 
-What:	/sys/o2cb symlink
-When:	January 2010
-Why:	/sys/fs/o2cb is the proper location for this information - /sys/o2cb
-	exists as a symlink for backwards compatibility for old versions of
-	ocfs2-tools. 2 years should be sufficient time to phase in new versions
-	which know to look in /sys/fs/o2cb.
-Who:	ocfs2-devel@oss.oracle.com
-
----------------------------
-
 What:	Ability for non root users to shm_get hugetlb pages based on mlock
 	resource limits
 When:	2.6.31
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index c79ec58fd7f6..3ae9bc94352a 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -226,10 +226,6 @@ acl Enables POSIX Access Control Lists support.
 noacl			This option disables POSIX Access Control List
 			support.
 
-reservation
-
-noreservation
-
 bsddf		(*)	Make 'df' act like BSD.
 minixdf			Make 'df' act like Minix.
 
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index 9ed920a8cd79..7618a287aa41 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -46,9 +46,15 @@ errors=panic Panic and halt the machine if an error occurs.
 intr		(*)	Allow signals to interrupt cluster operations.
 nointr			Do not allow signals to interrupt cluster
 			operations.
+noatime			Do not update access time.
+relatime(*)		Update atime if the previous atime is older than
+			mtime or ctime.
+strictatime		Always update atime, but the minimum update interval
+			is specified by atime_quantum.
 atime_quantum=60(*)	OCFS2 will not update atime unless this number
 			of seconds has passed since the last update.
-			Set to zero to always update atime.
+			Set to zero to always update atime. This option only
+			works in combination with strictatime.
 data=ordered	(*)	All data are forced directly out to the main file
 			system prior to its metadata being committed to the
 			journal.
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 7bff3e4f35df..3fc0c31a6f5d 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -39,6 +39,12 @@ When mounting an XFS filesystem, the following options are accepted.
 	drive level write caching to be enabled, for devices that
 	support write barriers.
 
+  discard
+	Issue command to let the block device reclaim space freed by the
+	filesystem.  This is useful for SSD devices, thinly provisioned
+	LUNs and virtual machine images, but may have a performance
+	impact.  This option is incompatible with the nodelaylog option.
+
   dmapi
 	Enable the DMAPI (Data Management API) event callouts.
 	Use with the "mtpt" option.
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt
new file mode 100644
index 000000000000..36c367c73084
--- /dev/null
+++ b/Documentation/vm/cleancache.txt
@@ -0,0 +1,278 @@
+MOTIVATION
+
+Cleancache is a new optional feature provided by the VFS layer that
+potentially dramatically increases page cache effectiveness for
+many workloads in many environments at a negligible cost.
+
+Cleancache can be thought of as a page-granularity victim cache for clean
+pages that the kernel's pageframe replacement algorithm (PFRA) would like
+to keep around, but can't since there isn't enough memory.  So when the
+PFRA "evicts" a page, it first attempts to use cleancache code to
+put the data contained in that page into "transcendent memory", memory
+that is not directly accessible or addressable by the kernel and is
+of unknown and possibly time-varying size.
+
+Later, when a cleancache-enabled filesystem wishes to access a page
+in a file on disk, it first checks cleancache to see if it already
+contains it; if it does, the page of data is copied into the kernel
+and a disk access is avoided.
+
+Transcendent memory "drivers" for cleancache are currently implemented
+in Xen (using hypervisor memory) and zcache (using in-kernel compressed
+memory) and other implementations are in development.
+
+FAQs are included below.
+
+IMPLEMENTATION OVERVIEW
+
+A cleancache "backend" that provides transcendent memory registers itself
+to the kernel's cleancache "frontend" by calling cleancache_register_ops,
+passing a pointer to a cleancache_ops structure with funcs set appropriately.
+Note that cleancache_register_ops returns the previous settings so that
+chaining can be performed if desired.  The functions provided must conform to
+certain semantics as follows:
+
+Most important, cleancache is "ephemeral".  Pages which are copied into
+cleancache have an indefinite lifetime which is completely unknowable
+by the kernel and so may or may not still be in cleancache at any later time.
+Thus, as its name implies, cleancache is not suitable for dirty pages.
+Cleancache has complete discretion over what pages to preserve and what
+pages to discard and when.
+
+Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a
+pool id which, if positive, must be saved in the filesystem's superblock;
+a negative return value indicates failure.  A "put_page" will copy a
+(presumably about-to-be-evicted) page into cleancache and associate it with
+the pool id, a file key, and a page index into the file.  (The combination
+of a pool id, a file key, and an index is sometimes called a "handle".)
+A "get_page" will copy the page, if found, from cleancache into kernel memory.
+A "flush_page" will ensure the page no longer is present in cleancache;
+a "flush_inode" will flush all pages associated with the specified file;
+and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
+all files specified by the given pool id and also surrender the pool id.
+
+An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
+to treat the pool as shared using a 128-bit UUID as a key.  On systems
+that may run multiple kernels (such as hard partitioned or virtualized
+systems) that may share a clustered filesystem, and where cleancache
+may be shared among those kernels, calls to init_shared_fs that specify the
+same UUID will receive the same pool id, thus allowing the pages to
+be shared.  Note that any security requirements must be imposed outside
+of the kernel (e.g. by "tools" that control cleancache).  Or a
+cleancache implementation can simply disable shared_init by always
+returning a negative value.
+
+If a get_page is successful on a non-shared pool, the page is flushed (thus
+making cleancache an "exclusive" cache).  On a shared pool, the page
+is NOT flushed on a successful get_page so that it remains accessible to
+other sharers.  The kernel is responsible for ensuring coherency between
+cleancache (shared or not), the page cache, and the filesystem, using
+cleancache flush operations as required.
+
+Note that cleancache must enforce put-put-get coherency and get-get
+coherency.  For the former, if two puts are made to the same handle but
+with different data, say AAA by the first put and BBB by the second, a
+subsequent get can never return the stale data (AAA).  For get-get coherency,
+if a get for a given handle fails, subsequent gets for that handle will
+never succeed unless preceded by a successful put with that handle.
+
+Last, cleancache provides no SMP serialization guarantees; if two
+different Linux threads are simultaneously putting and flushing a page
+with the same handle, the results are indeterminate.  Callers must
+lock the page to ensure serial behavior.
+
+CLEANCACHE PERFORMANCE METRICS
+
+Cleancache monitoring is done by sysfs files in the
+/sys/kernel/mm/cleancache directory.  The effectiveness of cleancache
+can be measured (across all filesystems) with:
+
+succ_gets	- number of gets that were successful
+failed_gets	- number of gets that failed
+puts		- number of puts attempted (all "succeed")
+flushes		- number of flushes attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+
+1) Where's the value? (Andrew Morton)
+
+Cleancache provides a significant performance benefit to many workloads
+in many environments with negligible overhead by improving the
+effectiveness of the pagecache.  Clean pagecache pages are
+saved in transcendent memory (RAM that is otherwise not directly
+addressable to the kernel); fetching those pages later avoids "refaults"
+and thus disk reads.
+
+Cleancache (and its sister code "frontswap") provide interfaces for
+this transcendent memory (aka "tmem"), which conceptually lies between
+fast kernel-directly-addressable RAM and slower DMA/asynchronous devices.
+Disallowing direct kernel or userland reads/writes to tmem
+is ideal when data is transformed to a different form and size (such
+as with compression) or secretly moved (as might be useful for write-
+balancing for some RAM-like devices).  Evicted page-cache pages (and
+swap pages) are a great use for this kind of slower-than-RAM-but-much-
+faster-than-disk transcendent memory, and the cleancache (and frontswap)
+"page-object-oriented" specification provides a nice way to read and
+write -- and indirectly "name" -- the pages.
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines.  This is really hard to do with RAM and efforts to
+do it well with no kernel change have essentially failed (except in some
+well-publicized special-case workloads).  Cleancache -- and frontswap --
+with a fairly small impact on the kernel, provide a huge amount
+of flexibility for more dynamic, flexible RAM multiplexing.
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization.  And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "self-ballooning"), page cache pages
+are the first to go, and cleancache allows those pages to be
+saved and reclaimed if overall host system memory conditions allow.
+
+And the identical interface used for cleancache can be used in
+physical systems as well.  The zcache driver acts as a memory-hungry
+device that stores pages of data in a compressed state.  And
+the proposed "RAMster" driver shares RAM across multiple physical
+systems.
+
+2) Why does cleancache have its sticky fingers so deep inside the
+   filesystems and VFS? (Andrew Morton and Christoph Hellwig)
+
+The core hooks for cleancache in VFS are in most cases a single line
+and the minimum set are placed precisely where needed to maintain
+coherency (via cleancache_flush operations) between cleancache,
+the page cache, and disk.  All hooks compile into nothingness if
+cleancache is config'ed off and turn into a function-pointer-
+compare-to-NULL if config'ed on but no backend claims the ops
+functions, or to a compare-struct-element-to-negative if a
+backend claims the ops functions but a filesystem doesn't enable
+cleancache.
+
+Some filesystems are built entirely on top of VFS and the hooks
+in VFS are sufficient, so don't require an "init_fs" hook; the
+initial implementation of cleancache didn't provide this hook.
+But for some filesystems (such as btrfs), the VFS hooks are
+incomplete and one or more hooks in fs-specific code are required.
+And for some other filesystems, such as tmpfs, cleancache may
+be counterproductive.  So it seemed prudent to require a filesystem
+to "opt in" to use cleancache, which requires adding a hook in
+each filesystem.  Not all filesystems are supported by cleancache,
+but only because they haven't been tested.  The existing set should
+be sufficient to validate the concept, the opt-in approach means
+that untested filesystems are not affected, and the hooks in the
+existing filesystems should make it very easy to add more
+filesystems in the future.
+
+The total impact of the hooks to existing fs and mm files is only
+about 40 lines added (not counting comments and blank lines).
+
+3) Why not make cleancache asynchronous and batched so it can
+   more easily interface with real devices with DMA instead
+   of copying each individual page? (Minchan Kim)
+
+The one-page-at-a-time copy semantics simplifies the implementation
+on both the frontend and backend and also allows the backend to
+do fancy things on-the-fly like page compression and
+page deduplication.  And since the data is "gone" (copied into/out
+of the pageframe) before the cleancache get/put call returns,
+a great deal of race conditions and potential coherency issues
+are avoided.  While the interface seems odd for a "real device"
+or for real kernel-addressable RAM, it makes perfect sense for
+transcendent memory.
+
+4) Why is non-shared cleancache "exclusive"?  And where is the
+   page "flushed" after a "get"? (Minchan Kim)
+
+The main reason is to free up space in transcendent memory and
+to avoid unnecessary cleancache_flush calls.  If you want inclusive,
+the page can be "put" immediately following the "get".  If
+put-after-get for inclusive becomes common, the interface could
+be easily extended to add a "get_no_flush" call.
+
+The flush is done by the cleancache backend implementation.
+
+5) What's the performance impact?
+
+Performance analysis has been presented at OLS'09 and LCA'10.
+Briefly, performance gains can be significant on most workloads,
+especially when memory pressure is high (e.g. when RAM is
+overcommitted in a virtual workload); and because the hooks are
+invoked primarily in place of or in addition to a disk read/write,
+overhead is negligible even in worst case workloads.  Basically
+cleancache replaces I/O with memory-copy-CPU-overhead; on older
+single-core systems with slow memory-copy speeds, cleancache
+has little value, but in newer multicore machines, especially
+consolidated/virtualized machines, it has great value.
+
+6) How do I add cleancache support for filesystem X? (Boaz Harrash)
+
+Filesystems that are well-behaved and conform to certain
+restrictions can utilize cleancache simply by making a call to
+cleancache_init_fs at mount time.  Unusual, misbehaving, or
+poorly layered filesystems must either add additional hooks
+and/or undergo extensive additional testing... or should just
+not enable the optional cleancache.
+
+Some points for a filesystem to consider:
+
+- The FS should be block-device-based (e.g. a ram-based FS such
+  as tmpfs should not enable cleancache)
+- To ensure coherency/correctness, the FS must ensure that all
+  file removal or truncation operations either go through VFS or
+  add hooks to do the equivalent cleancache "flush" operations
+- To ensure coherency/correctness, either inode numbers must
+  be unique across the lifetime of the on-disk file OR the
+  FS must provide an "encode_fh" function.
+- The FS must call the VFS superblock alloc and deactivate routines
+  or add hooks to do the equivalent cleancache calls done there.
+- To maximize performance, all pages fetched from the FS should
+  go through the do_mpage_readpage routine or the FS should add
+  hooks to do the equivalent (cf. btrfs)
+- Currently, the FS blocksize must be the same as PAGESIZE.  This
+  is not an architectural restriction, but no backends currently
+  support anything different.
+- A clustered FS should invoke the "shared_init_fs" cleancache
+  hook to get best performance for some backends.
+
+7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
+
+If cleancache would use the inode virtual address instead of
+inode/filehandle, the pool id could be eliminated.  But, this
+won't work because cleancache retains pagecache data pages
+persistently even when the inode has been pruned from the
+inode unused list, and only flushes the data page if the file
+gets removed/truncated.  So if cleancache used the inode kva,
+there would be potential coherency issues if/when the inode
+kva is reused for a different file.  Alternately, if cleancache
+flushed the pages when the inode kva was freed, much of the value
+of cleancache would be lost because the cache of pages in cleancache
+is potentially much larger than the kernel pagecache and is most
+useful if the pages survive inode cache removal.
+
+8) Why is a global variable required?
+
+The cleancache_enabled flag is checked in all of the frequently-used
+cleancache hooks.  The alternative is a function call to check a static
+variable.  Since cleancache is enabled dynamically at runtime, systems
+that don't enable cleancache would suffer thousands (possibly
+tens-of-thousands) of unnecessary function calls per second.  So the
+global variable allows cleancache to be enabled by default at compile
+time, but have insignificant performance impact when cleancache remains
+disabled at runtime.
+
+9) Does cleancache work with KVM?
+
+The memory model of KVM is sufficiently different that a cleancache
+backend may have less value for KVM.  This remains to be tested,
+especially in an overcommitted system.
+
+10) Does cleancache work in userspace?  It sounds useful for
+    memory hungry caches like web browsers. (Jamie Lokier)
+
+No plans yet, though we agree it sounds useful, at least for
+apps that bypass the page cache (e.g. O_DIRECT).
+
+Last updated: Dan Magenheimer, April 13, 2011
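
To make the registration flow described above concrete, here is a minimal sketch of a backend, using only the cleancache_ops entry points named in this document (and filled in for real by drivers/xen/tmem.c later in this diff). It is not part of the patch, and the noop_* names are invented for illustration. A backend that refuses every pool, by returning a negative pool id from init_fs, turns all of the per-page hooks into no-ops, which is the documented way for a backend to disable itself:

/* Hypothetical no-op cleancache backend -- a sketch, not part of this patch. */
#include <linux/cleancache.h>
#include <linux/module.h>

static int noop_init_fs(size_t pagesize)
{
	return -1;	/* refuse the pool: every later hook becomes a no-op */
}

static int noop_init_shared_fs(char *uuid, size_t pagesize)
{
	return -1;	/* likewise refuse shared pools */
}

static int noop_get_page(int pool, struct cleancache_filekey key,
			 pgoff_t index, struct page *page)
{
	return -1;	/* nothing is ever cached, so every get fails */
}

static void noop_put_page(int pool, struct cleancache_filekey key,
			  pgoff_t index, struct page *page) { }
static void noop_flush_page(int pool, struct cleancache_filekey key,
			    pgoff_t index) { }
static void noop_flush_inode(int pool, struct cleancache_filekey key) { }
static void noop_flush_fs(int pool) { }

static struct cleancache_ops noop_cleancache_ops = {
	.init_fs	= noop_init_fs,
	.init_shared_fs	= noop_init_shared_fs,
	.get_page	= noop_get_page,
	.put_page	= noop_put_page,
	.flush_page	= noop_flush_page,
	.flush_inode	= noop_flush_inode,
	.flush_fs	= noop_flush_fs,
};

static int __init noop_cleancache_init(void)
{
	/* the previous ops are returned so backends can chain if desired */
	struct cleancache_ops old = cleancache_register_ops(&noop_cleancache_ops);

	if (old.init_fs != NULL)
		printk(KERN_WARNING "cleancache: replacing existing backend\n");
	return 0;
}
module_init(noop_cleancache_init);

Registering such a backend is harmless precisely because of the compare-struct-element-to-negative behavior described in FAQ #2 above.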
diff --git a/MAINTAINERS b/MAINTAINERS
index 1ab17de642e5..d54d551004f7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3572,9 +3572,16 @@ M: Andrew Morton <akpm@linux-foundation.org>
 M:	Jan Kara <jack@suse.cz>
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
-F:	fs/jbd*/
-F:	include/linux/ext*jbd*.h
-F:	include/linux/jbd*.h
+F:	fs/jbd/
+F:	include/linux/ext3_jbd.h
+F:	include/linux/jbd.h
+
+JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
+M:	"Theodore Ts'o" <tytso@mit.edu>
+L:	linux-ext4@vger.kernel.org
+S:	Maintained
+F:	fs/jbd2/
+F:	include/linux/jbd2.h
 
 JSM Neo PCI based serial card
 M:	Breno Leitao <leitao@linux.vnet.ibm.com>
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 8508bfe52296..d240ea950519 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
 	return _hypercall2(unsigned long, hvm_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_tmem_op(
+	struct tmem_op *op)
+{
+	return _hypercall1(int, tmem_op, op);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
diff --git a/drivers/video/mb862xx/mb862xx-i2c.c b/drivers/video/mb862xx/mb862xx-i2c.c
index cb77d3b4657d..b953099edd8e 100644
--- a/drivers/video/mb862xx/mb862xx-i2c.c
+++ b/drivers/video/mb862xx/mb862xx-i2c.c
@@ -12,6 +12,7 @@
 #include <linux/fb.h>
 #include <linux/i2c.h>
 #include <linux/io.h>
+#include <linux/delay.h>
 
 #include "mb862xxfb.h"
 #include "mb862xx_reg.h"
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 4781f806701d..bbc18258ecc5 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,5 +1,6 @@
 obj-y	+= grant-table.o features.o events.o manage.o balloon.o
 obj-y	+= xenbus/
+obj-y	+= tmem.o
 
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o := $(nostackp)
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
new file mode 100644
index 000000000000..816a44959ef0
--- /dev/null
+++ b/drivers/xen/tmem.c
@@ -0,0 +1,264 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/cleancache.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+#include <asm/xen/hypervisor.h>
+
+#define TMEM_CONTROL               0
+#define TMEM_NEW_POOL              1
+#define TMEM_DESTROY_POOL          2
+#define TMEM_NEW_PAGE              3
+#define TMEM_PUT_PAGE              4
+#define TMEM_GET_PAGE              5
+#define TMEM_FLUSH_PAGE            6
+#define TMEM_FLUSH_OBJECT          7
+#define TMEM_READ                  8
+#define TMEM_WRITE                 9
+#define TMEM_XCHG                 10
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+#define TMEM_POOL_PAGESIZE_SHIFT   4
+#define TMEM_VERSION_SHIFT        24
+
+
+struct tmem_pool_uuid {
+	u64 uuid_lo;
+	u64 uuid_hi;
+};
+
+struct tmem_oid {
+	u64 oid[3];
+};
+
+#define TMEM_POOL_PRIVATE_UUID	{ 0, 0 }
+
+/* flags for tmem_ops.new_pool */
+#define TMEM_POOL_PERSIST	1
+#define TMEM_POOL_SHARED	2
+
+/* xen tmem foundation ops/hypercalls */
+
+static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid,
+	u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+	struct tmem_op op;
+	int rc = 0;
+
+	op.cmd = tmem_cmd;
+	op.pool_id = tmem_pool;
+	op.u.gen.oid[0] = oid.oid[0];
+	op.u.gen.oid[1] = oid.oid[1];
+	op.u.gen.oid[2] = oid.oid[2];
+	op.u.gen.index = index;
+	op.u.gen.tmem_offset = tmem_offset;
+	op.u.gen.pfn_offset = pfn_offset;
+	op.u.gen.len = len;
+	set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn);
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
+				u32 flags, unsigned long pagesize)
+{
+	struct tmem_op op;
+	int rc = 0, pageshift;
+
+	for (pageshift = 0; pagesize != 1; pageshift++)
+		pagesize >>= 1;
+	flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT;
+	flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT;
+	op.cmd = TMEM_NEW_POOL;
+	op.u.new.uuid[0] = uuid.uuid_lo;
+	op.u.new.uuid[1] = uuid.uuid_hi;
+	op.u.new.flags = flags;
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+/* xen generic tmem ops */
+
+static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, unsigned long pfn)
+{
+	unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index,
+		gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, unsigned long pfn)
+{
+	unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+	return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index,
+		gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
+{
+	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index,
+		0, 0, 0, 0);
+}
+
+static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
+{
+	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+static int xen_tmem_destroy_pool(u32 pool_id)
+{
+	struct tmem_oid oid = { { 0 } };
+
+	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+int tmem_enabled;
+
+static int __init enable_tmem(char *s)
+{
+	tmem_enabled = 1;
+	return 1;
+}
+
+__setup("tmem", enable_tmem);
+
+/* cleancache ops */
+
+static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
+				     pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	unsigned long pfn = page_to_pfn(page);
+
+	if (pool < 0)
+		return;
+	if (ind != index)
+		return;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	(void)xen_tmem_put_page((u32)pool, oid, ind, pfn);
+}
+
+static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
+				    pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	unsigned long pfn = page_to_pfn(page);
+	int ret;
+
+	/* translate return values to linux semantics */
+	if (pool < 0)
+		return -1;
+	if (ind != index)
+		return -1;
+	ret = xen_tmem_get_page((u32)pool, oid, ind, pfn);
+	if (ret == 1)
+		return 0;
+	else
+		return -1;
+}
+
+static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key,
+				       pgoff_t index)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (pool < 0)
+		return;
+	if (ind != index)
+		return;
+	(void)xen_tmem_flush_page((u32)pool, oid, ind);
+}
+
+static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (pool < 0)
+		return;
+	(void)xen_tmem_flush_object((u32)pool, oid);
+}
+
+static void tmem_cleancache_flush_fs(int pool)
+{
+	if (pool < 0)
+		return;
+	(void)xen_tmem_destroy_pool((u32)pool);
+}
+
+static int tmem_cleancache_init_fs(size_t pagesize)
+{
+	struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID;
+
+	return xen_tmem_new_pool(uuid_private, 0, pagesize);
+}
+
+static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize)
+{
+	struct tmem_pool_uuid shared_uuid;
+
+	shared_uuid.uuid_lo = *(u64 *)uuid;
+	shared_uuid.uuid_hi = *(u64 *)(&uuid[8]);
+	return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize);
+}
+
+static int use_cleancache = 1;
+
+static int __init no_cleancache(char *s)
+{
+	use_cleancache = 0;
+	return 1;
+}
+
+__setup("nocleancache", no_cleancache);
+
+static struct cleancache_ops tmem_cleancache_ops = {
+	.put_page = tmem_cleancache_put_page,
+	.get_page = tmem_cleancache_get_page,
+	.flush_page = tmem_cleancache_flush_page,
+	.flush_inode = tmem_cleancache_flush_inode,
+	.flush_fs = tmem_cleancache_flush_fs,
+	.init_shared_fs = tmem_cleancache_init_shared_fs,
+	.init_fs = tmem_cleancache_init_fs
+};
+
+static int __init xen_tmem_init(void)
+{
+	struct cleancache_ops old_ops;
+
+	if (!xen_domain())
+		return 0;
+#ifdef CONFIG_CLEANCACHE
+	BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
+	if (tmem_enabled && use_cleancache) {
+		char *s = "";
+		old_ops = cleancache_register_ops(&tmem_cleancache_ops);
+		if (old_ops.init_fs != NULL)
+			s = " (WARNING: cleancache_ops overridden)";
+		printk(KERN_INFO "cleancache enabled, RAM provided by "
+				 "Xen Transcendent Memory%s\n", s);
+	}
+#endif
+	return 0;
+}
+
+module_init(xen_tmem_init)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c67703195..8d7f3e69ae29 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 
 int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
+	dentry_unhash(d);
 	return v9fs_remove(i, d, 1);
 }
 
@@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
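
This 9p change is the first of many near-identical hunks below (affs, afs, autofs4, bfs, coda, configfs, ecryptfs, and others): the VFS used to unhash the victim dentry in vfs_rmdir() and vfs_rename_dir() itself, and this series pushes that call down into each filesystem. As a sketch of the recurring shape, with hypothetical somefs_* names standing in for any affected filesystem:

#include <linux/fs.h>
#include <linux/dcache.h>

/* hypothetical filesystem-specific helpers, for illustration only */
static int somefs_do_rmdir(struct inode *dir, struct dentry *dentry);
static int somefs_do_rename(struct inode *old_dir, struct dentry *old_dentry,
			    struct inode *new_dir, struct dentry *new_dentry);

static int somefs_rmdir(struct inode *dir, struct dentry *dentry)
{
	dentry_unhash(dentry);	/* previously done for us by vfs_rmdir() */
	return somefs_do_rmdir(dir, dentry);
}

static int somefs_rename(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry)
{
	/* only a directory that is being overwritten needs unhashing */
	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
		dentry_unhash(new_dentry);
	return somefs_do_rename(old_dir, old_dentry, new_dir, new_dentry);
}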
diff --git a/fs/Kconfig b/fs/Kconfig
index 979992dcb386..19891aab9c6e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	bool
+	tristate
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
@@ -121,6 +121,20 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
+config TMPFS_POSIX_ACL
+	bool "Tmpfs POSIX Access Control Lists"
+	depends on TMPFS
+	select TMPFS_XATTR
+	select GENERIC_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
 config TMPFS_XATTR
 	bool "Tmpfs extended attributes"
 	depends on TMPFS
@@ -133,22 +147,9 @@ config TMPFS_XATTR
 	  Currently this enables support for the trusted.* and
 	  security.* namespaces.
 
-	  If unsure, say N.
-
 	  You need this for POSIX ACL support on tmpfs.
 
-config TMPFS_POSIX_ACL
-	bool "Tmpfs POSIX Access Control Lists"
-	depends on TMPFS_XATTR
-	select GENERIC_ACL
-	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the POSIX ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N.
+	  If unsure, say N.
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd8..03330e2e390c 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 		 dentry->d_inode->i_ino,
 		 (int)dentry->d_name.len, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	return affs_remove_header(dentry);
 }
 
@@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
 		 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f24927..2c4e05160042 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
@@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct key *key;
 	int ret;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	vnode = AFS_FS_I(old_dentry->d_inode);
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..87d95a8cddbc 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
+	dentry_unhash(dentry);
+
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
 		if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd9047..c7d1d06b0483 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct bfs_sb_info *info;
 	int error = -ENOENT;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_bh = new_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96fcfa522dab..4f9893243dae 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -11,6 +11,7 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
+#include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
@@ -2016,6 +2017,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
 	set_page_extent_mapped(page);
 
+	if (!PageUptodate(page)) {
+		if (cleancache_get_page(page) == 0) {
+			BUG_ON(blocksize != PAGE_SIZE);
+			goto out;
+		}
+	}
+
 	end = page_end;
 	while (1) {
 		lock_extent(tree, start, end, GFP_NOFS);
@@ -2149,6 +2157,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		cur = cur + iosize;
 		page_offset += iosize;
 	}
+out:
 	if (!nr) {
 		if (!PageError(page))
 			SetPageUptodate(page);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0ac712efcdf2..be4ffa12f3ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,6 +39,7 @@
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/cleancache.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -624,6 +625,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_root = root_dentry;
 
 	save_mount_options(sb, data);
+	cleancache_init_fs(sb);
 	return 0;
 
 fail_close:
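
Taken together, the two btrfs hunks above show the whole per-filesystem contract: opt in once at mount time with cleancache_init_fs(), then consult cleancache_get_page() before issuing a disk read. A hedged sketch of the same wiring in a generic filesystem follows; the myfs_* names are hypothetical and error handling is elided:

#include <linux/cleancache.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/pagemap.h>

/* hypothetical get_block_t for the fallback read path */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

/* mount time: opt in; a negative pool id quietly disables the hooks */
static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* ... normal superblock setup elided ... */
	cleancache_init_fs(sb);		/* pool id is stashed in the sb */
	return 0;
}

/* read path: consult transcendent memory before touching the disk */
static int myfs_readpage(struct file *file, struct page *page)
{
	if (!PageUptodate(page) && cleancache_get_page(page) == 0) {
		/* hit: the data was copied into the page, skip the read */
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}
	/* miss: fall through to an ordinary buffered read */
	return mpage_readpage(page, myfs_get_block);
}

The flush-side calls (such as cleancache_flush_inode() in the fs/buffer.c hunk below) come via the VFS for well-behaved filesystems, per Documentation/vm/cleancache.txt above.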
diff --git a/fs/buffer.c b/fs/buffer.c
index a08bb8e61c6f..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -2331,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
 */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+	int ret;
 
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
-		/* page got truncated out from underneath us */
-		unlock_page(page);
-		goto out;
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2361,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-	if (unlikely(ret)) {
-		unlock_page(page);
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else /* -ENOSPC, -EIO, etc */
-			ret = VM_FAULT_SIGBUS;
-	} else
-		ret = VM_FAULT_LOCKED;
-
-out:
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	/*
+	 * Freezing in progress? We check after the page is marked dirty and
+	 * with page lock held so if the test here fails, we are sure freezing
+	 * code will wait during syncing until the page fault is done - at that
+	 * point page will be dirty and unlocked so freezing code will write it
+	 * and writeprotect it again.
+	 */
+	set_page_dirty(page);
+	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	return 0;
+out_unlock:
+	unlock_page(page);
 	return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		   get_block_t get_block)
+{
+	int ret;
+	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+	/*
+	 * This check is racy but catches the common case. The check in
+	 * __block_page_mkwrite() is reliable.
+	 */
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
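
The refactoring above splits the old block_page_mkwrite() into __block_page_mkwrite(), which reports errors as -EFAULT/-EAGAIN/etc. and makes the freeze handling visible to callers, plus a thin wrapper that converts back to VM_FAULT_* codes. Per the new comment, a filesystem providing its own ->page_mkwrite() on top of the helper would do the racy vfs_check_frozen() itself. A sketch, where myfs_get_block is a hypothetical get_block_t and block_page_mkwrite_return() is the helper this series adds to include/linux/buffer_head.h:

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* hypothetical block-mapping callback, for illustration only */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	int ret;

	/* racy fast-path check; the recheck in __block_page_mkwrite()
	 * under the page lock is the reliable one */
	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
	ret = __block_page_mkwrite(vma, vmf, myfs_get_block);
	/* translate -EFAULT/-EAGAIN/-ENOMEM/... into VM_FAULT_* codes */
	return block_page_mkwrite_return(ret);
}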
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121e..a46126fd5735 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 	int error;
 
+	dentry_unhash(de);
+
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
 		/* VFS may delete the child */
@@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int new_length = new_dentry->d_name.len;
 	int error;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
 			     (const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b6de3a..9d17d350abc5 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
 	int ret;
 
+	dentry_unhash(dentry);
+
 	if (dentry->d_parent == configfs_sb->s_root)
 		return -EPERM;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4d4cc6a90cd5..227b409b8406 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,6 +521,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct dentry *lower_dir_dentry;
 	int rc;
 
+	dentry_unhash(dentry);
+
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
@@ -571,6 +573,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap = NULL;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
 	dget(lower_old_dentry);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c6a9e0eadc1..aad153ef6b78 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,6 +36,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/log2.h>
+#include <linux/cleancache.h>
 
 #include <asm/uaccess.h>
 
@@ -1367,6 +1368,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 	} else {
 		ext3_msg(sb, KERN_INFO, "using internal journal");
 	}
+	cleancache_init_fs(sb);
 	return res;
 }
 
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
362} 362}
363 363
364/** 364/**
365 * ext4_add_groupblocks() -- Add given blocks to an existing group
366 * @handle: handle to this transaction
367 * @sb: super block
368 * @block: start physcial block to add to the block group
369 * @count: number of blocks to free
370 *
371 * This marks the blocks as free in the bitmap. We ask the
372 * mballoc to reload the buddy after this by setting group
373 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
374 */
375void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
376 ext4_fsblk_t block, unsigned long count)
377{
378 struct buffer_head *bitmap_bh = NULL;
379 struct buffer_head *gd_bh;
380 ext4_group_t block_group;
381 ext4_grpblk_t bit;
382 unsigned int i;
383 struct ext4_group_desc *desc;
384 struct ext4_sb_info *sbi = EXT4_SB(sb);
385 int err = 0, ret, blk_free_count;
386 ext4_grpblk_t blocks_freed;
387 struct ext4_group_info *grp;
388
389 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
390
391 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
392 grp = ext4_get_group_info(sb, block_group);
393 /*
394 * Check to see if we are freeing blocks across a group
395 * boundary.
396 */
397 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
398 goto error_return;
399 }
400 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
401 if (!bitmap_bh)
402 goto error_return;
403 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
404 if (!desc)
405 goto error_return;
406
407 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
408 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
409 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
410 in_range(block + count - 1, ext4_inode_table(sb, desc),
411 sbi->s_itb_per_group)) {
412 ext4_error(sb, "Adding blocks in system zones - "
413 "Block = %llu, count = %lu",
414 block, count);
415 goto error_return;
416 }
417
418 /*
419 * We are about to add blocks to the bitmap,
420 * so we need undo access.
421 */
422 BUFFER_TRACE(bitmap_bh, "getting undo access");
423 err = ext4_journal_get_undo_access(handle, bitmap_bh);
424 if (err)
425 goto error_return;
426
427 /*
428 * We are about to modify some metadata. Call the journal APIs
429 * to unshare ->b_data if a currently-committing transaction is
430 * using it
431 */
432 BUFFER_TRACE(gd_bh, "get_write_access");
433 err = ext4_journal_get_write_access(handle, gd_bh);
434 if (err)
435 goto error_return;
436 /*
437 * make sure we don't allow a parallel init on other groups in the
438 * same buddy cache
439 */
440 down_write(&grp->alloc_sem);
441 for (i = 0, blocks_freed = 0; i < count; i++) {
442 BUFFER_TRACE(bitmap_bh, "clear bit");
443 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
444 bit + i, bitmap_bh->b_data)) {
445 ext4_error(sb, "bit already cleared for block %llu",
446 (ext4_fsblk_t)(block + i));
447 BUFFER_TRACE(bitmap_bh, "bit already cleared");
448 } else {
449 blocks_freed++;
450 }
451 }
452 ext4_lock_group(sb, block_group);
453 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
454 ext4_free_blks_set(sb, desc, blk_free_count);
455 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
456 ext4_unlock_group(sb, block_group);
457 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
458
459 if (sbi->s_log_groups_per_flex) {
460 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
461 atomic_add(blocks_freed,
462 &sbi->s_flex_groups[flex_group].free_blocks);
463 }
464 /*
465 * request to reload the buddy with the
466 * new bitmap information
467 */
468 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
469 grp->bb_free += blocks_freed;
470 up_write(&grp->alloc_sem);
471
472 /* We dirtied the bitmap block */
473 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
474 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
475
476 /* And the group descriptor block */
477 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
478 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
479 if (!err)
480 err = ret;
481
482error_return:
483 brelse(bitmap_bh);
484 ext4_std_error(sb, err);
485 return;
486}
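For orientation, a hedged toy version (editor's sketch, not part of the patch) of the bit-clearing loop above: only bits that were actually set count toward blocks_freed, so a double-free is reported rather than double-counted.

static unsigned int clear_range(unsigned long *bitmap, int start, int count)
{
	unsigned int freed = 0;
	int i;

	for (i = 0; i < count; i++)
		if (test_and_clear_bit(start + i, bitmap))
			freed++;	/* bit was set: a block really came free */
	return freed;
}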

 /**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	    EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope. Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }

 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;

 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
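The new flags argument threads an allocation policy bit all the way down to mballoc. A hedged sketch of the new calling convention (error handling trimmed, caller hypothetical): a metadata allocation that is allowed to dip into the root reserve, as punch-hole tree splits will be, would look like:

	count = 1;
	block = ext4_new_meta_blocks(handle, inode, goal,
				     EXT4_MB_USE_ROOT_BLOCKS, &count, &err);
	if (!block)
		goto fail;	/* err holds the reason, e.g. -ENOSPC */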
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS	0x1000

 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT	0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE	0x0040

 /*
  * Flags used by ext4_free_blocks
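A hedged composition example for the two new flags (this mirrors how the fallocate and punch-hole paths later in this series are expected to use them; the exact call sites are not shown in this section):

	/* fallocate-style mapping request: uninitialized, no size rounding */
	unsigned int flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
			     EXT4_GET_BLOCKS_NO_NORMALIZE;

	ret = ext4_map_blocks(handle, inode, &map, flags);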
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;

 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };

 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100

 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */

+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
 	unsigned long		li_state;
-
-	wait_queue_head_t	li_wait_daemon;
-	wait_queue_head_t	li_wait_task;
-	struct timer_list	li_timer;
-	struct task_struct	*li_task;
-
 	struct list_head	li_request_list;
 	struct mutex		li_list_mtx;
 };
@@ -1615,6 +1639,67 @@ struct ext4_features {
 };

 /*
1642 * This structure will be used for multiple mount protection. It will be
1643 * written into the block number saved in the s_mmp_block field in the
1644 * superblock. Programs that check MMP should assume that if
1645 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
1646 * to use the filesystem, regardless of how old the timestamp is.
1647 */
1648#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
1649#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
1650#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
1651#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
1652
1653struct mmp_struct {
1654 __le32 mmp_magic; /* Magic number for MMP */
1655 __le32 mmp_seq; /* Sequence no. updated periodically */
1656
1657 /*
1658 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
1659 * purposes and do not affect the correctness of the algorithm
1660 */
1661 __le64 mmp_time; /* Time last updated */
1662 char mmp_nodename[64]; /* Node which last updated MMP block */
1663 char mmp_bdevname[32]; /* Bdev which last updated MMP block */
1664
1665 /*
1666 * mmp_check_interval is used to verify if the MMP block has been
1667 * updated on the block device. The value is updated based on the
1668 * maximum time to write the MMP block during an update cycle.
1669 */
1670 __le16 mmp_check_interval;
1671
1672 __le16 mmp_pad1;
1673 __le32 mmp_pad2[227];
1674};
1675
1676/* arguments passed to the mmp thread */
1677struct mmpd_data {
1678 struct buffer_head *bh; /* bh from initial read_mmp_block() */
1679 struct super_block *sb; /* super block of the fs */
1680};
1681
1682/*
1683 * Check interval multiplier
1684 * The MMP block is written every update interval and initially checked every
1685 * update interval x the multiplier (the value is then adapted based on the
1686 * write latency). The reason is that writes can be delayed under load and we
1687 * don't want readers to incorrectly assume that the filesystem is no longer
1688 * in use.
1689 */
1690#define EXT4_MMP_CHECK_MULT 2UL
1691
1692/*
1693 * Minimum interval for MMP checking in seconds.
1694 */
1695#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
1696
1697/*
1698 * Maximum interval for MMP checking in seconds.
1699 */
1700#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
1701
1702/*
  * Function prototypes
  */

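A hedged sketch (editor's, not from the patch) of how the MMP constants above are meant to be consumed; mmp_in_use() is a hypothetical helper, but the state codes and the clamping bounds are the ones defined above:

static int mmp_in_use(__u32 seq)
{
	if (seq == EXT4_MMP_SEQ_CLEAN)
		return 0;	/* last user unmounted cleanly */
	if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
		return 1;	/* being fscked, or unknown code: never safe */
	return 2;	/* live sequence: re-read later and compare */
}

	/* polling interval derived from the update interval, then clamped */
	check = EXT4_MMP_CHECK_MULT * update_interval;
	if (check < EXT4_MMP_MIN_CHECK_INTERVAL)
		check = EXT4_MMP_MIN_CHECK_INTERVAL;
	if (check > EXT4_MMP_MAX_CHECK_INTERVAL)
		check = EXT4_MMP_MAX_CHECK_INTERVAL;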
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 				 ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);

 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 		__LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+					__LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+				loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       int len,
 			       struct writeback_control *wbc);

+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
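The new entry point is expected to be wired into mount; a hedged sketch of the call site (the actual hook lives in super.c, which is not shown in this section):

	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
	    !(sb->s_flags & MS_RDONLY))
		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
			goto failed_mount;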
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@

 #include <trace/events/ext4.h>

-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh)
-{
-	int err = 0;
-
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_get_undo_access(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__, bh,
-						  handle, err);
-	}
-	return err;
-}
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn,
 			       struct buffer_head *bh, handle_t *handle, int err);

-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh);

@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
 			      handle_t *handle, struct super_block *sb);

-#define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
 	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
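With the undo-access helper and its wrapper gone (the old balloc.c ext4_add_groupblocks() above appears to have been its last user), the remaining bitmap-editing pattern is plain write access plus group-locked, idempotent bit operations. A hedged sketch:

	err = ext4_journal_get_write_access(handle, bitmap_bh);
	if (err)
		goto out;
	/* ... edit bits under ext4_lock_group() / atomic bit ops ... */
	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);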
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@

 #include <trace/events/ext4.h>

+static int ext4_split_extent(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_map_blocks *map,
+				int split_flag,
+				int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err, unsigned int flags)
 {
 	ext4_fsblk_t goal, newblock;

 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
 	return newblock;
 }

@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	}
 	ext_debug("\n");
 }
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+
+	if (depth != level) {
+		struct ext4_extent_idx *idx;
+		idx = path[level].p_idx;
+		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+					le32_to_cpu(idx->ei_block),
+					ext4_idx_pblock(idx),
+					newblock);
+			idx++;
+		}
+
+		return;
+	}
+
+	ex = path[depth].p_ext;
+	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_pblock(ex),
+				ext4_ext_is_uninitialized(ex),
+				ext4_ext_get_actual_len(ex),
+				newblock);
+		ex++;
+	}
+}
+
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif

 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
  * - initializes subtree
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
-			  struct ext4_ext_path *path,
-			  struct ext4_extent *newext, int at)
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
 {
 	struct buffer_head *bh = NULL;
 	int depth = ext_depth(inode);
 	struct ext4_extent_header *neh;
 	struct ext4_extent_idx *fidx;
-	struct ext4_extent *ex;
 	int i = at, k, m, a;
 	ext4_fsblk_t newblock, oldblock;
 	__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
 		newblock = ext4_ext_new_meta_block(handle, inode, path,
-						   newext, &err);
+						   newext, &err, flags);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
-	ex = EXT_FIRST_EXTENT(neh);

 	/* move remainder of path[depth] to the new leaf */
 	if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 	/* start copy from next extent */
-	/* TODO: we could do it by single memmove */
-	m = 0;
-	path[depth].p_ext++;
-	while (path[depth].p_ext <=
-			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
-				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext4_ext_pblock(path[depth].p_ext),
-				ext4_ext_is_uninitialized(path[depth].p_ext),
-				ext4_ext_get_actual_len(path[depth].p_ext),
-				newblock);
-		/*memmove(ex++, path[depth].p_ext++,
-			sizeof(struct ext4_extent));
-		neh->eh_entries++;*/
-		path[depth].p_ext++;
-		m++;
-	}
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
 	if (m) {
-		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
 		le16_add_cpu(&neh->eh_entries, m);
 	}

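The counting idiom above is terse: the pointer difference counts the entries strictly after *p_ext, and the post-increment leaves the pointer at the first entry to copy. An editor's toy model of the same idiom outside the kernel:

#include <string.h>

static int move_tail(int *cur, int *last, int *dst)
{
	int m = last - cur++;	/* entries strictly after *cur */

	if (m)
		memmove(dst, cur, sizeof(*cur) * m);	/* copy them out */
	return m;
}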
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,

 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
 				i, newblock, le32_to_cpu(border), oldblock);
-		/* copy indexes */
-		m = 0;
-		path[i].p_idx++;

-		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
-				EXT_MAX_INDEX(path[i].p_hdr));
+		/* move remainder of path[i] to the new index block */
 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
 					EXT_LAST_INDEX(path[i].p_hdr))) {
 			EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			err = -EIO;
 			goto cleanup;
 		}
-		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
-			ext_debug("%d: move %d:%llu in new index %llu\n", i,
-					le32_to_cpu(path[i].p_idx->ei_block),
-					ext4_idx_pblock(path[i].p_idx),
-					newblock);
-			/*memmove(++fidx, path[i].p_idx++,
-				sizeof(struct ext4_extent_idx));
-			neh->eh_entries++;
-			BUG_ON(neh->eh_entries > neh->eh_max);*/
-			path[i].p_idx++;
-			m++;
-		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
 		if (m) {
-			memmove(++fidx, path[i].p_idx - m,
+			memmove(++fidx, path[i].p_idx,
 				sizeof(struct ext4_extent_idx) * m);
 			le16_add_cpu(&neh->eh_entries, m);
 		}
@@ -1056,8 +1073,9 @@ cleanup:
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-				 struct ext4_ext_path *path,
-				 struct ext4_extent *newext)
+				 unsigned int flags,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock;
 	int err = 0;

-	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+		newext, &err, flags);
 	if (newblock == 0)
 		return err;

@@ -1140,8 +1159,9 @@ out:
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-				    struct ext4_ext_path *path,
-				    struct ext4_extent *newext)
+				    unsigned int flags,
+				    struct ext4_ext_path *path,
+				    struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
 	if (EXT_HAS_FREE_INDEX(curp)) {
 		/* if we found index with free entry, then use that
 		 * entry: create all needed subtree and add new leaf */
-		err = ext4_ext_split(handle, inode, path, newext, i);
+		err = ext4_ext_split(handle, inode, flags, path, newext, i);
 		if (err)
 			goto out;

@@ -1174,7 +1194,8 @@ repeat:
 			err = PTR_ERR(path);
 	} else {
 		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, path, newext);
+		err = ext4_ext_grow_indepth(handle, inode, flags,
+					    path, newext);
 		if (err)
 			goto out;

@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *ex)
 {
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
 }

 /*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+				  struct ext4_ext_path *path,
+				  struct ext4_extent *ex) {
+	struct ext4_extent_header *eh;
+	unsigned int depth;
+	int merge_done = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	if (ex > EXT_FIRST_EXTENT(eh))
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+	if (!merge_done)
+		ret = ext4_ext_try_to_merge_right(inode, path, ex);
+
+	return ret;
+}
+
+/*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
  *
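A hedged usage sketch for the wrapper above, mirroring how ext4_split_extent_at() later in this patch calls it: after an in-place change to *ex, try to coalesce; merging with the left neighbour is attempted first, merging right is the fallback.

	ext4_ext_try_to_merge(inode, path, ex);
	err = ext4_ext_dirty(handle, inode, path + depth);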
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	int depth, len, err;
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
+	int flags = 0;

 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
 	depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }

 /*
+ * ext4_ext_check_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer.  If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The file's inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-			struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+	struct ext4_ext_cache *ex){
 	struct ext4_ext_cache *cex;
+	struct ext4_sb_info *sbi;
 	int ret = 0;

 	/*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
+	sbi = EXT4_SB(inode->i_sb);

 	/* has cache valid data? */
 	if (cex->ec_len == 0)
 		goto errout;

 	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		ex->ee_block = cpu_to_le32(cex->ec_block);
-		ext4_ext_store_pblock(ex, cex->ec_start);
-		ex->ee_len = cpu_to_le16(cex->ec_len);
+		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
 		ext_debug("%u cached by %u:%u:%llu\n",
 				block,
 				cex->ec_block, cex->ec_len, cex->ec_start);
 		ret = 1;
 	}
 errout:
+	if (!ret)
+		sbi->extent_cache_misses++;
+	else
+		sbi->extent_cache_hits++;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	return ret;
2038 2102
2039/* 2103/*
2104 * ext4_ext_in_cache()
2105 * Checks to see if the given block is in the cache.
2106 * If it is, the cached extent is stored in the given
2107 * extent pointer.
2108 *
2109 * @inode: The files inode
2110 * @block: The block to look for in the cache
2111 * @ex: Pointer where the cached extent will be stored
2112 * if it contains block
2113 *
2114 * Return 0 if cache is invalid; 1 if the cache is valid
2115 */
2116static int
2117ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2118 struct ext4_extent *ex)
2119{
2120 struct ext4_ext_cache cex;
2121 int ret = 0;
2122
2123 if (ext4_ext_check_cache(inode, block, &cex)) {
2124 ex->ee_block = cpu_to_le32(cex.ec_block);
2125 ext4_ext_store_pblock(ex, cex.ec_start);
2126 ex->ee_len = cpu_to_le16(cex.ec_len);
2127 ret = 1;
2128 }
2129
2130 return ret;
2131}
2132
2133
2134/*
2040 * ext4_ext_rm_idx: 2135 * ext4_ext_rm_idx:
2041 * removes index from the index block. 2136 * removes index from the index block.
2042 * It's used in truncate case only, thus all requests are for 2137 * It's used in truncate case only, thus all requests are for
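A hedged sketch of the hole-aware call pattern the split enables (assuming, as elsewhere in ext4 of this vintage, that a cached hole carries ec_start == 0):

	struct ext4_ext_cache cex;

	if (ext4_ext_check_cache(inode, lblk, &cex) && cex.ec_start == 0) {
		/* cached hole: lblk .. cex.ec_block + cex.ec_len - 1
		 * is unmapped, and its size is now visible to the caller */
	}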
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
-			from, to, le32_to_cpu(ex->ee_block), ee_len);
+		/* head removal */
+		ext4_lblk_t num;
+		ext4_fsblk_t start;
+
+		num = to - from;
+		start = ext4_ext_pblock(ex);
+
+		ext_debug("free first %u blocks starting %llu\n", num, start);
+		ext4_free_blocks(handle, inode, 0, start, num, flags);
+
 	} else {
 		printk(KERN_INFO "strange request: removal(2) "
 			"%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	return 0;
 }

+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode:  The file's inode
+ * @path:   The path to the leaf
+ * @start:  The first block to remove
+ * @end:    The last block to remove
+ */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, ext4_lblk_t start)
+		struct ext4_ext_path *path, ext4_lblk_t start,
+		ext4_lblk_t end)
 {
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	struct ext4_map_blocks map;

 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf\n", start);
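The new @end parameter serves two callers; a hedged sketch of both patterns (caller names and variables illustrative):

	/* truncate: remove everything from start to the end of the tree */
	err = ext4_ext_rm_leaf(handle, inode, path, start, EXT_MAX_BLOCK);

	/* punch hole: remove only the bounded range first..last */
	err = ext4_ext_rm_leaf(handle, inode, path, first, last);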
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		path[depth].p_ext = ex;

 		a = ex_ee_block > start ? ex_ee_block : start;
-		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
-			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+		b = ex_ee_block+ex_ee_len - 1 < end ?
+			ex_ee_block+ex_ee_len - 1 : end;

 		ext_debug("  border %u:%u\n", a, b);

-		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
-			block = 0;
-			num = 0;
-			BUG();
+		/* If this extent is beyond the end of the hole, skip it */
+		if (end <= ex_ee_block) {
+			ex--;
+			ex_ee_block = le32_to_cpu(ex->ee_block);
+			ex_ee_len = ext4_ext_get_actual_len(ex);
+			continue;
+		} else if (a != ex_ee_block &&
+			b != ex_ee_block + ex_ee_len - 1) {
+			/*
+			 * If this is a truncate, then this condition should
+			 * never happen because at least one of the end points
+			 * needs to be on the edge of the extent.
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				block = 0;
+				num = 0;
+				err = -EIO;
+				goto out;
+			}
+			/*
+			 * else this is a hole punch, so the extent needs to
+			 * be split since neither edge of the hole is on the
+			 * extent edge
+			 */
+			else {
+				map.m_pblk = ext4_ext_pblock(ex);
+				map.m_lblk = ex_ee_block;
+				map.m_len = b - ex_ee_block;
+
+				err = ext4_split_extent(handle,
+					inode, path, &map, 0,
+					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+					EXT4_GET_BLOCKS_PRE_IO);
+
+				if (err < 0)
+					goto out;
+
+				ex_ee_len = ext4_ext_get_actual_len(ex);
+
+				b = ex_ee_block+ex_ee_len - 1 < end ?
+					ex_ee_block+ex_ee_len - 1 : end;
+
+				/* Then remove tail of this extent */
+				block = ex_ee_block;
+				num = a - block;
+			}
 		} else if (a != ex_ee_block) {
 			/* remove tail of the extent */
 			block = ex_ee_block;
 			num = a - block;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
 			/* remove head of the extent */
-			block = a;
-			num = b - a;
-			/* there is no "make a hole" API yet */
-			BUG();
+			block = b;
+			num = ex_ee_block + ex_ee_len - b;
+
+			/*
+			 * If this is a truncate, this condition
+			 * should never happen
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		} else {
 			/* remove whole extent: excellent! */
 			block = ex_ee_block;
 			num = 0;
-			BUG_ON(a != ex_ee_block);
-			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+			if (a != ex_ee_block) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
+
+			if (b != ex_ee_block + ex_ee_len - 1) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		}

 		/*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (num == 0) {
 			/* this extent is removed; mark slot entirely unused */
 			ext4_ext_store_pblock(ex, 0);
-			le16_add_cpu(&eh->eh_entries, -1);
+		} else if (block != ex_ee_block) {
+			/*
+			 * If this was a head removal, then we need to update
+			 * the physical block since it is now at a different
+			 * location
+			 */
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
 		}

 		ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;

+		/*
+		 * If the extent was completely released,
+		 * we need to remove it from the leaf
+		 */
+		if (num == 0) {
+			if (end != EXT_MAX_BLOCK) {
+				/*
+				 * For hole punching, we need to scoot all the
+				 * extents up when an extent is removed so that
+				 * we don't have blank extents in the middle
+				 */
+				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+					sizeof(struct ext4_extent));
+
+				/* Now get rid of the one at the end */
+				memset(EXT_LAST_EXTENT(eh), 0,
+					sizeof(struct ext4_extent));
+			}
+			le16_add_cpu(&eh->eh_entries, -1);
+		}
+
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
 				ext4_ext_pblock(ex));
 		ex--;
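Following the head-removal arithmetic above literally (editor's worked example, values illustrative):

	/*
	 * Extent (lblk 100, len 8, pblk 500); hole ends at end = 102, so
	 * a = 100, b = 102.  Then block = b = 102, num = 100 + 8 - 102 = 6,
	 * and the survivor is re-based to pblk 500 + (b - a) = 502.
	 */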
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }

-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
 	while (i >= 0 && err == 0) {
 		if (i == depth) {
 			/* this is leaf block */
-			err = ext4_ext_rm_leaf(handle, inode, path, start);
+			err = ext4_ext_rm_leaf(handle, inode, path,
+					start, end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 	return ret;
 }

2742/*
2743 * used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
2756 * @split: the logical block where the extent is splitted.
2757 * @split_flags: indicates if the extent could be zeroout if split fails, and
2758 * the states(init or uninit) of new extents.
2759 * @flags: flags used to insert new extent to extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2763 * of which are deterimined by split_flag.
2764 *
2765 * There are two cases:
2766 * a> the extent are splitted into two extent.
2767 * b> split is not needed, and just mark the extent.
2768 *
2769 * return 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785 ext_debug("ext4_split_extent_at: inode %lu, logical"
2786 "block %llu\n", inode->i_ino, (unsigned long long)split);
2787
2788 ext4_ext_show_leaf(inode, path);
2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2795
2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2797
2798 err = ext4_ext_get_access(handle, inode, path + depth);
2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with
2805 * then we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2810 else
2811 ext4_ext_mark_initialized(ex);
2812
2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2814 ext4_ext_try_to_merge(inode, path, ex);
2815
2816 err = ext4_ext_dirty(handle, inode, path + depth);
2817 goto out;
2818 }
2819
2820 /* case a */
2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2822 ex->ee_len = cpu_to_le16(split - ee_block);
2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2824 ext4_ext_mark_uninitialized(ex);
2825
2826 /*
2827 * path may lead to new leaf, not to original leaf any more
2828 * after ext4_ext_insert_extent() returns,
2829 */
2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2833
2834 ex2 = &newex;
2835 ex2->ee_block = cpu_to_le32(split);
2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2837 ext4_ext_store_pblock(ex2, newblock);
2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2839 ext4_ext_mark_uninitialized(ex2);
2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
2847 ex->ee_len = cpu_to_le16(ee_len);
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
2863
2864/*
2865 * ext4_split_extent() splits an extent and marks the extent which is covered
2866 * by @map as split_flag indicates
2867 *
2868 * It may result in splitting the extent into multiple extents (up to three)
2869 * There are three possibilities:
2870 * a> There is no split required
2871 * b> Splits into two extents: Split is happening at either end of the extent
2872 * c> Splits into three extents: Someone is splitting in the middle of the extent
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2906 }
2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2929}
2930
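A hedged usage sketch for ext4_split_extent_at() above: carving an initialized tail out of an uninitialized extent at logical block split, preferring a zeroout over failure if the tree split hits ENOSPC. The variable names are illustrative.

	err = ext4_split_extent_at(handle, inode, path, split,
				   EXT4_EXT_MARK_UNINIT1 |
				   EXT4_EXT_MAY_ZEROOUT, 0);
	if (err < 0)
		goto out;	/* [a, split) stays uninit, [split, b] is live */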
 #define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
-	struct ext4_extent *ex, newex, orig_ex;
-	struct ext4_extent *ex1 = NULL;
-	struct ext4_extent *ex2 = NULL;
-	struct ext4_extent *ex3 = NULL;
-	struct ext4_extent_header *eh;
+	struct ext4_map_blocks split_map;
+	struct ext4_extent zero_ex;
+	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
-	ext4_fsblk_t newblock;
 	int err = 0;
-	int ret = 0;
-	int may_zeroout;
+	int split_flag = 0;

 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2567 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2568 2963
2569 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2570 eh = path[depth].p_hdr;
2571 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2572 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2573 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2574 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2575 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2576
2577 ex2 = ex;
2578 orig_ex.ee_block = ex->ee_block;
2579 orig_ex.ee_len = cpu_to_le16(ee_len);
2580 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2581 2969
2970 WARN_ON(map->m_lblk < ee_block);
2582 /* 2971 /*
2583 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
2584 * zeroout only if extent is fully inside i_size or new_size. 2973 * zeroout only if extent is fully inside i_size or new_size.
2585 */ 2974 */
2586 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2587 2976
2588 err = ext4_ext_get_access(handle, inode, path + depth);
2589 if (err)
2590 goto out;
2591 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zeroout directly */ 2977 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zeroout directly */
2592 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2593 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2594 if (err) 2981 if (err)
2595 goto fix_extent_len;
2596 /* update the extent length and mark as initialized */
2597 ex->ee_block = orig_ex.ee_block;
2598 ex->ee_len = orig_ex.ee_len;
2599 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2600 ext4_ext_dirty(handle, inode, path + depth);
2601 /* zeroed the full extent */
2602 return allocated;
2603 }
2604
2605 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2606 if (map->m_lblk > ee_block) {
2607 ex1 = ex;
2608 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2609 ext4_ext_mark_uninitialized(ex1);
2610 ex2 = &newex;
2611 }
2612 /*
2613 * for sanity, update the length of the ex2 extent before
2614 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2615 * overlap of blocks.
2616 */
2617 if (!ex1 && allocated > map->m_len)
2618 ex2->ee_len = cpu_to_le16(map->m_len);
2619 /* ex3: to ee_block + ee_len : uninitialised */
2620 if (allocated > map->m_len) {
2621 unsigned int newdepth;
2622 /* If extent has less than EXT4_EXT_ZERO_LEN zeroout directly */
2623 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2624 /*
2625 * map->m_lblk == ee_block is handled by the zeroout
2626 * at the beginning.
2627 * Mark first half uninitialized.
2628 * Mark second half initialized and zero out the
2629 * initialized extent
2630 */
2631 ex->ee_block = orig_ex.ee_block;
2632 ex->ee_len = cpu_to_le16(ee_len - allocated);
2633 ext4_ext_mark_uninitialized(ex);
2634 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2635 ext4_ext_dirty(handle, inode, path + depth);
2636
2637 ex3 = &newex;
2638 ex3->ee_block = cpu_to_le32(map->m_lblk);
2639 ext4_ext_store_pblock(ex3, newblock);
2640 ex3->ee_len = cpu_to_le16(allocated);
2641 err = ext4_ext_insert_extent(handle, inode, path,
2642 ex3, 0);
2643 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err)
2646 goto fix_extent_len;
2647 ex->ee_block = orig_ex.ee_block;
2648 ex->ee_len = orig_ex.ee_len;
2649 ext4_ext_store_pblock(ex,
2650 ext4_ext_pblock(&orig_ex));
2651 ext4_ext_dirty(handle, inode, path + depth);
2652 /* blocks available from map->m_lblk */
2653 return allocated;
2654
2655 } else if (err)
2656 goto fix_extent_len;
2657
2658 /*
2659 * We need to zero out the second half because
2660 * a fallocate request can update file size and
2661 * converting the second half to initialized extent
2662 * implies that we can leak some junk data to user
2663 * space.
2664 */
2665 err = ext4_ext_zeroout(inode, ex3);
2666 if (err) {
2667 /*
2668 * We should actually mark the
2669 * second half as uninit and return error
2670 * Insert would have changed the extent
2671 */
2672 depth = ext_depth(inode);
2673 ext4_ext_drop_refs(path);
2674 path = ext4_ext_find_extent(inode, map->m_lblk,
2675 path);
2676 if (IS_ERR(path)) {
2677 err = PTR_ERR(path);
2678 return err;
2679 }
2680 /* get the second half extent details */
2681 ex = path[depth].p_ext;
2682 err = ext4_ext_get_access(handle, inode,
2683 path + depth);
2684 if (err)
2685 return err;
2686 ext4_ext_mark_uninitialized(ex);
2687 ext4_ext_dirty(handle, inode, path + depth);
2688 return err;
2689 }
2690
2691 /* zeroed the second half */
2692 return allocated;
2693 }
2694 ex3 = &newex;
2695 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2696 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2697 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2698 ext4_ext_mark_uninitialized(ex3);
2699 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2700 if (err == -ENOSPC && may_zeroout) {
2701 err = ext4_ext_zeroout(inode, &orig_ex);
2702 if (err)
2703 goto fix_extent_len;
2704 /* update the extent length and mark as initialized */
2705 ex->ee_block = orig_ex.ee_block;
2706 ex->ee_len = orig_ex.ee_len;
2707 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2708 ext4_ext_dirty(handle, inode, path + depth);
2709 /* zeroed the full extent */
2710 /* blocks available from map->m_lblk */
2711 return allocated;
2712
2713 } else if (err)
2714 goto fix_extent_len;
2715 /*
2716 * The depth, and hence eh & ex might change
2717 * as part of the insert above.
2718 */
2719 newdepth = ext_depth(inode);
2720 /*
2721 * update the extent length after successful insert of the
2722 * split extent
2723 */
2724 ee_len -= ext4_ext_get_actual_len(ex3);
2725 orig_ex.ee_len = cpu_to_le16(ee_len);
2726 may_zeroout = ee_block + ee_len <= eof_block;
2727
2728 depth = newdepth;
2729 ext4_ext_drop_refs(path);
2730 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2731 if (IS_ERR(path)) {
2732 err = PTR_ERR(path);
2733 goto out; 2982 goto out;
2734 }
2735 eh = path[depth].p_hdr;
2736 ex = path[depth].p_ext;
2737 if (ex2 != &newex)
2738 ex2 = ex;
2739 2983
2740 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2741 if (err) 2985 if (err)
2742 goto out; 2986 goto out;
2743 2987 ext4_ext_mark_initialized(ex);
2744 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2745 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2746 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2747 * to insert an extent in the middle, zeroout directly
2748 * otherwise give the extent a chance to merge to the left
2749 */
2750 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2751 map->m_lblk != ee_block && may_zeroout) {
2752 err = ext4_ext_zeroout(inode, &orig_ex);
2753 if (err)
2754 goto fix_extent_len;
2755 /* update the extent length and mark as initialized */
2756 ex->ee_block = orig_ex.ee_block;
2757 ex->ee_len = orig_ex.ee_len;
2758 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2759 ext4_ext_dirty(handle, inode, path + depth);
2760 /* zero out the first half */
2761 /* blocks available from map->m_lblk */
2762 return allocated;
2763 }
2764 }
2765 /*
2766 * If there was a change of depth as part of the
2767 * insertion of ex3 above, we need to update the length
2768 * of the ex1 extent again here
2769 */
2770 if (ex1 && ex1 != ex) {
2771 ex1 = ex;
2772 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2773 ext4_ext_mark_uninitialized(ex1);
2774 ex2 = &newex;
2775 }
2776 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2777 ex2->ee_block = cpu_to_le32(map->m_lblk);
2778 ext4_ext_store_pblock(ex2, newblock);
2779 ex2->ee_len = cpu_to_le16(allocated);
2780 if (ex2 != ex)
2781 goto insert;
2782 /*
2783 * New (initialized) extent starts from the first block
2784 * in the current extent. i.e., ex2 == ex
2785 * We have to see if it can be merged with the extent
2786 * on the left.
2787 */
2788 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2789 /*
2790 * To merge left, pass "ex2 - 1" to try_to_merge(),
2791 * since it merges towards right _only_.
2792 */
2793 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2794 if (ret) {
2795 err = ext4_ext_correct_indexes(handle, inode, path);
2796 if (err)
2797 goto out;
2798 depth = ext_depth(inode);
2799 ex2--;
2800 }
2801 } 2991 }
2992
2802 /* 2993 /*
2803 * Try to Merge towards right. This might be required 2994 * four cases:
2804 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2805 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zeroout the first half.
2997 * 3. split the extent into two extents, zeroout the second half.
2998 * 4. split the extent into two extents without zeroout.
2806 */ 2999 */
2807 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2808 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2809 if (ret) { 3002
2810 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2811 if (err) 3013 if (err)
2812 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2813 } 3035 }
2814 } 3036 }
2815 /* Mark modified extent as dirty */ 3037
2816 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2817 goto out; 3039 &split_map, split_flag, 0);
2818insert: 3040 if (allocated < 0)
2819 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2820 if (err == -ENOSPC && may_zeroout) { 3042
2821 err = ext4_ext_zeroout(inode, &orig_ex);
2822 if (err)
2823 goto fix_extent_len;
2824 /* update the extent length and mark as initialized */
2825 ex->ee_block = orig_ex.ee_block;
2826 ex->ee_len = orig_ex.ee_len;
2827 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2828 ext4_ext_dirty(handle, inode, path + depth);
2829 /* zero out the first half */
2830 return allocated;
2831 } else if (err)
2832 goto fix_extent_len;
2833out: 3043out:
2834 ext4_ext_show_leaf(inode, path);
2835 return err ? err : allocated; 3044 return err ? err : allocated;
2836
2837fix_extent_len:
2838 ex->ee_block = orig_ex.ee_block;
2839 ex->ee_len = orig_ex.ee_len;
2840 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2841 ext4_ext_mark_uninitialized(ex);
2842 ext4_ext_dirty(handle, inode, path + depth);
2843 return err;
2844} 3045}
2845 3046
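The rewrite collapses the old ex1/ex2/ex3 bookkeeping into the four cases listed in the comment, plus the small-extent fast path. A hedged standalone model of the strategy selection follows; ZERO_LEN mirrors EXT4_EXT_ZERO_LEN, the function name and parameters are hypothetical, and the full-extent zeroout fast path is the separate check already shown in the kernel code above:

#include <stdio.h>

#define ZERO_LEN 7 /* mirrors EXT4_EXT_ZERO_LEN */

/* Hypothetical model of the strategy choice in
 * ext4_ext_convert_to_initialized(): m_off is the write's offset into
 * the extent, allocated = ee_len - m_off, m_len is the write length
 * (m_len < allocated when an uninitialized tail remains). */
static const char *convert_strategy(unsigned allocated, unsigned m_len,
                                    unsigned m_off, int may_zeroout)
{
        if (allocated > m_len && allocated <= ZERO_LEN && may_zeroout)
                return "case 3: zero out the tail, then split head|rest";
        if (allocated > m_len && m_off + m_len < ZERO_LEN && may_zeroout)
                return "case 2: zero out the head, then split head+middle|tail";
        if (m_off > 0 && allocated > m_len)
                return "case 1: split into three extents";
        return "case 4: split into two extents without zeroout";
}

int main(void)
{
        printf("%s\n", convert_strategy(5, 2, 30, 1));  /* short tail   */
        printf("%s\n", convert_strategy(40, 3, 2, 1));  /* short head   */
        printf("%s\n", convert_strategy(40, 3, 20, 1)); /* middle write */
        printf("%s\n", convert_strategy(40, 3, 0, 0));  /* head write   */
        return 0;
}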
2846/* 3047/*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2871 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2872 int flags) 3073 int flags)
2873{ 3074{
2874 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2875 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2876 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2877 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2878 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2879 unsigned int allocated, ee_len, depth;
2880 ext4_fsblk_t newblock;
2881 int err = 0;
2882 int may_zeroout;
2883 3080
2884 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2885 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2889 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2890 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2891 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2892
2893 depth = ext_depth(inode);
2894 ex = path[depth].p_ext;
2895 ee_block = le32_to_cpu(ex->ee_block);
2896 ee_len = ext4_ext_get_actual_len(ex);
2897 allocated = ee_len - (map->m_lblk - ee_block);
2898 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2899
2900 ex2 = ex;
2901 orig_ex.ee_block = ex->ee_block;
2902 orig_ex.ee_len = cpu_to_le16(ee_len);
2903 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2904
2905 /* 3089 /*
2906 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
2907 * zeroout only if extent is fully inside i_size or new_size. 3091 * zeroout only if extent is fully inside i_size or new_size.
2908 */ 3092 */
2909 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2910 3094 ex = path[depth].p_ext;
2911 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2912 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2913 * block where the write begins, and the write completely
2914 * covers the extent, then we don't need to split it.
2915 */
2916 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2917 return allocated;
2918
2919 err = ext4_ext_get_access(handle, inode, path + depth);
2920 if (err)
2921 goto out;
2922 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2923 if (map->m_lblk > ee_block) {
2924 ex1 = ex;
2925 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2926 ext4_ext_mark_uninitialized(ex1);
2927 ex2 = &newex;
2928 }
2929 /*
2930 * for sanity, update the length of the ex2 extent before
2931 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2932 * overlap of blocks.
2933 */
2934 if (!ex1 && allocated > map->m_len)
2935 ex2->ee_len = cpu_to_le16(map->m_len);
2936 /* ex3: to ee_block + ee_len : uninitialised */
2937 if (allocated > map->m_len) {
2938 unsigned int newdepth;
2939 ex3 = &newex;
2940 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2941 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2942 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2943 ext4_ext_mark_uninitialized(ex3);
2944 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2945 if (err == -ENOSPC && may_zeroout) {
2946 err = ext4_ext_zeroout(inode, &orig_ex);
2947 if (err)
2948 goto fix_extent_len;
2949 /* update the extent length and mark as initialized */
2950 ex->ee_block = orig_ex.ee_block;
2951 ex->ee_len = orig_ex.ee_len;
2952 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2953 ext4_ext_dirty(handle, inode, path + depth);
2954 /* zeroed the full extent */
2955 /* blocks available from map->m_lblk */
2956 return allocated;
2957
2958 } else if (err)
2959 goto fix_extent_len;
2960 /*
2961 * The depth, and hence eh & ex might change
2962 * as part of the insert above.
2963 */
2964 newdepth = ext_depth(inode);
2965 /*
2966 * update the extent length after successful insert of the
2967 * split extent
2968 */
2969 ee_len -= ext4_ext_get_actual_len(ex3);
2970 orig_ex.ee_len = cpu_to_le16(ee_len);
2971 may_zeroout = ee_block + ee_len <= eof_block;
2972
2973 depth = newdepth;
2974 ext4_ext_drop_refs(path);
2975 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2976 if (IS_ERR(path)) {
2977 err = PTR_ERR(path);
2978 goto out;
2979 }
2980 ex = path[depth].p_ext;
2981 if (ex2 != &newex)
2982 ex2 = ex;
2983 3097
2984 err = ext4_ext_get_access(handle, inode, path + depth); 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2985 if (err) 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
2986 goto out;
2987 3100
2988 allocated = map->m_len; 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
2989 } 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
2990 /*
2991 * If there was a change of depth as part of the
2992 * insertion of ex3 above, we need to update the length
2993 * of the ex1 extent again here
2994 */
2995 if (ex1 && ex1 != ex) {
2996 ex1 = ex;
2997 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2998 ext4_ext_mark_uninitialized(ex1);
2999 ex2 = &newex;
3000 }
3001 /*
3002 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3003 * using direct I/O, uninitialised still.
3004 */
3005 ex2->ee_block = cpu_to_le32(map->m_lblk);
3006 ext4_ext_store_pblock(ex2, newblock);
3007 ex2->ee_len = cpu_to_le16(allocated);
3008 ext4_ext_mark_uninitialized(ex2);
3009 if (ex2 != ex)
3010 goto insert;
3011 /* Mark modified extent as dirty */
3012 err = ext4_ext_dirty(handle, inode, path + depth);
3013 ext_debug("out here\n");
3014 goto out;
3015insert:
3016 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3017 if (err == -ENOSPC && may_zeroout) {
3018 err = ext4_ext_zeroout(inode, &orig_ex);
3019 if (err)
3020 goto fix_extent_len;
3021 /* update the extent length and mark as initialized */
3022 ex->ee_block = orig_ex.ee_block;
3023 ex->ee_len = orig_ex.ee_len;
3024 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3025 ext4_ext_dirty(handle, inode, path + depth);
3026 /* zero out the first half */
3027 return allocated;
3028 } else if (err)
3029 goto fix_extent_len;
3030out:
3031 ext4_ext_show_leaf(inode, path);
3032 return err ? err : allocated;
3033
3034fix_extent_len:
3035 ex->ee_block = orig_ex.ee_block;
3036 ex->ee_len = orig_ex.ee_len;
3037 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3038 ext4_ext_mark_uninitialized(ex);
3039 ext4_ext_dirty(handle, inode, path + depth);
3040 return err;
3041} 3103}
3104
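After the rewrite, the whole DIO case is expressed as flag composition: both resulting halves stay uninitialized, zeroout is allowed only when the extent sits inside i_size, and PRE_IO defers the conversion. A small standalone model of the flag assembly (plain enum values standing in for the kernel's flags, which are assumptions here):

#include <stdio.h>

enum {
        MAY_ZEROOUT  = 1 << 0, /* stands in for EXT4_EXT_MAY_ZEROOUT  */
        MARK_UNINIT2 = 1 << 1, /* stands in for EXT4_EXT_MARK_UNINIT2 */
};

/* Hypothetical model of the split_flag assembly in
 * ext4_split_unwritten_extents(). */
static int dio_split_flags(unsigned ee_block, unsigned ee_len,
                           unsigned eof_block)
{
        int split_flag = 0;

        if (ee_block + ee_len <= eof_block)
                split_flag |= MAY_ZEROOUT; /* extent fully inside i_size */
        split_flag |= MARK_UNINIT2;        /* keep both halves uninit    */
        return split_flag;
}

int main(void)
{
        printf("%#x\n", dio_split_flags(100, 10, 200)); /* 0x3 */
        printf("%#x\n", dio_split_flags(100, 10, 105)); /* 0x2: extends past EOF */
        return 0;
}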
3042static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3043 struct inode *inode, 3106 struct inode *inode,
3044 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3047 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3048 int depth; 3111 int depth;
3049 int err = 0; 3112 int err = 0;
3050 int ret = 0;
3051 3113
3052 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3053 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3054 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3055 3117
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3119 "block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3056 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3057 if (err) 3124 if (err)
3058 goto out; 3125 goto out;
3059 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3060 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3061 3128
3062 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3063 * We have to see if it can be merged with the extent 3130 * borders are not changed
3064 * on the left.
3065 */
3066 if (ex > EXT_FIRST_EXTENT(eh)) {
3067 /*
3068 * To merge left, pass "ex - 1" to try_to_merge(),
3069 * since it merges towards right _only_.
3070 */
3071 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3072 if (ret) {
3073 err = ext4_ext_correct_indexes(handle, inode, path);
3074 if (err)
3075 goto out;
3076 depth = ext_depth(inode);
3077 ex--;
3078 }
3079 }
3080 /*
3081 * Try to Merge towards right.
3082 */ 3131 */
3083 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3084 if (ret) { 3133
3085 err = ext4_ext_correct_indexes(handle, inode, path);
3086 if (err)
3087 goto out;
3088 depth = ext_depth(inode);
3089 }
3090 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3091 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3092out: 3136out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3302 ext4_fsblk_t newblock = 0; 3346 ext4_fsblk_t newblock = 0;
3303 int err = 0, depth, ret; 3347 int err = 0, depth, ret;
3304 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3305 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3306 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3307 3354
3308 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3309 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3311 3358
3312 /* check in cache */ 3359 /* check in cache */
3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3314 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3315 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3316 /* 3364 /*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3375 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3376 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3377 3425
3378 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3379 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3380 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3381 ee_len, ee_start); 3429 * in the cache
3382 goto out; 3430 */
3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3383 } 3440 }
3384 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3385 inode, map, path, flags, allocated, 3442 /*
3386 newblock); 3443 * Punch out the map length, but only to the
3387 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
3450 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3388 } 3504 }
3389 } 3505 }
3390 3506
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3446 else 3562 else
3447 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3448 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3449 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3450 if (!newblock) 3568 if (!newblock)
3451 goto out2; 3569 goto out2;
@@ -3529,7 +3647,11 @@ out2:
3529 } 3647 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated); 3649 newblock, map->m_len, err ? err : allocated);
3532 return err ? err : allocated; 3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3533} 3655}
3534 3656
3535void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3577 3699
3578 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3579 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3580 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
3581 3703
3582 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3583 * transaction synchronous. 3705 * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3585 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3586 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3587 3709
3588out_stop:
3589 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3590 /* 3713 /*
3591 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3592 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3651 struct ext4_map_blocks map; 3774 struct ext4_map_blocks map;
3652 unsigned int credits, blkbits = inode->i_blkbits; 3775 unsigned int credits, blkbits = inode->i_blkbits;
3653 3776
3654 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3655 if (mode & ~FALLOC_FL_KEEP_SIZE)
3656 return -EOPNOTSUPP;
3657
3658 /* 3777 /*
3659 * currently supporting (pre)allocate mode for extent-based 3778 * currently supporting (pre)allocate mode for extent-based
3660 * files _only_ 3779 * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3663 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3664 3783
3784 /* Return error if mode is not supported */
3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3790
3665 trace_ext4_fallocate_enter(inode, offset, len, mode); 3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3666 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3667 /* 3793 /*
@@ -3691,7 +3817,8 @@ retry:
3691 break; 3817 break;
3692 } 3818 }
3693 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3694 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3695 if (ret <= 0) { 3822 if (ret <= 0) {
3696#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3697 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
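With the mode check relaxed above, hole punching is reachable from userspace through fallocate(2). A minimal usage sketch (error handling trimmed, the file name is arbitrary; on this patch FALLOC_FL_KEEP_SIZE is optional alongside PUNCH_HOLE, but passing it matches the semantics later kernels enforce):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Punch a 1 MiB hole at offset 4 KiB; KEEP_SIZE leaves i_size alone. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4096, 1024 * 1024) < 0)
                perror("fallocate");
        close(fd);
        return 0;
}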
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3822 pgoff_t last_offset; 3949 pgoff_t last_offset;
3823 pgoff_t offset; 3950 pgoff_t offset;
3824 pgoff_t index; 3951 pgoff_t index;
3952 pgoff_t start_index = 0;
3825 struct page **pages = NULL; 3953 struct page **pages = NULL;
3826 struct buffer_head *bh = NULL; 3954 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL; 3955 struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
3848 kfree(pages); 3976 kfree(pages);
3849 return EXT_CONTINUE; 3977 return EXT_CONTINUE;
3850 } 3978 }
3979 index = 0;
3851 3980
3981next_page:
3852 /* Try to find the 1st mapped buffer. */ 3982 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >> 3983 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3854 blksize_bits; 3984 blksize_bits;
3855 if (!page_has_buffers(pages[0])) 3985 if (!page_has_buffers(pages[index]))
3856 goto out; 3986 goto out;
3857 head = page_buffers(pages[0]); 3987 head = page_buffers(pages[index]);
3858 if (!head) 3988 if (!head)
3859 goto out; 3989 goto out;
3860 3990
3991 index++;
3861 bh = head; 3992 bh = head;
3862 do { 3993 do {
3863 if (buffer_mapped(bh)) { 3994 if (end >= newex->ec_block +
3995 newex->ec_len)
3996 /* The buffer is out of
3997 * the request range.
3998 */
3999 goto out;
4000
4001 if (buffer_mapped(bh) &&
4002 end >= newex->ec_block) {
4003 start_index = index - 1;
3864 /* get the 1st mapped buffer. */ 4004 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer; 4005 goto found_mapped_buffer;
3872 } 4006 }
4007
3873 bh = bh->b_this_page; 4008 bh = bh->b_this_page;
3874 end++; 4009 end++;
3875 } while (bh != head); 4010 } while (bh != head);
3876 4011
3877 /* No mapped buffer found. */ 4012 /* No mapped buffer in the range was found in this page;
3878 goto out; 4013 * we need to look up the next page.
4014 */
4015 if (index >= ret) {
4016 /* There is no page left, but we need to limit
4017 * newex->ec_len.
4018 */
4019 newex->ec_len = end - newex->ec_block;
4020 goto out;
4021 }
4022 goto next_page;
3879 } else { 4023 } else {
3880 /* Find contiguous delayed buffers. */ 4024 /* Find contiguous delayed buffers. */
3881 if (ret > 0 && pages[0]->index == last_offset) 4025 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]); 4026 head = page_buffers(pages[0]);
3883 bh = head; 4027 bh = head;
4028 index = 1;
4029 start_index = 0;
3884 } 4030 }
3885 4031
3886found_mapped_buffer: 4032found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
3903 end++; 4049 end++;
3904 } while (bh != head); 4050 } while (bh != head);
3905 4051
3906 for (index = 1; index < ret; index++) { 4052 for (; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) { 4053 if (!page_has_buffers(pages[index])) {
3908 bh = NULL; 4054 bh = NULL;
3909 break; 4055 break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
3913 bh = NULL; 4059 bh = NULL;
3914 break; 4060 break;
3915 } 4061 }
4062
3916 if (pages[index]->index != 4063 if (pages[index]->index !=
3917 pages[0]->index + index) { 4064 pages[start_index]->index + index
4065 - start_index) {
3918 /* Blocks are not contiguous. */ 4066 /* Blocks are not contiguous. */
3919 bh = NULL; 4067 bh = NULL;
3920 break; 4068 break;
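The loop above now measures contiguity from the page that held the first mapped buffer (start_index) instead of pages[0]. A standalone model of the adjusted check, with an array standing in for the pages[]->index values (names are hypothetical):

#include <stdio.h>

/* Hypothetical model: page_index[i] holds pages[i]->index. The run of
 * mapped buffers that began in pages[start_index] stays contiguous only
 * while each later page's index advances in lockstep. */
static int pages_contiguous(const unsigned long *page_index, int nr_pages,
                            int start_index)
{
        for (int i = start_index + 1; i < nr_pages; i++)
                if (page_index[i] !=
                    page_index[start_index] + (i - start_index))
                        return 0;
        return 1;
}

int main(void)
{
        unsigned long run[] = { 8, 9, 10, 11 };
        unsigned long gap[] = { 8, 9, 10, 12 };

        printf("%d\n", pages_contiguous(run, 4, 1)); /* 1: contiguous     */
        printf("%d\n", pages_contiguous(gap, 4, 1)); /* 0: gap at the end */
        return 0;
}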
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
4006 return (error < 0 ? error : 0); 4154 return (error < 0 ? error : 0);
4007} 4155}
4008 4156
4157/*
4158 * ext4_ext_punch_hole
4159 *
4160 * Punches a hole of "length" bytes in a file starting
4161 * at byte "offset"
4162 *
4163 * @inode: The inode of the file to punch a hole in
4164 * @offset: The starting byte offset of the hole
4165 * @length: The length of the hole
4166 *
4167 * Returns the number of blocks removed or negative on error
4168 */
4169int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4170{
4171 struct inode *inode = file->f_path.dentry->d_inode;
4172 struct super_block *sb = inode->i_sb;
4173 struct ext4_ext_cache cache_ex;
4174 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4175 struct address_space *mapping = inode->i_mapping;
4176 struct ext4_map_blocks map;
4177 handle_t *handle;
4178 loff_t first_block_offset, last_block_offset, block_len;
4179 loff_t first_page, last_page, first_page_offset, last_page_offset;
4180 int ret, credits, blocks_released, err = 0;
4181
4182 first_block = (offset + sb->s_blocksize - 1) >>
4183 EXT4_BLOCK_SIZE_BITS(sb);
4184 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4185
4186 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4187 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4188
4189 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4190 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4191
4192 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4193 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4194
4195 /*
4196 * Write out all dirty pages to avoid race conditions,
4197 * then release them.
4198 */
4199 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4200 err = filemap_write_and_wait_range(mapping,
4201 first_page_offset == 0 ? 0 : first_page_offset-1,
4202 last_page_offset);
4203
4204 if (err)
4205 return err;
4206 }
4207
4208 /* Now release the pages */
4209 if (last_page_offset > first_page_offset) {
4210 truncate_inode_pages_range(mapping, first_page_offset,
4211 last_page_offset-1);
4212 }
4213
4214 /* finish any pending end_io work */
4215 ext4_flush_completed_IO(inode);
4216
4217 credits = ext4_writepage_trans_blocks(inode);
4218 handle = ext4_journal_start(inode, credits);
4219 if (IS_ERR(handle))
4220 return PTR_ERR(handle);
4221
4222 err = ext4_orphan_add(handle, inode);
4223 if (err)
4224 goto out;
4225
4226 /*
4227 * Now we need to zero out the non-block-aligned data.
4228 * If the hole lies entirely within a single block, just
4229 * zero out the middle of that block
4230 */
4231 if (first_block > last_block)
4232 ext4_block_zero_page_range(handle, mapping, offset, length);
4233 else {
4234 /* zero out the head of the hole before the first block */
4235 block_len = first_block_offset - offset;
4236 if (block_len > 0)
4237 ext4_block_zero_page_range(handle, mapping,
4238 offset, block_len);
4239
4240 /* zero out the tail of the hole after the last block */
4241 block_len = offset + length - last_block_offset;
4242 if (block_len > 0) {
4243 ext4_block_zero_page_range(handle, mapping,
4244 last_block_offset, block_len);
4245 }
4246 }
4247
4248 /* If there are no blocks to remove, return now */
4249 if (first_block >= last_block)
4250 goto out;
4251
4252 down_write(&EXT4_I(inode)->i_data_sem);
4253 ext4_ext_invalidate_cache(inode);
4254 ext4_discard_preallocations(inode);
4255
4256 /*
4257 * Loop over all the blocks and identify blocks
4258 * that need to be punched out
4259 */
4260 iblock = first_block;
4261 blocks_released = 0;
4262 while (iblock < last_block) {
4263 max_blocks = last_block - iblock;
4264 num_blocks = 1;
4265 memset(&map, 0, sizeof(map));
4266 map.m_lblk = iblock;
4267 map.m_len = max_blocks;
4268 ret = ext4_ext_map_blocks(handle, inode, &map,
4269 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4270
4271 if (ret > 0) {
4272 blocks_released += ret;
4273 num_blocks = ret;
4274 } else if (ret == 0) {
4275 /*
4276 * If map blocks could not find the block,
4277 * then it is in a hole. If the hole was
4278 * not already cached, then map blocks should
4279 * put it in the cache. So we can get the hole
4280 * out of the cache
4281 */
4282 memset(&cache_ex, 0, sizeof(cache_ex));
4283 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4284 !cache_ex.ec_start) {
4285
4286 /* The hole is cached */
4287 num_blocks = cache_ex.ec_block +
4288 cache_ex.ec_len - iblock;
4289
4290 } else {
4291 /* The block could not be identified */
4292 err = -EIO;
4293 break;
4294 }
4295 } else {
4296 /* Map blocks error */
4297 err = ret;
4298 break;
4299 }
4300
4301 if (num_blocks == 0) {
4302 /* This condition should never happen */
4303 ext_debug("Block lookup failed");
4304 err = -EIO;
4305 break;
4306 }
4307
4308 iblock += num_blocks;
4309 }
4310
4311 if (blocks_released > 0) {
4312 ext4_ext_invalidate_cache(inode);
4313 ext4_discard_preallocations(inode);
4314 }
4315
4316 if (IS_SYNC(inode))
4317 ext4_handle_sync(handle);
4318
4319 up_write(&EXT4_I(inode)->i_data_sem);
4320
4321out:
4322 ext4_orphan_del(handle, inode);
4323 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4324 ext4_mark_inode_dirty(handle, inode);
4325 ext4_journal_stop(handle);
4326 return err;
4327}
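The boundary math at the top of the function rounds the hole's start up and its end down to block boundaries, so only whole blocks inside the hole are removed while the partial edges are merely zeroed. A standalone worked example assuming 4096-byte blocks:

#include <stdio.h>

int main(void)
{
        unsigned long long offset = 5000, length = 10000;
        unsigned blkbits = 12;                    /* 4096-byte blocks */
        unsigned long long bs = 1ULL << blkbits;

        /* first whole block inside the hole: round the start up */
        unsigned long long first_block = (offset + bs - 1) >> blkbits;
        /* first block past the hole: round the end down */
        unsigned long long last_block = (offset + length) >> blkbits;

        /* hole [5000, 15000): bytes 5000..8191 and 12288..14999 are only
         * zeroed in place; block 2 (bytes 8192..12287) is actually freed */
        printf("remove blocks [%llu, %llu)\n", first_block, last_block);
        return 0;
}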
4009int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4328int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4010 __u64 start, __u64 len) 4329 __u64 start, __u64 len)
4011{ 4330{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4042 4361
4043 return error; 4362 return error;
4044} 4363}
4045
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
272}; 272};
273 273
274const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
275 .truncate = ext4_truncate,
276 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
277 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
278#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
36 36
37static void dump_completed_IO(struct inode * inode) 37static void dump_completed_IO(struct inode * inode)
38{ 38{
39#ifdef EXT4_DEBUG 39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after; 40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1; 41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags; 42 unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
173 int ret; 173 int ret;
174 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
175 176
176 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
177 178
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
211 } 212 }
212 213
213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
214 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
215 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
216 * When the journal is on a different device than the 217 needs_barrier = true;
217 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
218 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
219 * will take care of issuing the barrier. In 220 if (needs_barrier)
220 * data=journal, all of the data blocks are written to
221 * the journal device.)
222 */
223 if (ext4_should_writeback_data(inode) &&
224 (journal->j_fs_dev != journal->j_dev) &&
225 (journal->j_flags & JBD2_BARRIER))
226 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
227 NULL);
228 ret = jbd2_log_wait_commit(journal, commit_tid);
229 } else if (journal->j_flags & JBD2_BARRIER)
230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out: 222 out:
232 trace_ext4_sync_file_exit(inode, ret); 223 trace_ext4_sync_file_exit(inode, ret);
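The rewritten fsync path asks JBD2 up front whether the commit it is about to wait for will already send a cache flush to the data device; ext4 issues its own flush afterwards only when it will not (for example with an external journal device). A simplified standalone model of that decision, with booleans standing in for the JBD2 state (this is not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model: 'journal_barrier' mirrors JBD2_BARRIER and
 * 'commit_sends_barrier' mirrors jbd2_trans_will_send_data_barrier(). */
static bool fsync_needs_own_flush(bool journal_barrier,
                                  bool commit_sends_barrier)
{
        return journal_barrier && !commit_sends_barrier;
}

int main(void)
{
        /* e.g. external journal device: the commit's barrier goes to the
         * journal disk, so the fs data disk still needs a flush here */
        printf("%d\n", fsync_needs_own_flush(true, false)); /* 1 */
        printf("%d\n", fsync_needs_own_flush(true, true));  /* 0 */
        return 0;
}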
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 goal, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
@@ -1930,7 +1930,7 @@ repeat:
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 if (PageWriteback(page)) 2799 wait_on_page_writeback(page);
2800 wait_on_page_writeback(page);
2801
2802 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2803 2801
2804 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
3513 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3514 3512
3515 if (end > isize) 3513 if (end > isize)
3516 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3517 } 3515 }
3518 } 3516 }
3519 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
3916int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
3917 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3918{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block it will be shortened to the end of the block
3933 * that corresponds to 'from'
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
3919 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3920 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3921 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
3922 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3923 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3924 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
3931 return -EINVAL; 3950 return -EINVAL;
3932 3951
3933 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3934 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
3935 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3936 3963
3937 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4380 4407
4381int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4382{ 4409{
4383 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4384 return 0;
4385 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4386 return 1; 4411 return 1;
4387 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4392} 4417}
4393 4418
4394/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
4444/*
4395 * ext4_truncate() 4445 * ext4_truncate()
4396 * 4446 *
4397 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4617 /* 4667 /*
4618 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4619 */ 4669 */
4620 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4621 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4622 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4623 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5311 5361
5312 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5313 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5314 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5315 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5316 handle_t *handle; 5365 handle_t *handle;
5317 5366
5318 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5346 goto err_out; 5395 goto err_out;
5347 } 5396 }
5348 } 5397 }
5349 /* ext4_truncate will clear the flag */
5350 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5351 ext4_truncate(inode);
5352 } 5398 }
5353 5399
5354 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5355 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5356 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5357 5407
5358 if (!rc) { 5408 if (!rc) {
5359 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5811 goto out_unlock; 5861 goto out_unlock;
5812 } 5862 }
5813 ret = 0; 5863 ret = 0;
5814 if (PageMappedToDisk(page)) 5864
5815 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
5816 5871
5817 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
5818 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
5819 else 5874 else
5820 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
5821 5876
5822 lock_page(page);
5823 /* 5877 /*
5824 * return if we have all the buffers mapped. This avoid 5878 * return if we have all the buffers mapped. This avoid
5825 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5829 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
5830 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5831 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
5832 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
5833 goto out_unlock; 5887 return VM_FAULT_LOCKED;
5834 } 5888 }
5835 } 5889 }
5836 unlock_page(page); 5890 unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5850 if (ret < 0) 5904 if (ret < 0)
5851 goto out_unlock; 5905 goto out_unlock;
5852 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
5853out_unlock: 5917out_unlock:
5854 if (ret) 5918 if (ret)
5855 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
787 struct inode *inode; 787 struct inode *inode;
788 char *data; 788 char *data;
789 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
790 791
791 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
792 793
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
819 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
820 break; 821 break;
821 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
822 err = -EIO; 835 err = -EIO;
823 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
824 if (desc == NULL) 837 if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
871 } 884 }
872 885
873 /* wait for I/O completion */ 886 /* wait for I/O completion */
874 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
875 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
876 890
877 err = -EIO; 891 err = -EIO;
878 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
879 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
880 goto out; 894 goto out;
881 895
882 err = 0; 896 err = 0;
883 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
884 /* init the page */
885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
886 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
887 int group; 899 int group;
888 struct ext4_group_info *grinfo;
889 900
890 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
891 if (group >= ngroups) 902 if (group >= ngroups)
892 break; 903 break;
893 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
894 /* 909 /*
895 * data carry information regarding this 910 * data carry information regarding this
896 * particular group in the format specified 911 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
919 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
920 */ 935 */
921 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
922 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
923 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
924 incore = NULL; 941 incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
948 965
949out: 966out:
950 if (bh) { 967 if (bh) {
951 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
952 brelse(bh[i]); 969 brelse(bh[i]);
953 if (bh != &bhs) 970 if (bh != &bhs)
954 kfree(bh); 971 kfree(bh);
@@ -957,22 +974,21 @@ out:
957} 974}
958 975
959/* 976/*
960 * lock the group_info alloc_sem of all the groups 977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
961 * belonging to the same buddy cache page. This 978 * on the same buddy page doesn't happen while holding the buddy page lock.
962 * makes sure other parallel operation on the buddy 979 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
963 * cache doesn't happen while holding the buddy cache 980 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
964 * lock
965 */ 981 */
966static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, 982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
967 ext4_group_t group) 983 ext4_group_t group, struct ext4_buddy *e4b)
968{ 984{
969 int i; 985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
970 int block, pnum; 986 int block, pnum, poff;
971 int blocks_per_page; 987 int blocks_per_page;
972 int groups_per_page; 988 struct page *page;
973 ext4_group_t ngroups = ext4_get_groups_count(sb); 989
974 ext4_group_t first_group; 990 e4b->bd_buddy_page = NULL;
975 struct ext4_group_info *grp; 991 e4b->bd_bitmap_page = NULL;
976 992
977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
978 /* 994 /*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
982 */ 998 */
983 block = group * 2; 999 block = group * 2;
984 pnum = block / blocks_per_page; 1000 pnum = block / blocks_per_page;
985 first_group = pnum * blocks_per_page / 2; 1001 poff = block % blocks_per_page;
986 1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
987 groups_per_page = blocks_per_page >> 1; 1003 if (!page)
988 if (groups_per_page == 0) 1004 return -EIO;
989 groups_per_page = 1; 1005 BUG_ON(page->mapping != inode->i_mapping);
990 /* read all groups the page covers into the cache */ 1006 e4b->bd_bitmap_page = page;
991 for (i = 0; i < groups_per_page; i++) { 1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
992 1008
993 if ((first_group + i) >= ngroups) 1009 if (blocks_per_page >= 2) {
994 break; 1010 /* buddy and bitmap are on the same page */
995 grp = ext4_get_group_info(sb, first_group + i); 1011 return 0;
996 /* take all groups write allocation
997 * semaphore. This makes sure there is
998 * no block allocation going on in any
999 * of those groups
1000 */
1001 down_write_nested(&grp->alloc_sem, i);
1002 } 1012 }
1003 return i; 1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1004} 1023}
1005 1024
1006static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1007 ext4_group_t group, int locked_group)
1008{ 1026{
1009 int i; 1027 if (e4b->bd_bitmap_page) {
1010 int block, pnum; 1028 unlock_page(e4b->bd_bitmap_page);
1011 int blocks_per_page; 1029 page_cache_release(e4b->bd_bitmap_page);
1012 ext4_group_t first_group; 1030 }
1013 struct ext4_group_info *grp; 1031 if (e4b->bd_buddy_page) {
1014 1032 unlock_page(e4b->bd_buddy_page);
1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1033 page_cache_release(e4b->bd_buddy_page);
1016 /*
1017 * the buddy cache inode stores the block bitmap
1018 * and buddy information in consecutive blocks.
1019 * So for each group we need two blocks.
1020 */
1021 block = group * 2;
1022 pnum = block / blocks_per_page;
1023 first_group = pnum * blocks_per_page / 2;
1024 /* release locks on all the groups */
1025 for (i = 0; i < locked_group; i++) {
1026
1027 grp = ext4_get_group_info(sb, first_group + i);
1028 /* take all groups write allocation
1029 * semaphore. This make sure there is
1030 * no block allocation going on in any
1031 * of that groups
1032 */
1033 up_write(&grp->alloc_sem);
1034 } 1034 }
1035
1036} 1035}
1037 1036
1038/* 1037/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
1044int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1045{ 1044{
1046 1045
1047 int ret = 0;
1048 void *bitmap;
1049 int blocks_per_page;
1050 int block, pnum, poff;
1051 int num_grp_locked = 0;
1052 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
1053 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
1054 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
1055 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
1056 1050
1057 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1059 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
1060 /* 1053 /*
1061 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
 1062 * page which maps to the group from which we are already 1055 * page which maps to the group from which we are already
1063 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
1064 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
 1065 * would have taken the alloc_sem lock. 1058 * would have pinned the buddy page in the page cache.
1066 */ 1059 */
1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1069 /* 1062 /*
1070 * somebody initialized the group 1063 * somebody initialized the group
1071 * return without doing anything 1064 * return without doing anything
1072 */ 1065 */
1073 ret = 0;
1074 goto err; 1066 goto err;
1075 } 1067 }
1076 /* 1068
1077 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
1078 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
1079 * So for each group we need two blocks. 1071 if (ret)
1080 */ 1072 goto err;
1081 block = group * 2; 1073 if (!PageUptodate(page)) {
1082 pnum = block / blocks_per_page;
1083 poff = block % blocks_per_page;
1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1085 if (page) {
1086 BUG_ON(page->mapping != inode->i_mapping);
1087 ret = ext4_mb_init_cache(page, NULL);
1088 if (ret) {
1089 unlock_page(page);
1090 goto err;
1091 }
1092 unlock_page(page);
1093 }
1094 if (page == NULL || !PageUptodate(page)) {
1095 ret = -EIO; 1074 ret = -EIO;
1096 goto err; 1075 goto err;
1097 } 1076 }
1098 mark_page_accessed(page); 1077 mark_page_accessed(page);
1099 bitmap_page = page;
1100 bitmap = page_address(page) + (poff * sb->s_blocksize);
1101 1078
1102 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1103 block++;
1104 pnum = block / blocks_per_page;
1105 poff = block % blocks_per_page;
1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1107 if (page == bitmap_page) {
1108 /* 1080 /*
1109 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1110 * the same page we don't need to force 1082 * the same page we don't need to force
1111 * init the buddy 1083 * init the buddy
1112 */ 1084 */
1113 unlock_page(page); 1085 ret = 0;
1114 } else if (page) { 1086 goto err;
1115 BUG_ON(page->mapping != inode->i_mapping);
1116 ret = ext4_mb_init_cache(page, bitmap);
1117 if (ret) {
1118 unlock_page(page);
1119 goto err;
1120 }
1121 unlock_page(page);
1122 } 1087 }
1123 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1124 ret = -EIO; 1094 ret = -EIO;
1125 goto err; 1095 goto err;
1126 } 1096 }
1127 mark_page_accessed(page); 1097 mark_page_accessed(page);
1128err: 1098err:
1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1130 if (bitmap_page)
1131 page_cache_release(bitmap_page);
1132 if (page)
1133 page_cache_release(page);
1134 return ret; 1100 return ret;
1135} 1101}
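With the alloc_sem gone, it is the page lock taken in ext4_mb_get_buddy_page_lock() that serializes concurrent initializers of the same group; an illustrative interleaving (editorial, not from the patch):

/*
 * T1: locks the bitmap page, sees EXT4_MB_GRP_NEED_INIT set, runs
 *     ext4_mb_init_cache() for the bitmap (and buddy), then unlocks.
 * T2: blocks in find_or_create_page() on that locked page; by the time it
 *     acquires the lock, NEED_INIT is clear and the "somebody initialized
 *     the group" early return above fires.
 */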
1136 1102
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1164 e4b->bd_group = group; 1130 e4b->bd_group = group;
1165 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1166 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1167 e4b->alloc_semp = &grp->alloc_sem;
1168
1169 /* Take the read lock on the group alloc
1170 * sem. This would make sure a parallel
1171 * ext4_mb_init_group happening on other
1172 * groups mapped by the page is blocked
1173 * till we are done with allocation
1174 */
1175repeat_load_buddy:
1176 down_read(e4b->alloc_semp);
1177 1133
1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1179 /* we need to check for group need init flag
1180 * with alloc_semp held so that we can be sure
1181 * that new blocks didn't get added to the group
1182 * when we are loading the buddy cache
1183 */
1184 up_read(e4b->alloc_semp);
1185 /* 1135 /*
1186 * we need full data about the group 1136 * we need full data about the group
1187 * to make a good selection 1137 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
1189 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1190 if (ret) 1140 if (ret)
1191 return ret; 1141 return ret;
1192 goto repeat_load_buddy;
1193 } 1142 }
1194 1143
1195 /* 1144 /*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
1273 return 0; 1222 return 0;
1274 1223
1275err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1276 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1277 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1278 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1279 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1280 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1281 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1282
1283 /* Done with the buddy cache */
1284 up_read(e4b->alloc_semp);
1285 return ret; 1233 return ret;
1286} 1234}
1287 1235
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1291 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1292 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1293 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1294 /* Done with the buddy cache */
1295 if (e4b->alloc_semp)
1296 up_read(e4b->alloc_semp);
1297} 1242}
1298 1243
1299 1244
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1606 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1607 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1608 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1609 /* on allocation we use ac to track the held semaphore */
1610 ac->alloc_semp = e4b->alloc_semp;
1611 e4b->alloc_semp = NULL;
1612 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1613 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1614 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2659 struct super_block *sb = journal->j_private; 2601 struct super_block *sb = journal->j_private;
2660 struct ext4_buddy e4b; 2602 struct ext4_buddy e4b;
2661 struct ext4_group_info *db; 2603 struct ext4_group_info *db;
2662 int err, ret, count = 0, count2 = 0; 2604 int err, count = 0, count2 = 0;
2663 struct ext4_free_data *entry; 2605 struct ext4_free_data *entry;
2664 struct list_head *l, *ltmp; 2606 struct list_head *l, *ltmp;
2665 2607
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2669 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2611 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2670 entry->count, entry->group, entry); 2612 entry->count, entry->group, entry);
2671 2613
2672 if (test_opt(sb, DISCARD)) { 2614 if (test_opt(sb, DISCARD))
2673 ret = ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2674 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2675 if (unlikely(ret == -EOPNOTSUPP)) {
2676 ext4_warning(sb, "discard not supported, "
2677 "disabling");
2678 clear_opt(sb, DISCARD);
2679 }
2680 }
2681 2617
2682 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2683 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4226 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4227 } 4163 }
4228 } 4164 }
4229 if (ac->alloc_semp)
4230 up_read(ac->alloc_semp);
4231 if (pa) { 4165 if (pa) {
4232 /* 4166 /*
4233 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4234 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4235 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4236 * doesn't grow big. We need to release 4170 * doesn't grow big.
4237 * alloc_semp before calling ext4_mb_add_n_trim()
4238 */ 4171 */
4239 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4240 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4303 * there is enough free blocks to do block allocation 4236 * there is enough free blocks to do block allocation
4304 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4305 */ 4238 */
4306 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
 4307 /* let others free the space */ 4242 /* let others free the space */
4308 yield(); 4243 yield();
4309 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 return 0; 4248 return 0;
4314 } 4249 }
4315 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4316 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4317 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4318 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4319 } 4260 }
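In the hunk above, allocations flagged EXT4_MB_USE_ROOT_BLOCKS take dquot_alloc_block_nofail(), which charges the blocks to quota without ever failing (even past the limit), while ordinary allocations keep the retry loop that decrements ar->len one block at a time until the quota charge succeeds.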
4320 inquota = ar->len; 4261 inquota = ar->len;
4321 if (ar->len == 0) { 4262 if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
4704} 4645}
4705 4646
4706/** 4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
 4651 * @block: starting physical block to add to the block group
4652 * @count: number of blocks to free
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
 4731 * We need to update group_info->bb_free and the bitmap
 4732 * with the group lock held. generate_buddy looks at
 4733 * them with the group lock held.
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
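As a usage illustration (editorial sketch; the caller, the credit count of 3, and the o_blocks_count/add variables are assumptions, not shown in this diff), online resize is the kind of caller this helper serves:

static void add_groupblocks_example(struct super_block *sb,
				    ext4_fsblk_t o_blocks_count,
				    unsigned long add)
{
	/* Publish blocks just grafted onto the last group to the allocator. */
	handle_t *handle = ext4_journal_start_sb(sb, 3);	/* assumed credits */

	if (IS_ERR(handle))
		return;
	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
	ext4_journal_stop(handle);
}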
4767
4768/**
4707 * ext4_trim_extent -- function to TRIM one single free extent in the group 4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4708 * @sb: super block for the file system 4770 * @sb: super block for the file system
4709 * @start: starting block of the free extent in the alloc. group 4771 * @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
4715 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4777 * one will allocate those blocks, mark it as used in buddy bitmap. This must
 4716 * be called under the group lock. 4778 * be called under the group lock.
4717 */ 4779 */
4718static int ext4_trim_extent(struct super_block *sb, int start, int count, 4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4719 ext4_group_t group, struct ext4_buddy *e4b) 4781 ext4_group_t group, struct ext4_buddy *e4b)
4720{ 4782{
4721 struct ext4_free_extent ex; 4783 struct ext4_free_extent ex;
4722 int ret = 0;
4723 4784
4724 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4725 4786
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 */ 4794 */
4734 mb_mark_used(e4b, &ex); 4795 mb_mark_used(e4b, &ex);
4735 ext4_unlock_group(sb, group); 4796 ext4_unlock_group(sb, group);
4736 4797 ext4_issue_discard(sb, group, start, count);
4737 ret = ext4_issue_discard(sb, group, start, count);
4738
4739 ext4_lock_group(sb, group); 4798 ext4_lock_group(sb, group);
4740 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4741 return ret;
4742} 4800}
4743 4801
4744/** 4802/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4760 * the group buddy bitmap. This is done until whole group is scanned. 4818 * the group buddy bitmap. This is done until whole group is scanned.
4761 */ 4819 */
4762static ext4_grpblk_t 4820static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4765{ 4824{
4766 void *bitmap; 4825 void *bitmap;
4767 ext4_grpblk_t next, count = 0; 4826 ext4_grpblk_t next, count = 0;
4768 ext4_group_t group; 4827 struct ext4_buddy e4b;
4769 int ret = 0; 4828 int ret;
4770 4829
4771 BUG_ON(e4b == NULL); 4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4772 4837
4773 bitmap = e4b->bd_bitmap;
4774 group = e4b->bd_group;
4775 start = (e4b->bd_info->bb_first_free > start) ?
4776 e4b->bd_info->bb_first_free : start;
4777 ext4_lock_group(sb, group); 4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4778 4841
4779 while (start < max) { 4842 while (start < max) {
4780 start = mb_find_next_zero_bit(bitmap, max, start); 4843 start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4783 next = mb_find_next_bit(bitmap, max, start); 4846 next = mb_find_next_bit(bitmap, max, start);
4784 4847
4785 if ((next - start) >= minblocks) { 4848 if ((next - start) >= minblocks) {
4786 ret = ext4_trim_extent(sb, start, 4849 ext4_trim_extent(sb, start,
4787 next - start, group, e4b); 4850 next - start, group, &e4b);
4788 if (ret < 0)
4789 break;
4790 count += next - start; 4851 count += next - start;
4791 } 4852 }
4792 start = next + 1; 4853 start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4802 ext4_lock_group(sb, group); 4863 ext4_lock_group(sb, group);
4803 } 4864 }
4804 4865
4805 if ((e4b->bd_info->bb_free - count) < minblocks) 4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4806 break; 4867 break;
4807 } 4868 }
4808 ext4_unlock_group(sb, group); 4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4809 4871
4810 ext4_debug("trimmed %d blocks in the group %d\n", 4872 ext4_debug("trimmed %d blocks in the group %d\n",
4811 count, group); 4873 count, group);
4812 4874
4813 if (ret < 0)
4814 count = ret;
4815
4816 return count; 4875 return count;
4817} 4876}
4818 4877
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4830 */ 4889 */
4831int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4832{ 4891{
4833 struct ext4_buddy e4b; 4892 struct ext4_group_info *grp;
4834 ext4_group_t first_group, last_group; 4893 ext4_group_t first_group, last_group;
4835 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4836 ext4_grpblk_t cnt = 0, first_block, last_block; 4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4837 uint64_t start, len, minlen, trimmed; 4896 uint64_t start, len, minlen, trimmed = 0;
4838 ext4_fsblk_t first_data_blk = 4897 ext4_fsblk_t first_data_blk =
4839 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4840 int ret = 0; 4899 int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4842 start = range->start >> sb->s_blocksize_bits; 4901 start = range->start >> sb->s_blocksize_bits;
4843 len = range->len >> sb->s_blocksize_bits; 4902 len = range->len >> sb->s_blocksize_bits;
4844 minlen = range->minlen >> sb->s_blocksize_bits; 4903 minlen = range->minlen >> sb->s_blocksize_bits;
4845 trimmed = 0;
4846 4904
4847 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4848 return -EINVAL; 4906 return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 return -EINVAL; 4921 return -EINVAL;
4864 4922
4865 for (group = first_group; group <= last_group; group++) { 4923 for (group = first_group; group <= last_group; group++) {
4866 ret = ext4_mb_load_buddy(sb, group, &e4b); 4924 grp = ext4_get_group_info(sb, group);
4867 if (ret) { 4925 /* We only do this if the grp has never been initialized */
4868 ext4_error(sb, "Error in loading buddy " 4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4869 "information for %u", group); 4927 ret = ext4_mb_init_group(sb, group);
4870 break; 4928 if (ret)
4929 break;
4871 } 4930 }
4872 4931
4873 /* 4932 /*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4880 last_block = first_block + len; 4939 last_block = first_block + len;
4881 len -= last_block - first_block; 4940 len -= last_block - first_block;
4882 4941
4883 if (e4b.bd_info->bb_free >= minlen) { 4942 if (grp->bb_free >= minlen) {
4884 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4943 cnt = ext4_trim_all_free(sb, group, first_block,
4885 last_block, minlen); 4944 last_block, minlen);
4886 if (cnt < 0) { 4945 if (cnt < 0) {
4887 ret = cnt; 4946 ret = cnt;
4888 ext4_mb_unload_buddy(&e4b);
4889 break; 4947 break;
4890 } 4948 }
4891 } 4949 }
4892 ext4_mb_unload_buddy(&e4b);
4893 trimmed += cnt; 4950 trimmed += cnt;
4894 first_block = 0; 4951 first_block = 0;
4895 } 4952 }
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
 74 __ext4_warning(sb, function, line, "%s", msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
 128 if (retval) {
 129 if ((failed_writes++ % 60) == 0)
 130 ext4_error(sb, "Error writing to MMP block");
 131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
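A worked example of the interval adaptation at the bottom of the loop, assuming EXT4_MMP_CHECK_MULT is 2 and a 5..300 second clamp range (the definitions are not in this diff; the minimum of 5 is consistent with the "more than 20 secs" comment below): if one pass took diff = 12 * HZ jiffies, then mmp_check_interval = max(min(2 * 12, 300), 5) = 24 seconds, so a slow device automatically widens the window a competing mounter must wait out.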
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
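A note on the loop above: redrawing until new_seq <= EXT4_MMP_SEQ_MAX keeps the result uniform over 0..EXT4_MMP_SEQ_MAX (a modulo reduction would bias low values), and, assuming the reserved sequence values such as EXT4_MMP_SEQ_CLEAN and EXT4_MMP_SEQ_FSCK sit above EXT4_MMP_SEQ_MAX as their use below implies, it also guarantees they can never be generated by accident.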
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
 259 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
 260 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * write a new random sequence number.
297 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
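Compressing ext4_multi_mount_protect() above into an editorial summary (error paths, buffer handling, and the zero-wait clean-unmount case elided):

/*
 *	seq = read mmp_seq from the MMP block;
 *	if (seq != EXT4_MMP_SEQ_CLEAN) {
 *		wait wait_time seconds;
 *		if (mmp_seq changed)        another node is live -> fail
 *	}
 *	write a fresh random seq;           stake our claim
 *	wait wait_time seconds;
 *	if (mmp_seq != our random seq)      claim overwritten -> fail
 *	start kmmpd;                        keep bumping seq while mounted
 */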
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1413 frame->at = entries; 1413 frame->at = entries;
1414 frame->bh = bh; 1414 frame->bh = bh;
1415 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1416 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1417 dx_release (frames); 1421 if (!de) {
1418 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
 1425 * with a corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1419 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1420 1432
1421 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1422 brelse(bh); 1434 brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2240 handle_t *handle; 2252 handle_t *handle;
2241 struct inode *inode; 2253 struct inode *inode;
2242 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2243 2256
2244 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2245 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2247 2260
2248 dquot_initialize(dir); 2261 dquot_initialize(dir);
2249 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
 2265 * For non-fast symlinks, we just allocate the inode and put it on
 2266 * the orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
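Worked out under the assumption that quota is disabled (so EXT4_MAXQUOTAS_INIT_BLOCKS(sb) is 0, a value not shown in this diff), the non-fast branch reserves exactly the 4 credits itemized in its comment: block bitmap, group descriptor, superblock, and the inode block.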
2250retry: 2281retry:
2251 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2252 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2253 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2254 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2255 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2256 2285
@@ -2263,21 +2292,44 @@ retry:
2263 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2264 goto out_stop; 2293 goto out_stop;
2265 2294
2266 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2267 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2268 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2269 /* 2298 /*
2270 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with transaction started
2271 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2272 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
 2302 * and thus we deadlock. So we have to stop the transaction now
 2303 * and restart it when the symlink contents are written.
 2304 *
 2305 * To keep the fs consistent in case of a crash, we have to put the inode
 2306 * on the orphan list in the meantime.
2273 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2274 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
 2317 * Now the inode is being linked into the dir (EXT4_DATA_TRANS_BLOCKS
 2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS); the inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2275 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2276 clear_nlink(inode); 2331 clear_nlink(inode);
2277 unlock_new_inode(inode); 2332 goto err_drop_inode;
2278 ext4_mark_inode_dirty(handle, inode);
2279 iput(inode);
2280 goto out_stop;
2281 } 2333 }
2282 } else { 2334 } else {
2283 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
2293 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2294 goto retry; 2346 goto retry;
2295 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2296} 2352}
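The long-symlink path above, restated as an editorial summary (ext4_add_nondir() is the directory-linking step in the unchanged tail of the function):

/*
 *	handle = ext4_journal_start(dir, small alloc credits);
 *	inode = new inode; drop_nlink(inode);      i_nlink 1 -> 0
 *	ext4_orphan_add(handle, inode);            crash => inode reclaimed
 *	ext4_journal_stop(handle);                 no handle across writeback
 *	__page_symlink(inode, symname, l, 1);      may wait for a commit
 *	handle = ext4_journal_start(dir, dir-entry credits);
 *	inc_nlink(inode); ext4_orphan_del(handle, inode);
 *	ext4_add_nondir(handle, dentry, inode);    then out_stop as before
 */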
2297 2353
2298static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
203 for (i = 0; i < io_end->num_io_pages; i++) { 203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 205 struct buffer_head *bh, *head;
206 int partial_write = 0; 206 loff_t offset;
207 loff_t io_end_offset;
207 208
208 head = page_buffers(page); 209 if (error) {
209 if (error)
210 SetPageError(page); 210 SetPageError(page);
211 BUG_ON(!head); 211 set_bit(AS_EIO, &page->mapping->flags);
212 if (head->b_size != PAGE_CACHE_SIZE) { 212 head = page_buffers(page);
213 loff_t offset; 213 BUG_ON(!head);
214 loff_t io_end_offset = io_end->offset + io_end->size; 214
215 io_end_offset = io_end->offset + io_end->size;
215 216
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT; 217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head; 218 bh = head;
218 do { 219 do {
219 if ((offset >= io_end->offset) && 220 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) { 221 (offset+bh->b_size <= io_end_offset))
221 if (error) 222 buffer_io_error(bh);
222 buffer_io_error(bh); 223
223
224 }
225 if (buffer_delay(bh))
226 partial_write = 1;
227 else if (!buffer_mapped(bh))
228 clear_buffer_dirty(bh);
229 else if (buffer_dirty(bh))
230 partial_write = 1;
231 offset += bh->b_size; 224 offset += bh->b_size;
232 bh = bh->b_this_page; 225 bh = bh->b_this_page;
233 } while (bh != head); 226 } while (bh != head);
234 } 227 }
235 228
236 /*
237 * If this is a partial write which happened to make
238 * all buffers uptodate then we can optimize away a
239 * bogus readpage() for the next read(). Here we
240 * 'discover' whether the page went uptodate as a
241 * result of this (potentially partial) write.
242 */
243 if (!partial_write)
244 SetPageUptodate(page);
245
246 put_io_page(io_end->pages[i]); 229 put_io_page(io_end->pages[i]);
247 } 230 }
248 io_end->num_io_pages = 0; 231 io_end->num_io_pages = 0;
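With the per-buffer state juggling removed, the error path now records failure by setting AS_EIO on the mapping, which is what makes a later fsync() or filemap_fdatawait() on the file return -EIO; buffer-level reporting is reduced to buffer_io_error() on the buffers this io_end actually covers.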
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb310af..cc5c157aa11d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,6 +38,7 @@
38#include <linux/ctype.h> 38#include <linux/ctype.h>
39#include <linux/log2.h> 39#include <linux/log2.h>
40#include <linux/crc16.h> 40#include <linux/crc16.h>
41#include <linux/cleancache.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <linux/kthread.h> 44#include <linux/kthread.h>
@@ -75,11 +76,27 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 76static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 78 const char *dev_name, void *data);
79static inline int ext2_feature_set_ok(struct super_block *sb);
80static inline int ext3_feature_set_ok(struct super_block *sb);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly); 81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
79static void ext4_destroy_lazyinit_thread(void); 82static void ext4_destroy_lazyinit_thread(void);
80static void ext4_unregister_li_request(struct super_block *sb); 83static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void); 84static void ext4_clear_request_list(void);
82 85
86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
87static struct file_system_type ext2_fs_type = {
88 .owner = THIS_MODULE,
89 .name = "ext2",
90 .mount = ext4_mount,
91 .kill_sb = kill_block_super,
92 .fs_flags = FS_REQUIRES_DEV,
93};
94#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
95#else
96#define IS_EXT2_SB(sb) (0)
97#endif
98
99
83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 100#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
84static struct file_system_type ext3_fs_type = { 101static struct file_system_type ext3_fs_type = {
85 .owner = THIS_MODULE, 102 .owner = THIS_MODULE,
@@ -806,6 +823,8 @@ static void ext4_put_super(struct super_block *sb)
806 invalidate_bdev(sbi->journal_bdev); 823 invalidate_bdev(sbi->journal_bdev);
807 ext4_blkdev_remove(sbi); 824 ext4_blkdev_remove(sbi);
808 } 825 }
826 if (sbi->s_mmp_tsk)
827 kthread_stop(sbi->s_mmp_tsk);
809 sb->s_fs_info = NULL; 828 sb->s_fs_info = NULL;
810 /* 829 /*
811 * Now that we are completely done shutting down the 830 * Now that we are completely done shutting down the
@@ -1096,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1096 1115
1097 if (!test_opt(sb, INIT_INODE_TABLE)) 1116 if (!test_opt(sb, INIT_INODE_TABLE))
1098 seq_puts(seq, ",noinit_inode_table"); 1117 seq_puts(seq, ",noinit_inode_table");
1099 else if (sbi->s_li_wait_mult) 1118 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1100 seq_printf(seq, ",init_inode_table=%u", 1119 seq_printf(seq, ",init_inode_table=%u",
1101 (unsigned) sbi->s_li_wait_mult); 1120 (unsigned) sbi->s_li_wait_mult);
1102 1121
@@ -1187,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1187 const char *data, size_t len, loff_t off); 1206 const char *data, size_t len, loff_t off);
1188 1207
1189static const struct dquot_operations ext4_quota_operations = { 1208static const struct dquot_operations ext4_quota_operations = {
1190#ifdef CONFIG_QUOTA
1191 .get_reserved_space = ext4_get_reserved_space, 1209 .get_reserved_space = ext4_get_reserved_space,
1192#endif
1193 .write_dquot = ext4_write_dquot, 1210 .write_dquot = ext4_write_dquot,
1194 .acquire_dquot = ext4_acquire_dquot, 1211 .acquire_dquot = ext4_acquire_dquot,
1195 .release_dquot = ext4_release_dquot, 1212 .release_dquot = ext4_release_dquot,
@@ -1900,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1900 ext4_msg(sb, KERN_WARNING, 1917 ext4_msg(sb, KERN_WARNING,
1901 "warning: mounting fs with errors, " 1918 "warning: mounting fs with errors, "
1902 "running e2fsck is recommended"); 1919 "running e2fsck is recommended");
1903 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1920 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1904 le16_to_cpu(es->s_mnt_count) >= 1921 le16_to_cpu(es->s_mnt_count) >=
1905 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1922 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1906 ext4_msg(sb, KERN_WARNING, 1923 ext4_msg(sb, KERN_WARNING,
@@ -1932,6 +1949,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1932 EXT4_INODES_PER_GROUP(sb), 1949 EXT4_INODES_PER_GROUP(sb),
1933 sbi->s_mount_opt, sbi->s_mount_opt2); 1950 sbi->s_mount_opt, sbi->s_mount_opt2);
1934 1951
1952 cleancache_init_fs(sb);
1935 return res; 1953 return res;
1936} 1954}
1937 1955
@@ -2425,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2425 EXT4_SB(sb)->s_sectors_written_start) >> 1))); 2443 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2426} 2444}
2427 2445
2446static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf)
2448{
2449 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2450}
2451
2452static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2453 struct ext4_sb_info *sbi, char *buf)
2454{
2455 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2456}
2457
2428static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2458static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2429 struct ext4_sb_info *sbi, 2459 struct ext4_sb_info *sbi,
2430 const char *buf, size_t count) 2460 const char *buf, size_t count)
@@ -2482,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2482EXT4_RO_ATTR(delayed_allocation_blocks); 2512EXT4_RO_ATTR(delayed_allocation_blocks);
2483EXT4_RO_ATTR(session_write_kbytes); 2513EXT4_RO_ATTR(session_write_kbytes);
2484EXT4_RO_ATTR(lifetime_write_kbytes); 2514EXT4_RO_ATTR(lifetime_write_kbytes);
2515EXT4_RO_ATTR(extent_cache_hits);
2516EXT4_RO_ATTR(extent_cache_misses);
2485EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2517EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2486 inode_readahead_blks_store, s_inode_readahead_blks); 2518 inode_readahead_blks_store, s_inode_readahead_blks);
2487EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2519EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2529,8 @@ static struct attribute *ext4_attrs[] = {
2497 ATTR_LIST(delayed_allocation_blocks), 2529 ATTR_LIST(delayed_allocation_blocks),
2498 ATTR_LIST(session_write_kbytes), 2530 ATTR_LIST(session_write_kbytes),
2499 ATTR_LIST(lifetime_write_kbytes), 2531 ATTR_LIST(lifetime_write_kbytes),
2532 ATTR_LIST(extent_cache_hits),
2533 ATTR_LIST(extent_cache_misses),
2500 ATTR_LIST(inode_readahead_blks), 2534 ATTR_LIST(inode_readahead_blks),
2501 ATTR_LIST(inode_goal), 2535 ATTR_LIST(inode_goal),
2502 ATTR_LIST(mb_stats), 2536 ATTR_LIST(mb_stats),
@@ -2659,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg)
2659 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2693 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2660} 2694}
2661 2695
2662static void ext4_lazyinode_timeout(unsigned long data)
2663{
2664 struct task_struct *p = (struct task_struct *)data;
2665 wake_up_process(p);
2666}
2667
2668/* Find next suitable group and run ext4_init_inode_table */ 2696/* Find next suitable group and run ext4_init_inode_table */
2669static int ext4_run_li_request(struct ext4_li_request *elr) 2697static int ext4_run_li_request(struct ext4_li_request *elr)
2670{ 2698{
@@ -2696,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2696 ret = ext4_init_inode_table(sb, group, 2724 ret = ext4_init_inode_table(sb, group,
2697 elr->lr_timeout ? 0 : 1); 2725 elr->lr_timeout ? 0 : 1);
2698 if (elr->lr_timeout == 0) { 2726 if (elr->lr_timeout == 0) {
2699 timeout = jiffies - timeout; 2727 timeout = (jiffies - timeout) *
2700 if (elr->lr_sbi->s_li_wait_mult) 2728 elr->lr_sbi->s_li_wait_mult;
2701 timeout *= elr->lr_sbi->s_li_wait_mult;
2702 else
2703 timeout *= 20;
2704 elr->lr_timeout = timeout; 2729 elr->lr_timeout = timeout;
2705 } 2730 }
2706 elr->lr_next_sched = jiffies + elr->lr_timeout; 2731 elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2712,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2712 2737
2713/* 2738/*
2714 * Remove lr_request from the list_request and free the 2739 * Remove lr_request from the list_request and free the
2715 * request tructure. Should be called with li_list_mtx held 2740 * request structure. Should be called with li_list_mtx held
2716 */ 2741 */
2717static void ext4_remove_li_request(struct ext4_li_request *elr) 2742static void ext4_remove_li_request(struct ext4_li_request *elr)
2718{ 2743{
@@ -2730,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
2730 2755
2731static void ext4_unregister_li_request(struct super_block *sb) 2756static void ext4_unregister_li_request(struct super_block *sb)
2732{ 2757{
2733 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; 2758 mutex_lock(&ext4_li_mtx);
2734 2759 if (!ext4_li_info) {
2735 if (!ext4_li_info) 2760 mutex_unlock(&ext4_li_mtx);
2736 return; 2761 return;
2762 }
2737 2763
2738 mutex_lock(&ext4_li_info->li_list_mtx); 2764 mutex_lock(&ext4_li_info->li_list_mtx);
2739 ext4_remove_li_request(elr); 2765 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2740 mutex_unlock(&ext4_li_info->li_list_mtx); 2766 mutex_unlock(&ext4_li_info->li_list_mtx);
2767 mutex_unlock(&ext4_li_mtx);
2741} 2768}
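Taking ext4_li_mtx around the whole unregister closes a race: the lazyinit thread frees ext4_li_info under that same mutex when its request list drains (see the exit path below), so checking the pointer and taking li_list_mtx without it could dereference freed memory.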
2742 2769
2743static struct task_struct *ext4_lazyinit_task; 2770static struct task_struct *ext4_lazyinit_task;
@@ -2756,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg)
2756 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; 2783 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2757 struct list_head *pos, *n; 2784 struct list_head *pos, *n;
2758 struct ext4_li_request *elr; 2785 struct ext4_li_request *elr;
2759 unsigned long next_wakeup; 2786 unsigned long next_wakeup, cur;
2760 DEFINE_WAIT(wait);
2761 2787
2762 BUG_ON(NULL == eli); 2788 BUG_ON(NULL == eli);
2763 2789
2764 eli->li_timer.data = (unsigned long)current;
2765 eli->li_timer.function = ext4_lazyinode_timeout;
2766
2767 eli->li_task = current;
2768 wake_up(&eli->li_wait_task);
2769
2770cont_thread: 2790cont_thread:
2771 while (true) { 2791 while (true) {
2772 next_wakeup = MAX_JIFFY_OFFSET; 2792 next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2817,15 @@ cont_thread:
2797 if (freezing(current)) 2817 if (freezing(current))
2798 refrigerator(); 2818 refrigerator();
2799 2819
2800 if ((time_after_eq(jiffies, next_wakeup)) || 2820 cur = jiffies;
2821 if ((time_after_eq(cur, next_wakeup)) ||
2801 (MAX_JIFFY_OFFSET == next_wakeup)) { 2822 (MAX_JIFFY_OFFSET == next_wakeup)) {
2802 cond_resched(); 2823 cond_resched();
2803 continue; 2824 continue;
2804 } 2825 }
2805 2826
2806 eli->li_timer.expires = next_wakeup; 2827 schedule_timeout_interruptible(next_wakeup - cur);
2807 add_timer(&eli->li_timer); 2828
2808 prepare_to_wait(&eli->li_wait_daemon, &wait,
2809 TASK_INTERRUPTIBLE);
2810 if (time_before(jiffies, next_wakeup))
2811 schedule();
2812 finish_wait(&eli->li_wait_daemon, &wait);
2813 if (kthread_should_stop()) { 2829 if (kthread_should_stop()) {
2814 ext4_clear_request_list(); 2830 ext4_clear_request_list();
2815 goto exit_thread; 2831 goto exit_thread;
@@ -2833,12 +2849,7 @@ exit_thread:
2833 goto cont_thread; 2849 goto cont_thread;
2834 } 2850 }
2835 mutex_unlock(&eli->li_list_mtx); 2851 mutex_unlock(&eli->li_list_mtx);
2836 del_timer_sync(&ext4_li_info->li_timer);
2837 eli->li_task = NULL;
2838 wake_up(&eli->li_wait_task);
2839
2840 kfree(ext4_li_info); 2852 kfree(ext4_li_info);
2841 ext4_lazyinit_task = NULL;
2842 ext4_li_info = NULL; 2853 ext4_li_info = NULL;
2843 mutex_unlock(&ext4_li_mtx); 2854 mutex_unlock(&ext4_li_mtx);
2844 2855
@@ -2866,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void)
2866 if (IS_ERR(ext4_lazyinit_task)) { 2877 if (IS_ERR(ext4_lazyinit_task)) {
2867 int err = PTR_ERR(ext4_lazyinit_task); 2878 int err = PTR_ERR(ext4_lazyinit_task);
2868 ext4_clear_request_list(); 2879 ext4_clear_request_list();
2869 del_timer_sync(&ext4_li_info->li_timer);
2870 kfree(ext4_li_info); 2880 kfree(ext4_li_info);
2871 ext4_li_info = NULL; 2881 ext4_li_info = NULL;
2872 printk(KERN_CRIT "EXT4: error %d creating inode table " 2882 printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void)
2875 return err; 2885 return err;
2876 } 2886 }
2877 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; 2887 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2878
2879 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2880 return 0; 2888 return 0;
2881} 2889}
2882 2890
@@ -2911,13 +2919,9 @@ static int ext4_li_info_new(void)
2911 if (!eli) 2919 if (!eli)
2912 return -ENOMEM; 2920 return -ENOMEM;
2913 2921
2914 eli->li_task = NULL;
2915 INIT_LIST_HEAD(&eli->li_request_list); 2922 INIT_LIST_HEAD(&eli->li_request_list);
2916 mutex_init(&eli->li_list_mtx); 2923 mutex_init(&eli->li_list_mtx);
2917 2924
2918 init_waitqueue_head(&eli->li_wait_daemon);
2919 init_waitqueue_head(&eli->li_wait_task);
2920 init_timer(&eli->li_timer);
2921 eli->li_state |= EXT4_LAZYINIT_QUIT; 2925 eli->li_state |= EXT4_LAZYINIT_QUIT;
2922 2926
2923 ext4_li_info = eli; 2927 ext4_li_info = eli;
@@ -2960,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb,
2960 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2964 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2961 int ret = 0; 2965 int ret = 0;
2962 2966
2963 if (sbi->s_li_request != NULL) 2967 if (sbi->s_li_request != NULL) {
2968 /*
2969 * Reset timeout so it can be computed again, because
2970 * s_li_wait_mult might have changed.
2971 */
2972 sbi->s_li_request->lr_timeout = 0;
2964 return 0; 2973 return 0;
2974 }
2965 2975
2966 if (first_not_zeroed == ngroups || 2976 if (first_not_zeroed == ngroups ||
2967 (sb->s_flags & MS_RDONLY) || 2977 (sb->s_flags & MS_RDONLY) ||
2968 !test_opt(sb, INIT_INODE_TABLE)) { 2978 !test_opt(sb, INIT_INODE_TABLE))
2969 sbi->s_li_request = NULL;
2970 return 0; 2979 return 0;
2971 }
2972
2973 if (first_not_zeroed == ngroups) {
2974 sbi->s_li_request = NULL;
2975 return 0;
2976 }
2977 2980
2978 elr = ext4_li_request_new(sb, first_not_zeroed); 2981 elr = ext4_li_request_new(sb, first_not_zeroed);
2979 if (!elr) 2982 if (!elr)
@@ -3166,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3166 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3169 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3167 set_opt(sb, DELALLOC); 3170 set_opt(sb, DELALLOC);
3168 3171
3172 /*
 3173 * Set the default s_li_wait_mult for lazyinit, in case no mount
 3174 * option is specified.
3175 */
3176 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3177
3169 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3178 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3170 &journal_devnum, &journal_ioprio, NULL, 0)) { 3179 &journal_devnum, &journal_ioprio, NULL, 0)) {
3171 ext4_msg(sb, KERN_WARNING, 3180 ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3187 "feature flags set on rev 0 fs, " 3196 "feature flags set on rev 0 fs, "
3188 "running e2fsck is recommended"); 3197 "running e2fsck is recommended");
3189 3198
3199 if (IS_EXT2_SB(sb)) {
3200 if (ext2_feature_set_ok(sb))
3201 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3202 "using the ext4 subsystem");
3203 else {
3204 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3205 "to feature incompatibilities");
3206 goto failed_mount;
3207 }
3208 }
3209
3210 if (IS_EXT3_SB(sb)) {
3211 if (ext3_feature_set_ok(sb))
3212 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3213 "using the ext4 subsystem");
3214 else {
3215 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3216 "to feature incompatibilities");
3217 goto failed_mount;
3218 }
3219 }
3220
3190 /* 3221 /*
3191 * Check feature flags regardless of the revision level, since we 3222 * Check feature flags regardless of the revision level, since we
3192 * previously didn't change the revision level when setting the flags, 3223 * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3459 EXT4_HAS_INCOMPAT_FEATURE(sb, 3490 EXT4_HAS_INCOMPAT_FEATURE(sb,
3460 EXT4_FEATURE_INCOMPAT_RECOVER)); 3491 EXT4_FEATURE_INCOMPAT_RECOVER));
3461 3492
3493 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3494 !(sb->s_flags & MS_RDONLY))
3495 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3496 goto failed_mount3;
3497
3462 /* 3498 /*
3463 * The first inode we look at is the journal inode. Don't try 3499 * The first inode we look at is the journal inode. Don't try
3464 * root first: it may be modified in the journal! 3500 * root first: it may be modified in the journal!
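ext4_multi_mount_protect(), called in the hunk above, is there to stop two hosts from mounting the same shared block device read-write at once. As a rough model of the idea only: read a heartbeat sequence, wait one interval, read it again, and refuse the mount if somebody else bumped it in the meantime. The file layout and names below are made up; the real MMP block and its update protocol are more involved.

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int read_seq(FILE *f, uint64_t *seq)
{
	rewind(f);
	return fread(seq, sizeof(*seq), 1, f) == 1 ? 0 : -1;
}

/* returns 0 if the device looks unused, -1 if busy or unreadable */
static int mmp_protect(const char *path, unsigned interval_sec)
{
	uint64_t before, after;
	FILE *f = fopen(path, "r+b");

	if (!f)
		return -1;
	if (read_seq(f, &before))
		goto fail;
	sleep(interval_sec);    /* give another "node" a chance to bump it */
	if (read_seq(f, &after))
		goto fail;
	fclose(f);
	if (after != before) {
		fprintf(stderr, "device appears to be in use elsewhere\n");
		return -1;      /* caller fails the mount */
	}
	return 0;
fail:
	fclose(f);
	return -1;
}

int main(void)
{
	return mmp_protect("heartbeat.bin", 1) ? 1 : 0;
}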
@@ -3474,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3474 goto failed_mount_wq; 3510 goto failed_mount_wq;
3475 } else { 3511 } else {
3476 clear_opt(sb, DATA_FLAGS); 3512 clear_opt(sb, DATA_FLAGS);
3477 set_opt(sb, WRITEBACK_DATA);
3478 sbi->s_journal = NULL; 3513 sbi->s_journal = NULL;
3479 needs_recovery = 0; 3514 needs_recovery = 0;
3480 goto no_journal; 3515 goto no_journal;
@@ -3707,6 +3742,8 @@ failed_mount3:
3707 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3742 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3708 percpu_counter_destroy(&sbi->s_dirs_counter); 3743 percpu_counter_destroy(&sbi->s_dirs_counter);
3709 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3744 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3745 if (sbi->s_mmp_tsk)
3746 kthread_stop(sbi->s_mmp_tsk);
3710failed_mount2: 3747failed_mount2:
3711 for (i = 0; i < db_count; i++) 3748 for (i = 0; i < db_count; i++)
3712 brelse(sbi->s_group_desc[i]); 3749 brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4242 int enable_quota = 0; 4279 int enable_quota = 0;
4243 ext4_group_t g; 4280 ext4_group_t g;
4244 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4281 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4245 int err; 4282 int err = 0;
4246#ifdef CONFIG_QUOTA 4283#ifdef CONFIG_QUOTA
4247 int i; 4284 int i;
4248#endif 4285#endif
@@ -4368,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4368 goto restore_opts; 4405 goto restore_opts;
4369 if (!ext4_setup_super(sb, es, 0)) 4406 if (!ext4_setup_super(sb, es, 0))
4370 sb->s_flags &= ~MS_RDONLY; 4407 sb->s_flags &= ~MS_RDONLY;
4408 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4409 EXT4_FEATURE_INCOMPAT_MMP))
4410 if (ext4_multi_mount_protect(sb,
4411 le64_to_cpu(es->s_mmp_block))) {
4412 err = -EROFS;
4413 goto restore_opts;
4414 }
4371 enable_quota = 1; 4415 enable_quota = 1;
4372 } 4416 }
4373 } 4417 }
@@ -4432,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4432 struct ext4_sb_info *sbi = EXT4_SB(sb); 4476 struct ext4_sb_info *sbi = EXT4_SB(sb);
4433 struct ext4_super_block *es = sbi->s_es; 4477 struct ext4_super_block *es = sbi->s_es;
4434 u64 fsid; 4478 u64 fsid;
4479 s64 bfree;
4435 4480
4436 if (test_opt(sb, MINIX_DF)) { 4481 if (test_opt(sb, MINIX_DF)) {
4437 sbi->s_overhead_last = 0; 4482 sbi->s_overhead_last = 0;
@@ -4475,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4475 buf->f_type = EXT4_SUPER_MAGIC; 4520 buf->f_type = EXT4_SUPER_MAGIC;
4476 buf->f_bsize = sb->s_blocksize; 4521 buf->f_bsize = sb->s_blocksize;
4477 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4522 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4478 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4523 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4479 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4524 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
 4525 /* prevent underflow in case little free space is available */
4526 buf->f_bfree = max_t(s64, bfree, 0);
4480 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4527 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4481 if (buf->f_bfree < ext4_r_blocks_count(es)) 4528 if (buf->f_bfree < ext4_r_blocks_count(es))
4482 buf->f_bavail = 0; 4529 buf->f_bavail = 0;
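The statfs hunk above exists because the two percpu counter sums are each only approximate, so free-minus-dirty can transiently come out negative and would wrap to a huge value in the unsigned f_bfree field. The clamp in miniature (values invented):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t free_blocks  = 100;     /* percpu sum of free blocks  */
	int64_t dirty_blocks = 130;     /* percpu sum of dirty blocks */
	int64_t bfree = free_blocks - dirty_blocks;

	/* the equivalent of max_t(s64, bfree, 0) */
	uint64_t f_bfree = bfree > 0 ? (uint64_t)bfree : 0;

	printf("f_bfree = %" PRIu64 "\n", f_bfree);  /* 0, not a huge number */
	return 0;
}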
@@ -4652,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
4652 if (test_opt(sb, DELALLOC)) 4699 if (test_opt(sb, DELALLOC))
4653 sync_filesystem(sb); 4700 sync_filesystem(sb);
4654 4701
4702 if (!inode)
4703 goto out;
4704
4655 /* Update modification times of quota files when userspace can 4705 /* Update modification times of quota files when userspace can
4656 * start looking at them */ 4706 * start looking at them */
4657 handle = ext4_journal_start(inode, 1); 4707 handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4772} 4822}
4773 4823
4774#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4824#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4775static struct file_system_type ext2_fs_type = {
4776 .owner = THIS_MODULE,
4777 .name = "ext2",
4778 .mount = ext4_mount,
4779 .kill_sb = kill_block_super,
4780 .fs_flags = FS_REQUIRES_DEV,
4781};
4782
4783static inline void register_as_ext2(void) 4825static inline void register_as_ext2(void)
4784{ 4826{
4785 int err = register_filesystem(&ext2_fs_type); 4827 int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4834,22 @@ static inline void unregister_as_ext2(void)
4792{ 4834{
4793 unregister_filesystem(&ext2_fs_type); 4835 unregister_filesystem(&ext2_fs_type);
4794} 4836}
4837
4838static inline int ext2_feature_set_ok(struct super_block *sb)
4839{
4840 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
4841 return 0;
4842 if (sb->s_flags & MS_RDONLY)
4843 return 1;
4844 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
4845 return 0;
4846 return 1;
4847}
4795MODULE_ALIAS("ext2"); 4848MODULE_ALIAS("ext2");
4796#else 4849#else
4797static inline void register_as_ext2(void) { } 4850static inline void register_as_ext2(void) { }
4798static inline void unregister_as_ext2(void) { } 4851static inline void unregister_as_ext2(void) { }
4852static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
4799#endif 4853#endif
4800 4854
4801#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4855#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4865,24 @@ static inline void unregister_as_ext3(void)
4811{ 4865{
4812 unregister_filesystem(&ext3_fs_type); 4866 unregister_filesystem(&ext3_fs_type);
4813} 4867}
4868
4869static inline int ext3_feature_set_ok(struct super_block *sb)
4870{
4871 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
4872 return 0;
4873 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
4874 return 0;
4875 if (sb->s_flags & MS_RDONLY)
4876 return 1;
4877 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
4878 return 0;
4879 return 1;
4880}
4814MODULE_ALIAS("ext3"); 4881MODULE_ALIAS("ext3");
4815#else 4882#else
4816static inline void register_as_ext3(void) { } 4883static inline void register_as_ext3(void) { }
4817static inline void unregister_as_ext3(void) { } 4884static inline void unregister_as_ext3(void) { }
4885static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
4818#endif 4886#endif
4819 4887
4820static struct file_system_type ext4_fs_type = { 4888static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4966,8 @@ static int __init ext4_init_fs(void)
4898 err = init_inodecache(); 4966 err = init_inodecache();
4899 if (err) 4967 if (err)
4900 goto out1; 4968 goto out1;
4901 register_as_ext2();
4902 register_as_ext3(); 4969 register_as_ext3();
4970 register_as_ext2();
4903 err = register_filesystem(&ext4_fs_type); 4971 err = register_filesystem(&ext4_fs_type);
4904 if (err) 4972 if (err)
4905 goto out; 4973 goto out;
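The ext2/ext3 compatibility gates added above (ext2_feature_set_ok()/ext3_feature_set_ok()) reduce to two bitmask rules: an unknown incompat bit always blocks the mount, while an unknown ro-compat bit only blocks a read-write mount. A minimal standalone sketch of that logic, with invented flag values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SUPP_INCOMPAT	0x0007u  /* bits this "driver" understands */
#define SUPP_RO_COMPAT	0x0003u

static bool feature_set_ok(uint32_t incompat, uint32_t ro_compat, bool rdonly)
{
	if (incompat & ~SUPP_INCOMPAT)
		return false;   /* can't even read it safely */
	if (rdonly)
		return true;    /* unknown ro-compat bits are fine read-only */
	return !(ro_compat & ~SUPP_RO_COMPAT);
}

int main(void)
{
	printf("%d\n", feature_set_ok(0x1, 0x8, false));  /* 0: rw blocked */
	printf("%d\n", feature_set_ok(0x1, 0x8, true));   /* 1: ro allowed */
	return 0;
}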
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 goal, NULL, &error); 824 NULL, &error);
825 if (error) 825 if (error)
826 goto cleanup; 826 goto cleanup;
827 827
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3b222dafd15b..be15437c272e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326 struct fat_slot_info sinfo; 326 struct fat_slot_info sinfo;
327 int err; 327 int err;
328 328
329 dentry_unhash(dentry);
330
329 lock_super(sb); 331 lock_super(sb);
330 /* 332 /*
331 * Check whether the directory is not in use, then check 333 * Check whether the directory is not in use, then check
@@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
457 old_inode = old_dentry->d_inode; 459 old_inode = old_dentry->d_inode;
458 new_inode = new_dentry->d_inode; 460 new_inode = new_dentry->d_inode;
459 461
462 if (new_inode && S_ISDIR(new_inode->i_mode))
463 dentry_unhash(new_dentry);
464
460 err = fat_scan(old_dir, old_name, &old_sinfo); 465 err = fat_scan(old_dir, old_name, &old_sinfo);
461 if (err) { 466 if (err) {
462 err = -EIO; 467 err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 20b4ea53fdc4..c61a6789f36c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
824 struct fat_slot_info sinfo; 824 struct fat_slot_info sinfo;
825 int err; 825 int err;
826 826
827 dentry_unhash(dentry);
828
827 lock_super(sb); 829 lock_super(sb);
828 830
829 err = fat_dir_empty(inode); 831 err = fat_dir_empty(inode);
@@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
931 int err, is_dir, update_dotdot, corrupt = 0; 933 int err, is_dir, update_dotdot, corrupt = 0;
932 struct super_block *sb = old_dir->i_sb; 934 struct super_block *sb = old_dir->i_sb;
933 935
936 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
937 dentry_unhash(new_dentry);
938
934 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 939 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
935 old_inode = old_dentry->d_inode; 940 old_inode = old_dentry->d_inode;
936 new_inode = new_dentry->d_inode; 941 new_inode = new_dentry->d_inode;
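The fat/vfat hunks above show the pattern that repeats through the rest of this merge (fuse, hfs, hfsplus, hostfs, hpfs, jffs2, jfs, logfs, minix): the VFS no longer unhashes the victim dentry for every rmdir/rename, so each filesystem that still depends on that behaviour now calls dentry_unhash() itself, first thing in its own method, and only for directory victims in rename. Modelled in plain userspace C (all names hypothetical):

#include <stdio.h>

struct dentry { const char *name; int is_dir; };

static void dentry_unhash_model(struct dentry *d)
{
	printf("unhash %s\n", d->name);  /* stands in for the dcache work */
}

/* a backend that still wants the victim unhashed up front */
static int fs_rmdir(struct dentry *victim)
{
	dentry_unhash_model(victim);     /* moved here from the VFS */
	/* ... check emptiness, delete the on-disk entry ... */
	return 0;
}

static int fs_rename(struct dentry *old, struct dentry *new)
{
	if (new && new->is_dir)          /* only directory victims */
		dentry_unhash_model(new);
	printf("rename %s over %s\n", old->name, new ? new->name : "-");
	return 0;
}

int main(void)
{
	struct dentry a = { "a", 1 }, b = { "b", 1 };
	fs_rmdir(&a);
	fs_rename(&a, &b);
	return 0;
}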
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b32eb29a4e6f..0d0e3faddcfa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
667 if (IS_ERR(req)) 667 if (IS_ERR(req))
668 return PTR_ERR(req); 668 return PTR_ERR(req);
669 669
670 dentry_unhash(entry);
671
670 req->in.h.opcode = FUSE_RMDIR; 672 req->in.h.opcode = FUSE_RMDIR;
671 req->in.h.nodeid = get_node_id(dir); 673 req->in.h.nodeid = get_node_id(dir);
672 req->in.numargs = 1; 674 req->in.numargs = 1;
@@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
691 struct fuse_rename_in inarg; 693 struct fuse_rename_in inarg;
692 struct fuse_conn *fc = get_fuse_conn(olddir); 694 struct fuse_conn *fc = get_fuse_conn(olddir);
693 struct fuse_req *req = fuse_get_req(fc); 695 struct fuse_req *req = fuse_get_req(fc);
696
697 if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
698 dentry_unhash(newent);
699
694 if (IS_ERR(req)) 700 if (IS_ERR(req))
695 return PTR_ERR(req); 701 return PTR_ERR(req);
696 702
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be92..1cb70cdba2c1 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
253 struct inode *inode = dentry->d_inode; 253 struct inode *inode = dentry->d_inode;
254 int res; 254 int res;
255 255
256 if (S_ISDIR(inode->i_mode))
257 dentry_unhash(dentry);
258
256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2) 259 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
257 return -ENOTEMPTY; 260 return -ENOTEMPTY;
258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 261 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
283 286
284 /* Unlink destination if it already exists */ 287 /* Unlink destination if it already exists */
285 if (new_dentry->d_inode) { 288 if (new_dentry->d_inode) {
289 if (S_ISDIR(new_dentry->d_inode->i_mode))
290 dentry_unhash(new_dentry);
291
286 res = hfs_remove(new_dir, new_dentry); 292 res = hfs_remove(new_dir, new_dentry);
287 if (res) 293 if (res)
288 return res; 294 return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4df5059c25da..b28835091dd0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
370 struct inode *inode = dentry->d_inode; 370 struct inode *inode = dentry->d_inode;
371 int res; 371 int res;
372 372
373 dentry_unhash(dentry);
374
373 if (inode->i_size != 2) 375 if (inode->i_size != 2)
374 return -ENOTEMPTY; 376 return -ENOTEMPTY;
375 377
@@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
467 469
468 /* Unlink destination if it already exists */ 470 /* Unlink destination if it already exists */
469 if (new_dentry->d_inode) { 471 if (new_dentry->d_inode) {
470 if (S_ISDIR(new_dentry->d_inode->i_mode)) 472 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
473 dentry_unhash(new_dentry);
471 res = hfsplus_rmdir(new_dir, new_dentry); 474 res = hfsplus_rmdir(new_dir, new_dentry);
472 else 475 } else {
473 res = hfsplus_unlink(new_dir, new_dentry); 476 res = hfsplus_unlink(new_dir, new_dentry);
477 }
474 if (res) 478 if (res)
475 return res; 479 return res;
476 } 480 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834ed28..e6816b9e6903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
683 char *file; 683 char *file;
684 int err; 684 int err;
685 685
686 dentry_unhash(dentry);
687
686 if ((file = dentry_name(dentry)) == NULL) 688 if ((file = dentry_name(dentry)) == NULL)
687 return -ENOMEM; 689 return -ENOMEM;
688 err = do_rmdir(file); 690 err = do_rmdir(file);
@@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
736 char *from_name, *to_name; 738 char *from_name, *to_name;
737 int err; 739 int err;
738 740
741 if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
742 dentry_unhash(to);
743
739 if ((from_name = dentry_name(from)) == NULL) 744 if ((from_name = dentry_name(from)) == NULL)
740 return -ENOMEM; 745 return -ENOMEM;
741 if ((to_name = dentry_name(to)) == NULL) { 746 if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1f05839c27a7..ff0ce21c0867 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -395,7 +395,6 @@ again:
395 395
396 dentry_unhash(dentry); 396 dentry_unhash(dentry);
397 if (!d_unhashed(dentry)) { 397 if (!d_unhashed(dentry)) {
398 dput(dentry);
399 hpfs_unlock(dir->i_sb); 398 hpfs_unlock(dir->i_sb);
400 return -ENOSPC; 399 return -ENOSPC;
401 } 400 }
@@ -403,7 +402,6 @@ again:
403 !S_ISREG(inode->i_mode) || 402 !S_ISREG(inode->i_mode) ||
404 get_write_access(inode)) { 403 get_write_access(inode)) {
405 d_rehash(dentry); 404 d_rehash(dentry);
406 dput(dentry);
407 } else { 405 } else {
408 struct iattr newattrs; 406 struct iattr newattrs;
409 /*printk("HPFS: truncating file before delete.\n");*/ 407 /*printk("HPFS: truncating file before delete.\n");*/
@@ -411,7 +409,6 @@ again:
411 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 409 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
412 err = notify_change(dentry, &newattrs); 410 err = notify_change(dentry, &newattrs);
413 put_write_access(inode); 411 put_write_access(inode);
414 dput(dentry);
415 if (!err) 412 if (!err)
416 goto again; 413 goto again;
417 } 414 }
@@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
442 int err; 439 int err;
443 int r; 440 int r;
444 441
442 dentry_unhash(dentry);
443
445 hpfs_adjust_length(name, &len); 444 hpfs_adjust_length(name, &len);
446 hpfs_lock(dir->i_sb); 445 hpfs_lock(dir->i_sb);
447 err = -ENOENT; 446 err = -ENOENT;
@@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
535 struct buffer_head *bh; 534 struct buffer_head *bh;
536 struct fnode *fnode; 535 struct fnode *fnode;
537 int err; 536 int err;
537
538 if (new_inode && S_ISDIR(new_inode->i_mode))
539 dentry_unhash(new_dentry);
540
538 if ((err = hpfs_chk_name(new_name, &new_len))) return err; 541 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
539 err = 0; 542 err = 0;
540 hpfs_adjust_length(old_name, &old_len); 543 hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e7a035781b7d..7aafeb8fa300 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void)
921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
922} 922}
923 923
924struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 924struct file *hugetlb_file_setup(const char *name, size_t size,
925 vm_flags_t acctflag,
925 struct user_struct **user, int creat_flags) 926 struct user_struct **user, int creat_flags)
926{ 927{
927 int error = -ENOMEM; 928 int error = -ENOMEM;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29148a81c783..7f21cf3aaf92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
219 ret = err; 219 ret = err;
220 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
221 J_ASSERT(jinode->i_transaction == commit_transaction); 221 J_ASSERT(jinode->i_transaction == commit_transaction);
222 commit_transaction->t_flushed_data_blocks = 1;
223 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 222 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
224 smp_mb__after_clear_bit(); 223 smp_mb__after_clear_bit();
225 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 224 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ start_journal_io:
672 err = 0; 671 err = 0;
673 } 672 }
674 673
674 write_lock(&journal->j_state_lock);
675 J_ASSERT(commit_transaction->t_state == T_COMMIT);
676 commit_transaction->t_state = T_COMMIT_DFLUSH;
677 write_unlock(&journal->j_state_lock);
675 /* 678 /*
676 * If the journal is not located on the file system device, 679 * If the journal is not located on the file system device,
677 * then we must flush the file system device before we issue 680 * then we must flush the file system device before we issue
678 * the commit record 681 * the commit record
679 */ 682 */
680 if (commit_transaction->t_flushed_data_blocks && 683 if (commit_transaction->t_need_data_flush &&
681 (journal->j_fs_dev != journal->j_dev) && 684 (journal->j_fs_dev != journal->j_dev) &&
682 (journal->j_flags & JBD2_BARRIER)) 685 (journal->j_flags & JBD2_BARRIER))
683 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 686 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ wait_for_iobuf:
754 required. */ 757 required. */
755 JBUFFER_TRACE(jh, "file as BJ_Forget"); 758 JBUFFER_TRACE(jh, "file as BJ_Forget");
756 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 759 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
757 /* Wake up any transactions which were waiting for this 760 /*
758 IO to complete */ 761 * Wake up any transactions which were waiting for this IO to
762 * complete. The barrier must be here so that changes by
763 * jbd2_journal_file_buffer() take effect before wake_up_bit()
764 * does the waitqueue check.
765 */
766 smp_mb();
759 wake_up_bit(&bh->b_state, BH_Unshadow); 767 wake_up_bit(&bh->b_state, BH_Unshadow);
760 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 768 JBUFFER_TRACE(jh, "brelse shadowed buffer");
761 __brelse(bh); 769 __brelse(bh);
@@ -794,6 +802,10 @@ wait_for_iobuf:
794 jbd2_journal_abort(journal, err); 802 jbd2_journal_abort(journal, err);
795 803
796 jbd_debug(3, "JBD: commit phase 5\n"); 804 jbd_debug(3, "JBD: commit phase 5\n");
805 write_lock(&journal->j_state_lock);
806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
807 commit_transaction->t_state = T_COMMIT_JFLUSH;
808 write_unlock(&journal->j_state_lock);
797 809
798 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 810 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
799 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 811 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ restart_loop:
949 961
950 jbd_debug(3, "JBD: commit phase 7\n"); 962 jbd_debug(3, "JBD: commit phase 7\n");
951 963
952 J_ASSERT(commit_transaction->t_state == T_COMMIT); 964 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
953 965
954 commit_transaction->t_start = jiffies; 966 commit_transaction->t_start = jiffies;
955 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 967 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
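The T_COMMIT_DFLUSH/T_COMMIT_JFLUSH split above works because jbd2 transaction states are numerically ordered, so "has phase X happened yet" is a single comparison under the state lock. A toy version of that ordering (the enum values and helper are illustrative, not the jbd2 definitions):

#include <stdbool.h>
#include <stdio.h>

enum commit_state {
	T_COMMIT,          /* metadata being written             */
	T_COMMIT_DFLUSH,   /* data flush submitted               */
	T_COMMIT_JFLUSH,   /* journal flush / commit record next */
	T_FINISHED,
};

static bool will_send_data_flush(enum commit_state s)
{
	/* once we are at or past DFLUSH, the flush is already on its way */
	return s < T_COMMIT_DFLUSH;
}

int main(void)
{
	printf("%d\n", will_send_data_flush(T_COMMIT));         /* 1 */
	printf("%d\n", will_send_data_flush(T_COMMIT_JFLUSH));  /* 0 */
	return 0;
}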
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db1c395..9a7826990304 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
479int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
480{ 480{
481 /* 481 /*
482 * Are we already doing a recent enough commit? 482 * The only transaction we can possibly wait upon is the
483 * currently running transaction (if it exists). Otherwise,
484 * the target tid must be an old one.
483 */ 485 */
484 if (!tid_geq(journal->j_commit_request, target)) { 486 if (journal->j_running_transaction &&
487 journal->j_running_transaction->t_tid == target) {
485 /* 488 /*
486 * We want a new commit: OK, mark the request and wakeup the 489 * We want a new commit: OK, mark the request and wakeup the
487 * commit thread. We do _not_ do the commit ourselves. 490 * commit thread. We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
493 journal->j_commit_sequence); 496 journal->j_commit_sequence);
494 wake_up(&journal->j_wait_commit); 497 wake_up(&journal->j_wait_commit);
495 return 1; 498 return 1;
496 } 499 } else if (!tid_geq(journal->j_commit_request, target))
500 /* This should never happen, but if it does, preserve
501 the evidence before kjournald goes into a loop and
502 increments j_commit_sequence beyond all recognition. */
503 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
504 journal->j_commit_request,
505 journal->j_commit_sequence,
506 target, journal->j_running_transaction ?
507 journal->j_running_transaction->t_tid : 0);
497 return 0; 508 return 0;
498} 509}
499 510
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
577} 588}
578 589
579/* 590/*
 591 * Return 1 if a given transaction has not yet sent a barrier request
 592 * connected with a transaction commit. If 0 is returned, the
 593 * transaction may or may not have sent the barrier. Used to avoid
 594 * sending the barrier twice in common cases.
595 */
596int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
597{
598 int ret = 0;
599 transaction_t *commit_trans;
600
601 if (!(journal->j_flags & JBD2_BARRIER))
602 return 0;
603 read_lock(&journal->j_state_lock);
604 /* Transaction already committed? */
605 if (tid_geq(journal->j_commit_sequence, tid))
606 goto out;
607 commit_trans = journal->j_committing_transaction;
608 if (!commit_trans || commit_trans->t_tid != tid) {
609 ret = 1;
610 goto out;
611 }
612 /*
613 * Transaction is being committed and we already proceeded to
614 * submitting a flush to fs partition?
615 */
616 if (journal->j_fs_dev != journal->j_dev) {
617 if (!commit_trans->t_need_data_flush ||
618 commit_trans->t_state >= T_COMMIT_DFLUSH)
619 goto out;
620 } else {
621 if (commit_trans->t_state >= T_COMMIT_JFLUSH)
622 goto out;
623 }
624 ret = 1;
625out:
626 read_unlock(&journal->j_state_lock);
627 return ret;
628}
629EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
630
631/*
580 * Wait for a specified commit to complete. 632 * Wait for a specified commit to complete.
581 * The caller may not hold the journal lock. 633 * The caller may not hold the journal lock.
582 */ 634 */
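And this is the consumer side of the same split: jbd2_trans_will_send_data_barrier() lets a caller such as an fsync path skip issuing its own cache flush when the journal commit is going to send one anyway. A sketch of how such a caller might look; the predicate below is a stand-in, not the real jbd2 API surface.

#include <stdbool.h>
#include <stdio.h>

static bool journal_will_send_barrier(unsigned tid)
{
	return tid % 2 == 0;    /* fake answer for the demo */
}

static void fsync_path(unsigned commit_tid)
{
	/* wait for the commit here, then: */
	if (!journal_will_send_barrier(commit_tid))
		printf("tid %u: issuing our own FLUSH\n", commit_tid);
	else
		printf("tid %u: journal flushes for us, skipping\n",
		       commit_tid);
}

int main(void)
{
	fsync_path(7);
	fsync_path(8);
	return 0;
}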
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a23711..3eec82d32fd4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
82 */ 82 */
83 83
84/* 84/*
85 * Update transiaction's maximum wait time, if debugging is enabled. 85 * Update transaction's maximum wait time, if debugging is enabled.
86 * 86 *
87 * In order for t_max_wait to be reliable, it must be protected by a 87 * In order for t_max_wait to be reliable, it must be protected by a
88 * lock. But doing so will mean that start_this_handle() can not be 88 * lock. But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
91 * means that maximum wait time reported by the jbd2_run_stats 91 * means that maximum wait time reported by the jbd2_run_stats
92 * tracepoint will always be zero. 92 * tracepoint will always be zero.
93 */ 93 */
94static inline void update_t_max_wait(transaction_t *transaction) 94static inline void update_t_max_wait(transaction_t *transaction,
95 unsigned long ts)
95{ 96{
96#ifdef CONFIG_JBD2_DEBUG 97#ifdef CONFIG_JBD2_DEBUG
97 unsigned long ts = jiffies;
98
99 if (jbd2_journal_enable_debug && 98 if (jbd2_journal_enable_debug &&
100 time_after(transaction->t_start, ts)) { 99 time_after(transaction->t_start, ts)) {
101 ts = jbd2_time_diff(ts, transaction->t_start); 100 ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
121 tid_t tid; 120 tid_t tid;
122 int needed, need_to_start; 121 int needed, need_to_start;
123 int nblocks = handle->h_buffer_credits; 122 int nblocks = handle->h_buffer_credits;
123 unsigned long ts = jiffies;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
271 /* OK, account for the buffers that this operation expects to 271 /* OK, account for the buffers that this operation expects to
272 * use and add the handle to the running transaction. 272 * use and add the handle to the running transaction.
273 */ 273 */
274 update_t_max_wait(transaction); 274 update_t_max_wait(transaction, ts);
275 handle->h_transaction = transaction; 275 handle->h_transaction = transaction;
276 atomic_inc(&transaction->t_updates); 276 atomic_inc(&transaction->t_updates);
277 atomic_inc(&transaction->t_handle_count); 277 atomic_inc(&transaction->t_handle_count);
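The update_t_max_wait() change above is subtler than it looks: jiffies must be sampled once, before start_this_handle() can block and retry, otherwise each trip around the slow path shrinks the measured wait. The same rule in userspace terms:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static long now_ms(void)
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC, &t);
	return t.tv_sec * 1000L + t.tv_nsec / 1000000L;
}

int main(void)
{
	long ts = now_ms();             /* read once, up front */

	for (int attempt = 0; attempt < 3; attempt++)
		usleep(10000);          /* stand-in for blocked retries */

	/* correct: includes all retries; re-reading the clock here
	 * inside the helper would miss them */
	printf("waited ~%ld ms\n", now_ms() - ts);
	return 0;
}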
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
316 * This function is visible to journal users (like ext3fs), so is not 316 * This function is visible to journal users (like ext3fs), so is not
317 * called with the journal already locked. 317 * called with the journal already locked.
318 * 318 *
319 * Return a pointer to a newly allocated handle, or NULL on failure 319 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
320 * on failure.
320 */ 321 */
321handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) 322handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
322{ 323{
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
921 */ 922 */
922 JBUFFER_TRACE(jh, "cancelling revoke"); 923 JBUFFER_TRACE(jh, "cancelling revoke");
923 jbd2_journal_cancel_revoke(handle, jh); 924 jbd2_journal_cancel_revoke(handle, jh);
924 jbd2_journal_put_journal_head(jh);
925out: 925out:
926 jbd2_journal_put_journal_head(jh);
926 return err; 927 return err;
927} 928}
928 929
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2147 jinode->i_next_transaction == transaction) 2148 jinode->i_next_transaction == transaction)
2148 goto done; 2149 goto done;
2149 2150
2151 /*
2152 * We only ever set this variable to 1 so the test is safe. Since
2153 * t_need_data_flush is likely to be set, we do the test to save some
 2154 * cacheline bouncing.
2155 */
2156 if (!transaction->t_need_data_flush)
2157 transaction->t_need_data_flush = 1;
2150 /* On some different transaction's list - should be 2158 /* On some different transaction's list - should be
2151 * the committing one */ 2159 * the committing one */
2152 if (jinode->i_transaction) { 2160 if (jinode->i_transaction) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 82faddd1f321..05f73328b28b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
609 int ret; 609 int ret;
610 uint32_t now = get_seconds(); 610 uint32_t now = get_seconds();
611 611
612 dentry_unhash(dentry);
613
612 for (fd = f->dents ; fd; fd = fd->next) { 614 for (fd = f->dents ; fd; fd = fd->next) {
613 if (fd->ino) 615 if (fd->ino)
614 return -ENOTEMPTY; 616 return -ENOTEMPTY;
@@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
784 uint8_t type; 786 uint8_t type;
785 uint32_t now; 787 uint32_t now;
786 788
789 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
790 dentry_unhash(new_dentry);
791
787 /* The VFS will check for us and prevent trying to rename a 792 /* The VFS will check for us and prevent trying to rename a
788 * file over a directory and vice versa, but if it's a directory, 793 * file over a directory and vice versa, but if it's a directory,
789 * the VFS can't check whether the victim is empty. The filesystem 794 * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b511e89..865df16a6cf3 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
360 360
361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
362 362
363 dentry_unhash(dentry);
364
363 /* Init inode for quota operations. */ 365 /* Init inode for quota operations. */
364 dquot_initialize(dip); 366 dquot_initialize(dip);
365 dquot_initialize(ip); 367 dquot_initialize(ip);
@@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1095 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1096 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1097 1099
1100 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1101 dentry_unhash(new_dentry);
1102
1098 dquot_initialize(old_dir); 1103 dquot_initialize(old_dir);
1099 dquot_initialize(new_dir); 1104 dquot_initialize(new_dir);
1100 1105
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1663f8..f34c9cde9e94 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{ 273{
274 struct inode *inode = dentry->d_inode; 274 struct inode *inode = dentry->d_inode;
275 275
276 dentry_unhash(dentry);
277
276 if (!logfs_empty_dir(inode)) 278 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY; 279 return -ENOTEMPTY;
278 280
@@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
622 loff_t pos; 624 loff_t pos;
623 int err; 625 int err;
624 626
627 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
628 dentry_unhash(new_dentry);
629
625 /* 1. locate source dd */ 630 /* 1. locate source dd */
626 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); 631 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
627 if (err) 632 if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..f60aed8db9c4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
168 struct inode * inode = dentry->d_inode; 168 struct inode * inode = dentry->d_inode;
169 int err = -ENOTEMPTY; 169 int err = -ENOTEMPTY;
170 170
171 dentry_unhash(dentry);
172
171 if (minix_empty_dir(inode)) { 173 if (minix_empty_dir(inode)) {
172 err = minix_unlink(dir, dentry); 174 err = minix_unlink(dir, dentry);
173 if (!err) { 175 if (!err) {
@@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
190 struct minix_dir_entry * old_de; 192 struct minix_dir_entry * old_de;
191 int err = -ENOENT; 193 int err = -ENOENT;
192 194
195 if (new_inode && S_ISDIR(new_inode->i_mode))
196 dentry_unhash(new_dentry);
197
193 old_de = minix_find_entry(old_dentry, &old_page); 198 old_de = minix_find_entry(old_dentry, &old_page);
194 if (!old_de) 199 if (!old_de)
195 goto out; 200 goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0afc809e46e0..fdfae9fa98cd 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -27,6 +27,7 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/cleancache.h>
30 31
31/* 32/*
32 * I/O completion handler for multipage BIOs. 33 * I/O completion handler for multipage BIOs.
@@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
271 SetPageMappedToDisk(page); 272 SetPageMappedToDisk(page);
272 } 273 }
273 274
275 if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
276 cleancache_get_page(page) == 0) {
277 SetPageUptodate(page);
278 goto confused;
279 }
280
274 /* 281 /*
275 * This page will go to BIO. Do we need to send this BIO off first? 282 * This page will go to BIO. Do we need to send this BIO off first?
276 */ 283 */
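The mpage hunk above wires do_mpage_readpage() into cleancache: on a read miss, first ask the transcendent store for the page and only build a bio when it says no. A toy model of that control flow, assuming a simple in-memory table in place of the cleancache backend:

#include <stdio.h>
#include <string.h>

#define PAGE 16

static char store[4][PAGE];     /* toy cleancache, indexed by page no */
static int  present[4];

static int cleancache_get(int pgno, char *buf)
{
	if (pgno < 0 || pgno >= 4 || !present[pgno])
		return -1;
	memcpy(buf, store[pgno], PAGE);
	return 0;
}

static void read_page(int pgno, char *buf)
{
	if (cleancache_get(pgno, buf) == 0) {
		printf("page %d: satisfied from cleancache\n", pgno);
		return;                  /* SetPageUptodate, no bio */
	}
	memset(buf, 'D', PAGE);          /* pretend disk read */
	printf("page %d: read from disk\n", pgno);
}

int main(void)
{
	char buf[PAGE];
	memcpy(store[1], "cached contents", PAGE);
	present[1] = 1;
	read_page(0, buf);
	read_page(1, buf);
	return 0;
}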
diff --git a/fs/namei.c b/fs/namei.c
index 6ff858c049c0..2358b326b221 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -391,79 +391,28 @@ void path_put(struct path *path)
391} 391}
392EXPORT_SYMBOL(path_put); 392EXPORT_SYMBOL(path_put);
393 393
394/** 394/*
395 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
396 * @nd: nameidata pathwalk data to drop
397 * Returns: 0 on success, -ECHILD on failure
398 *
399 * Path walking has 2 modes, rcu-walk and ref-walk (see 395 * Path walking has 2 modes, rcu-walk and ref-walk (see
400 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt 396 * Documentation/filesystems/path-lookup.txt). In situations when we can't
401 * to drop out of rcu-walk mode and take normal reference counts on dentries 397 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
402 * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take 398 * normal reference counts on dentries and vfsmounts to transition to rcu-walk
403 * refcounts at the last known good point before rcu-walk got stuck, so 399 * mode. Refcounts are grabbed at the last known good point before rcu-walk
404 * ref-walk may continue from there. If this is not successful (eg. a seqcount 400 * got stuck, so ref-walk may continue from there. If this is not successful
405 * has changed), then failure is returned and path walk restarts from the 401 * (eg. a seqcount has changed), then failure is returned and it's up to caller
406 * beginning in ref-walk mode. 402 * to restart the path walk from the beginning in ref-walk mode.
407 *
408 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
409 * ref-walk. Must be called from rcu-walk context.
410 */ 403 */
411static int nameidata_drop_rcu(struct nameidata *nd)
412{
413 struct fs_struct *fs = current->fs;
414 struct dentry *dentry = nd->path.dentry;
415 int want_root = 0;
416
417 BUG_ON(!(nd->flags & LOOKUP_RCU));
418 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
419 want_root = 1;
420 spin_lock(&fs->lock);
421 if (nd->root.mnt != fs->root.mnt ||
422 nd->root.dentry != fs->root.dentry)
423 goto err_root;
424 }
425 spin_lock(&dentry->d_lock);
426 if (!__d_rcu_to_refcount(dentry, nd->seq))
427 goto err;
428 BUG_ON(nd->inode != dentry->d_inode);
429 spin_unlock(&dentry->d_lock);
430 if (want_root) {
431 path_get(&nd->root);
432 spin_unlock(&fs->lock);
433 }
434 mntget(nd->path.mnt);
435
436 rcu_read_unlock();
437 br_read_unlock(vfsmount_lock);
438 nd->flags &= ~LOOKUP_RCU;
439 return 0;
440err:
441 spin_unlock(&dentry->d_lock);
442err_root:
443 if (want_root)
444 spin_unlock(&fs->lock);
445 return -ECHILD;
446}
447
448/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
449static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
450{
451 if (nd->flags & LOOKUP_RCU)
452 return nameidata_drop_rcu(nd);
453 return 0;
454}
455 404
456/** 405/**
457 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk 406 * unlazy_walk - try to switch to ref-walk mode.
458 * @nd: nameidata pathwalk data to drop 407 * @nd: nameidata pathwalk data
459 * @dentry: dentry to drop 408 * @dentry: child of nd->path.dentry or NULL
460 * Returns: 0 on success, -ECHILD on failure 409 * Returns: 0 on success, -ECHILD on failure
461 * 410 *
462 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, 411 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
463 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on 412 * for ref-walk mode. @dentry must be a path found by a do_lookup call on
464 * @nd. Must be called from rcu-walk context. 413 * @nd or NULL. Must be called from rcu-walk context.
465 */ 414 */
466static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) 415static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
467{ 416{
468 struct fs_struct *fs = current->fs; 417 struct fs_struct *fs = current->fs;
469 struct dentry *parent = nd->path.dentry; 418 struct dentry *parent = nd->path.dentry;
@@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
478 goto err_root; 427 goto err_root;
479 } 428 }
480 spin_lock(&parent->d_lock); 429 spin_lock(&parent->d_lock);
481 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 430 if (!dentry) {
482 if (!__d_rcu_to_refcount(dentry, nd->seq)) 431 if (!__d_rcu_to_refcount(parent, nd->seq))
483 goto err; 432 goto err_parent;
484 /* 433 BUG_ON(nd->inode != parent->d_inode);
485 * If the sequence check on the child dentry passed, then the child has 434 } else {
486 * not been removed from its parent. This means the parent dentry must 435 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
487 * be valid and able to take a reference at this point. 436 if (!__d_rcu_to_refcount(dentry, nd->seq))
488 */ 437 goto err_child;
489 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); 438 /*
490 BUG_ON(!parent->d_count); 439 * If the sequence check on the child dentry passed, then
491 parent->d_count++; 440 * the child has not been removed from its parent. This
492 spin_unlock(&dentry->d_lock); 441 * means the parent dentry must be valid and able to take
442 * a reference at this point.
443 */
444 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
445 BUG_ON(!parent->d_count);
446 parent->d_count++;
447 spin_unlock(&dentry->d_lock);
448 }
493 spin_unlock(&parent->d_lock); 449 spin_unlock(&parent->d_lock);
494 if (want_root) { 450 if (want_root) {
495 path_get(&nd->root); 451 path_get(&nd->root);
@@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
501 br_read_unlock(vfsmount_lock); 457 br_read_unlock(vfsmount_lock);
502 nd->flags &= ~LOOKUP_RCU; 458 nd->flags &= ~LOOKUP_RCU;
503 return 0; 459 return 0;
504err: 460
461err_child:
505 spin_unlock(&dentry->d_lock); 462 spin_unlock(&dentry->d_lock);
463err_parent:
506 spin_unlock(&parent->d_lock); 464 spin_unlock(&parent->d_lock);
507err_root: 465err_root:
508 if (want_root) 466 if (want_root)
@@ -510,59 +468,6 @@ err_root:
510 return -ECHILD; 468 return -ECHILD;
511} 469}
512 470
513/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
514static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
515{
516 if (nd->flags & LOOKUP_RCU) {
517 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
518 nd->flags &= ~LOOKUP_RCU;
519 if (!(nd->flags & LOOKUP_ROOT))
520 nd->root.mnt = NULL;
521 rcu_read_unlock();
522 br_read_unlock(vfsmount_lock);
523 return -ECHILD;
524 }
525 }
526 return 0;
527}
528
529/**
530 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
531 * @nd: nameidata pathwalk data to drop
532 * Returns: 0 on success, -ECHILD on failure
533 *
534 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
535 * nd->path should be the final element of the lookup, so nd->root is discarded.
536 * Must be called from rcu-walk context.
537 */
538static int nameidata_drop_rcu_last(struct nameidata *nd)
539{
540 struct dentry *dentry = nd->path.dentry;
541
542 BUG_ON(!(nd->flags & LOOKUP_RCU));
543 nd->flags &= ~LOOKUP_RCU;
544 if (!(nd->flags & LOOKUP_ROOT))
545 nd->root.mnt = NULL;
546 spin_lock(&dentry->d_lock);
547 if (!__d_rcu_to_refcount(dentry, nd->seq))
548 goto err_unlock;
549 BUG_ON(nd->inode != dentry->d_inode);
550 spin_unlock(&dentry->d_lock);
551
552 mntget(nd->path.mnt);
553
554 rcu_read_unlock();
555 br_read_unlock(vfsmount_lock);
556
557 return 0;
558
559err_unlock:
560 spin_unlock(&dentry->d_lock);
561 rcu_read_unlock();
562 br_read_unlock(vfsmount_lock);
563 return -ECHILD;
564}
565
566/** 471/**
567 * release_open_intent - free up open intent resources 472 * release_open_intent - free up open intent resources
568 * @nd: pointer to nameidata 473 * @nd: pointer to nameidata
@@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
606 return dentry; 511 return dentry;
607} 512}
608 513
609/* 514/**
610 * handle_reval_path - force revalidation of a dentry 515 * complete_walk - successful completion of path walk
 611 * 516 * @nd: pointer to nameidata
612 * In some situations the path walking code will trust dentries without
613 * revalidating them. This causes problems for filesystems that depend on
614 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
615 * (which indicates that it's possible for the dentry to go stale), force
616 * a d_revalidate call before proceeding.
617 * 517 *
618 * Returns 0 if the revalidation was successful. If the revalidation fails, 518 * If we had been in RCU mode, drop out of it and legitimize nd->path.
619 * either return the error returned by d_revalidate or -ESTALE if the 519 * Revalidate the final result, unless we'd already done that during
620 * revalidation it just returned 0. If d_revalidate returns 0, we attempt to 520 * the path walk or the filesystem doesn't ask for it. Return 0 on
621 * invalidate the dentry. It's up to the caller to handle putting references 521 * success, -error on failure. In case of failure caller does not
622 * to the path if necessary. 522 * need to drop nd->path.
623 */ 523 */
624static inline int handle_reval_path(struct nameidata *nd) 524static int complete_walk(struct nameidata *nd)
625{ 525{
626 struct dentry *dentry = nd->path.dentry; 526 struct dentry *dentry = nd->path.dentry;
627 int status; 527 int status;
628 528
529 if (nd->flags & LOOKUP_RCU) {
530 nd->flags &= ~LOOKUP_RCU;
531 if (!(nd->flags & LOOKUP_ROOT))
532 nd->root.mnt = NULL;
533 spin_lock(&dentry->d_lock);
534 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
535 spin_unlock(&dentry->d_lock);
536 rcu_read_unlock();
537 br_read_unlock(vfsmount_lock);
538 return -ECHILD;
539 }
540 BUG_ON(nd->inode != dentry->d_inode);
541 spin_unlock(&dentry->d_lock);
542 mntget(nd->path.mnt);
543 rcu_read_unlock();
544 br_read_unlock(vfsmount_lock);
545 }
546
629 if (likely(!(nd->flags & LOOKUP_JUMPED))) 547 if (likely(!(nd->flags & LOOKUP_JUMPED)))
630 return 0; 548 return 0;
631 549
@@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd)
643 if (!status) 561 if (!status)
644 status = -ESTALE; 562 status = -ESTALE;
645 563
564 path_put(&nd->path);
646 return status; 565 return status;
647} 566}
648 567
@@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1241 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1160 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1242 return 0; 1161 return 0;
1243unlazy: 1162unlazy:
1244 if (dentry) { 1163 if (unlazy_walk(nd, dentry))
1245 if (nameidata_dentry_drop_rcu(nd, dentry)) 1164 return -ECHILD;
1246 return -ECHILD;
1247 } else {
1248 if (nameidata_drop_rcu(nd))
1249 return -ECHILD;
1250 }
1251 } else { 1165 } else {
1252 dentry = __d_lookup(parent, name); 1166 dentry = __d_lookup(parent, name);
1253 } 1167 }
@@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd)
1303 int err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1217 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1304 if (err != -ECHILD) 1218 if (err != -ECHILD)
1305 return err; 1219 return err;
1306 if (nameidata_drop_rcu(nd)) 1220 if (unlazy_walk(nd, NULL))
1307 return -ECHILD; 1221 return -ECHILD;
1308 } 1222 }
1309 return exec_permission(nd->inode, 0); 1223 return exec_permission(nd->inode, 0);
@@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1357 return -ENOENT; 1271 return -ENOENT;
1358 } 1272 }
1359 if (unlikely(inode->i_op->follow_link) && follow) { 1273 if (unlikely(inode->i_op->follow_link) && follow) {
1360 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) 1274 if (nd->flags & LOOKUP_RCU) {
1361 return -ECHILD; 1275 if (unlikely(unlazy_walk(nd, path->dentry))) {
1276 terminate_walk(nd);
1277 return -ECHILD;
1278 }
1279 }
1362 BUG_ON(inode != path->dentry->d_inode); 1280 BUG_ON(inode != path->dentry->d_inode);
1363 return 1; 1281 return 1;
1364 } 1282 }
@@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name,
1657 } 1575 }
1658 } 1576 }
1659 1577
1660 if (nd->flags & LOOKUP_RCU) { 1578 if (!err)
1661 /* went all way through without dropping RCU */ 1579 err = complete_walk(nd);
1662 BUG_ON(err);
1663 if (nameidata_drop_rcu_last(nd))
1664 err = -ECHILD;
1665 }
1666
1667 if (!err) {
1668 err = handle_reval_path(nd);
1669 if (err)
1670 path_put(&nd->path);
1671 }
1672 1580
1673 if (!err && nd->flags & LOOKUP_DIRECTORY) { 1581 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1674 if (!nd->inode->i_op->lookup) { 1582 if (!nd->inode->i_op->lookup) {
@@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2134 return ERR_PTR(error); 2042 return ERR_PTR(error);
2135 /* fallthrough */ 2043 /* fallthrough */
2136 case LAST_ROOT: 2044 case LAST_ROOT:
2137 if (nd->flags & LOOKUP_RCU) { 2045 error = complete_walk(nd);
2138 if (nameidata_drop_rcu_last(nd))
2139 return ERR_PTR(-ECHILD);
2140 }
2141 error = handle_reval_path(nd);
2142 if (error) 2046 if (error)
2143 goto exit; 2047 return ERR_PTR(error);
2144 audit_inode(pathname, nd->path.dentry); 2048 audit_inode(pathname, nd->path.dentry);
2145 if (open_flag & O_CREAT) { 2049 if (open_flag & O_CREAT) {
2146 error = -EISDIR; 2050 error = -EISDIR;
@@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2148 } 2052 }
2149 goto ok; 2053 goto ok;
2150 case LAST_BIND: 2054 case LAST_BIND:
2151 /* can't be RCU mode here */ 2055 error = complete_walk(nd);
2152 error = handle_reval_path(nd);
2153 if (error) 2056 if (error)
2154 goto exit; 2057 return ERR_PTR(error);
2155 audit_inode(pathname, dir); 2058 audit_inode(pathname, dir);
2156 goto ok; 2059 goto ok;
2157 } 2060 }
@@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2170 if (error) /* symlink */ 2073 if (error) /* symlink */
2171 return NULL; 2074 return NULL;
2172 /* sayonara */ 2075 /* sayonara */
2173 if (nd->flags & LOOKUP_RCU) { 2076 error = complete_walk(nd);
2174 if (nameidata_drop_rcu_last(nd)) 2077 if (error)
2175 return ERR_PTR(-ECHILD); 2078 return ERR_PTR(-ECHILD);
2176 }
2177 2079
2178 error = -ENOTDIR; 2080 error = -ENOTDIR;
2179 if (nd->flags & LOOKUP_DIRECTORY) { 2081 if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2185 } 2087 }
2186 2088
2187 /* create side of things */ 2089 /* create side of things */
2188 2090 error = complete_walk(nd);
2189 if (nd->flags & LOOKUP_RCU) { 2091 if (error)
2190 if (nameidata_drop_rcu_last(nd)) 2092 return ERR_PTR(error);
2191 return ERR_PTR(-ECHILD);
2192 }
2193 2093
2194 audit_inode(pathname, dir); 2094 audit_inode(pathname, dir);
2195 error = -EISDIR; 2095 error = -EISDIR;
@@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2629} 2529}
2630 2530
2631/* 2531/*
2632 * We try to drop the dentry early: we should have 2532 * The dentry_unhash() helper will try to drop the dentry early: we
2633 * a usage count of 2 if we're the only user of this 2533 * should have a usage count of 1 if we're the only user of this
2634 * dentry, and if that is true (possibly after pruning 2534 * dentry, and if that is true (possibly after pruning the dcache),
2635 * the dcache), then we drop the dentry now. 2535 * then we drop the dentry now.
2636 * 2536 *
2637 * A low-level filesystem can, if it chooses, legally 2537
2638 * do a 2538 * do a
@@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2645 */ 2545 */
2646void dentry_unhash(struct dentry *dentry) 2546void dentry_unhash(struct dentry *dentry)
2647{ 2547{
2648 dget(dentry);
2649 shrink_dcache_parent(dentry); 2548 shrink_dcache_parent(dentry);
2650 spin_lock(&dentry->d_lock); 2549 spin_lock(&dentry->d_lock);
2651 if (dentry->d_count == 2) 2550 if (dentry->d_count == 1)
2652 __d_drop(dentry); 2551 __d_drop(dentry);
2653 spin_unlock(&dentry->d_lock); 2552 spin_unlock(&dentry->d_lock);
2654} 2553}
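Why the magic number in dentry_unhash() above moves from 2 to 1: the helper no longer takes its own dget() reference, so when the caller is the sole user the count it observes is 1 (caller only) rather than 2 (caller plus helper). A toy refcount model of before and after:

#include <stdio.h>

struct obj { int count; };

static void unhash_old(struct obj *o)
{
	o->count++;                      /* the helper's own dget() */
	if (o->count == 2)               /* caller + helper */
		printf("old: drop (count=%d)\n", o->count);
	o->count--;                      /* dput() happened in the caller */
}

static void unhash_new(struct obj *o)
{
	if (o->count == 1)               /* caller only */
		printf("new: drop (count=%d)\n", o->count);
}

int main(void)
{
	struct obj a = { .count = 1 };
	unhash_old(&a);
	unhash_new(&a);
	return 0;
}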
@@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2664 return -EPERM; 2563 return -EPERM;
2665 2564
2666 mutex_lock(&dentry->d_inode->i_mutex); 2565 mutex_lock(&dentry->d_inode->i_mutex);
2667 dentry_unhash(dentry); 2566
2567 error = -EBUSY;
2668 if (d_mountpoint(dentry)) 2568 if (d_mountpoint(dentry))
2669 error = -EBUSY; 2569 goto out;
2670 else { 2570
2671 error = security_inode_rmdir(dir, dentry); 2571 error = security_inode_rmdir(dir, dentry);
2672 if (!error) { 2572 if (error)
2673 error = dir->i_op->rmdir(dir, dentry); 2573 goto out;
2674 if (!error) { 2574
2675 dentry->d_inode->i_flags |= S_DEAD; 2575 error = dir->i_op->rmdir(dir, dentry);
2676 dont_mount(dentry); 2576 if (error)
2677 } 2577 goto out;
2678 } 2578
2679 } 2579 dentry->d_inode->i_flags |= S_DEAD;
2580 dont_mount(dentry);
2581
2582out:
2680 mutex_unlock(&dentry->d_inode->i_mutex); 2583 mutex_unlock(&dentry->d_inode->i_mutex);
2681 if (!error) { 2584 if (!error)
2682 d_delete(dentry); 2585 d_delete(dentry);
2683 }
2684 dput(dentry);
2685
2686 return error; 2586 return error;
2687} 2587}
2688 2588
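The vfs_rmdir() rewrite above is a pure restructuring: nested success-path ifs become early "goto out" bailouts, so the unlock runs on every path and the success path reads straight down. The same flattening in miniature (the checks themselves are placeholders):

#include <stdio.h>

static int busy, denied;

static int rmdir_model(void)
{
	int error;

	printf("lock\n");

	error = -16;             /* -EBUSY  */
	if (busy)
		goto out;

	error = -13;             /* -EACCES */
	if (denied)
		goto out;

	/* the actual removal would happen here */
	error = 0;
out:
	printf("unlock\n");      /* single exit: always runs */
	return error;
}

int main(void)
{
	busy = 1;
	printf("error=%d\n", rmdir_model());
	return 0;
}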
@@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
3053 * HOWEVER, it relies on the assumption that any object with ->lookup() 2953 * HOWEVER, it relies on the assumption that any object with ->lookup()
3054 * has no more than 1 dentry. If "hybrid" objects will ever appear, 2954 * has no more than 1 dentry. If "hybrid" objects will ever appear,
3055 * we'd better make sure that there's no link(2) for them. 2955 * we'd better make sure that there's no link(2) for them.
3056 * d) some filesystems don't support opened-but-unlinked directories, 2956 * d) conversion from fhandle to dentry may come in the wrong moment - when
3057 * either because of layout or because they are not ready to deal with
3058 * all cases correctly. The latter will be fixed (taking this sort of
3059 * stuff into VFS), but the former is not going away. Solution: the same
3060 * trick as in rmdir().
3061 * e) conversion from fhandle to dentry may come in the wrong moment - when
3062 * we are removing the target. Solution: we will have to grab ->i_mutex 2957 * we are removing the target. Solution: we will have to grab ->i_mutex
3063 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2958 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
3064 * ->i_mutex on parents, which works but leads to some truly excessive 2959 * ->i_mutex on parents, which works but leads to some truly excessive
@@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3068 struct inode *new_dir, struct dentry *new_dentry) 2963 struct inode *new_dir, struct dentry *new_dentry)
3069{ 2964{
3070 int error = 0; 2965 int error = 0;
3071 struct inode *target; 2966 struct inode *target = new_dentry->d_inode;
3072 2967
3073 /* 2968 /*
3074 * If we are going to change the parent - check write permissions, 2969 * If we are going to change the parent - check write permissions,
@@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3084 if (error) 2979 if (error)
3085 return error; 2980 return error;
3086 2981
3087 target = new_dentry->d_inode;
3088 if (target) 2982 if (target)
3089 mutex_lock(&target->i_mutex); 2983 mutex_lock(&target->i_mutex);
3090 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2984
3091 error = -EBUSY; 2985 error = -EBUSY;
3092 else { 2986 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
3093 if (target) 2987 goto out;
3094 dentry_unhash(new_dentry); 2988
3095 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2989 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3096 } 2990 if (error)
2991 goto out;
2992
3097 if (target) { 2993 if (target) {
3098 if (!error) { 2994 target->i_flags |= S_DEAD;
3099 target->i_flags |= S_DEAD; 2995 dont_mount(new_dentry);
3100 dont_mount(new_dentry);
3101 }
3102 mutex_unlock(&target->i_mutex);
3103 if (d_unhashed(new_dentry))
3104 d_rehash(new_dentry);
3105 dput(new_dentry);
3106 } 2996 }
2997out:
2998 if (target)
2999 mutex_unlock(&target->i_mutex);
3107 if (!error) 3000 if (!error)
3108 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3001 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3109 d_move(old_dentry,new_dentry); 3002 d_move(old_dentry,new_dentry);
@@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3113static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, 3006static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3114 struct inode *new_dir, struct dentry *new_dentry) 3007 struct inode *new_dir, struct dentry *new_dentry)
3115{ 3008{
3116 struct inode *target; 3009 struct inode *target = new_dentry->d_inode;
3117 int error; 3010 int error;
3118 3011
3119 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3012 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3121 return error; 3014 return error;
3122 3015
3123 dget(new_dentry); 3016 dget(new_dentry);
3124 target = new_dentry->d_inode;
3125 if (target) 3017 if (target)
3126 mutex_lock(&target->i_mutex); 3018 mutex_lock(&target->i_mutex);
3019
3020 error = -EBUSY;
3127 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 3021 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
3128 error = -EBUSY; 3022 goto out;
3129 else 3023
3130 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3024 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3131 if (!error) { 3025 if (error)
3132 if (target) 3026 goto out;
3133 dont_mount(new_dentry); 3027
3134 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3028 if (target)
3135 d_move(old_dentry, new_dentry); 3029 dont_mount(new_dentry);
3136 } 3030 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3031 d_move(old_dentry, new_dentry);
3032out:
3137 if (target) 3033 if (target)
3138 mutex_unlock(&target->i_mutex); 3034 mutex_unlock(&target->i_mutex);
3139 dput(new_dentry); 3035 dput(new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index d99bcf59e4c2..fe59bd145d21 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1695 1695
1696static int flags_to_propagation_type(int flags) 1696static int flags_to_propagation_type(int flags)
1697{ 1697{
1698 int type = flags & ~MS_REC; 1698 int type = flags & ~(MS_REC | MS_SILENT);
1699 1699
1700 /* Fail if any non-propagation flags are set */ 1700 /* Fail if any non-propagation flags are set */
1701 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1701 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
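
With MS_SILENT now masked off alongside MS_REC, a mount(2) propagation-change request that happens to carry MS_SILENT is no longer rejected for having a stray flag set. A small userspace illustration (assuming /mnt is a mount point on a kernel with this fix):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Previously failed with EINVAL because MS_SILENT survived
		 * the mask; now only the propagation type bits are checked. */
		if (mount(NULL, "/mnt", NULL, MS_SHARED | MS_SILENT, NULL) != 0)
			perror("mount");
		return 0;
	}
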
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f6946bb5cb55..e3e646b06404 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1033 DPRINTK("ncp_rmdir: removing %s/%s\n", 1033 DPRINTK("ncp_rmdir: removing %s/%s\n",
1034 dentry->d_parent->d_name.name, dentry->d_name.name); 1034 dentry->d_parent->d_name.name, dentry->d_name.name);
1035 1035
1036 dentry_unhash(dentry);
1037
1036 error = -EBUSY; 1038 error = -EBUSY;
1037 if (!d_unhashed(dentry)) 1039 if (!d_unhashed(dentry))
1038 goto out; 1040 goto out;
@@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1139 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1141 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1140 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1142 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1141 1143
1144 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1145 dentry_unhash(new_dentry);
1146
1142 ncp_age_dentry(server, old_dentry); 1147 ncp_age_dentry(server, old_dentry);
1143 ncp_age_dentry(server, new_dentry); 1148 ncp_age_dentry(server, new_dentry);
1144 1149
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b3e88f..1102a5fbb744 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
334 struct nilfs_transaction_info ti; 334 struct nilfs_transaction_info ti;
335 int err; 335 int err;
336 336
337 dentry_unhash(dentry);
338
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0); 339 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err) 340 if (err)
339 return err; 341 return err;
@@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
369 struct nilfs_transaction_info ti; 371 struct nilfs_transaction_info ti;
370 int err; 372 int err;
371 373
374 if (new_inode && S_ISDIR(new_inode->i_mode))
375 dentry_unhash(new_dentry);
376
372 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); 377 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
373 if (unlikely(err)) 378 if (unlikely(err))
374 return err; 379 return err;
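
The ncpfs and nilfs2 hunks follow the same new convention on the rename side: only a directory victim needs the early unhash, and the filesystem performs it itself before starting its own transaction. Schematically, with foo_do_rename() standing in for the per-filesystem work (an assumed helper, not real API):

	static int foo_rename(struct inode *old_dir, struct dentry *old_dentry,
			      struct inode *new_dir, struct dentry *new_dentry)
	{
		/* Unhash only a directory target that is about to be
		 * overwritten; plain-file targets need no such treatment. */
		if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
			dentry_unhash(new_dentry);

		return foo_do_rename(old_dir, old_dentry, new_dir, new_dentry);
	}
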
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6..f17e58b32989 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \ 32 reservations.o \
33 move_extents.o \
33 resize.o \ 34 resize.o \
34 slot_map.o \ 35 slot_map.o \
35 suballoc.o \ 36 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7401c7..ed553c60de82 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h> 31#include <linux/quotaops.h>
32#include <linux/blkdev.h>
32 33
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
34 35
@@ -7184,3 +7185,168 @@ out_commit:
7184out: 7185out:
7185 return ret; 7186 return ret;
7186} 7187}
7188
7189static int ocfs2_trim_extent(struct super_block *sb,
7190 struct ocfs2_group_desc *gd,
7191 u32 start, u32 count)
7192{
7193 u64 discard, bcount;
7194
7195 bcount = ocfs2_clusters_to_blocks(sb, count);
7196 discard = le64_to_cpu(gd->bg_blkno) +
7197 ocfs2_clusters_to_blocks(sb, start);
7198
7199 trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7200
7201 return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7202}
7203
7204static int ocfs2_trim_group(struct super_block *sb,
7205 struct ocfs2_group_desc *gd,
7206 u32 start, u32 max, u32 minbits)
7207{
7208 int ret = 0, count = 0, next;
7209 void *bitmap = gd->bg_bitmap;
7210
7211 if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7212 return 0;
7213
7214 trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7215 start, max, minbits);
7216
7217 while (start < max) {
7218 start = ocfs2_find_next_zero_bit(bitmap, max, start);
7219 if (start >= max)
7220 break;
7221 next = ocfs2_find_next_bit(bitmap, max, start);
7222
7223 if ((next - start) >= minbits) {
7224 ret = ocfs2_trim_extent(sb, gd,
7225 start, next - start);
7226 if (ret < 0) {
7227 mlog_errno(ret);
7228 break;
7229 }
7230 count += next - start;
7231 }
7232 start = next + 1;
7233
7234 if (fatal_signal_pending(current)) {
7235 count = -ERESTARTSYS;
7236 break;
7237 }
7238
7239 if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7240 break;
7241 }
7242
7243 if (ret < 0)
7244 count = ret;
7245
7246 return count;
7247}
7248
7249int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7250{
7251 struct ocfs2_super *osb = OCFS2_SB(sb);
7252 u64 start, len, trimmed, first_group, last_group, group;
7253 int ret, cnt;
7254 u32 first_bit, last_bit, minlen;
7255 struct buffer_head *main_bm_bh = NULL;
7256 struct inode *main_bm_inode = NULL;
7257 struct buffer_head *gd_bh = NULL;
7258 struct ocfs2_dinode *main_bm;
7259 struct ocfs2_group_desc *gd = NULL;
7260
7261 start = range->start >> osb->s_clustersize_bits;
7262 len = range->len >> osb->s_clustersize_bits;
7263 minlen = range->minlen >> osb->s_clustersize_bits;
7264 trimmed = 0;
7265
7266 if (!len) {
7267 range->len = 0;
7268 return 0;
7269 }
7270
7271 if (minlen >= osb->bitmap_cpg)
7272 return -EINVAL;
7273
7274 main_bm_inode = ocfs2_get_system_file_inode(osb,
7275 GLOBAL_BITMAP_SYSTEM_INODE,
7276 OCFS2_INVALID_SLOT);
7277 if (!main_bm_inode) {
7278 ret = -EIO;
7279 mlog_errno(ret);
7280 goto out;
7281 }
7282
7283 mutex_lock(&main_bm_inode->i_mutex);
7284
7285 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7286 if (ret < 0) {
7287 mlog_errno(ret);
7288 goto out_mutex;
7289 }
7290 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7291
7292 if (start >= le32_to_cpu(main_bm->i_clusters)) {
7293 ret = -EINVAL;
7294 goto out_unlock;
7295 }
7296
7297 if (start + len > le32_to_cpu(main_bm->i_clusters))
7298 len = le32_to_cpu(main_bm->i_clusters) - start;
7299
7300 trace_ocfs2_trim_fs(start, len, minlen);
7301
7302 /* Determine first and last group to examine based on start and len */
7303 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7304 if (first_group == osb->first_cluster_group_blkno)
7305 first_bit = start;
7306 else
7307 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7308 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7309 last_bit = osb->bitmap_cpg;
7310
7311 for (group = first_group; group <= last_group;) {
7312 if (first_bit + len >= osb->bitmap_cpg)
7313 last_bit = osb->bitmap_cpg;
7314 else
7315 last_bit = first_bit + len;
7316
7317 ret = ocfs2_read_group_descriptor(main_bm_inode,
7318 main_bm, group,
7319 &gd_bh);
7320 if (ret < 0) {
7321 mlog_errno(ret);
7322 break;
7323 }
7324
7325 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7326 cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
7327 brelse(gd_bh);
7328 gd_bh = NULL;
7329 if (cnt < 0) {
7330 ret = cnt;
7331 mlog_errno(ret);
7332 break;
7333 }
7334
7335 trimmed += cnt;
7336 len -= osb->bitmap_cpg - first_bit;
7337 first_bit = 0;
7338 if (group == osb->first_cluster_group_blkno)
7339 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7340 else
7341 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7342 }
7343 range->len = trimmed * sb->s_blocksize;
7344out_unlock:
7345 ocfs2_inode_unlock(main_bm_inode, 0);
7346 brelse(main_bm_bh);
7347out_mutex:
7348 mutex_unlock(&main_bm_inode->i_mutex);
7349 iput(main_bm_inode);
7350out:
7351 return ret;
7352}
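
ocfs2_trim_group() above is a classic free-run scan: it alternates find_next_zero_bit() and find_next_bit() to delimit maximal runs of free clusters, and only discards runs of at least minbits clusters. A self-contained userspace mock of the same loop, kept deliberately small:

	#include <stdio.h>

	/* Find maximal zero-bit runs of at least 'minbits' bits; bit i of
	 * 'bitmap' models cluster i, with 1 meaning allocated. */
	static void scan_runs(unsigned long bitmap, unsigned int max,
			      unsigned int minbits)
	{
		unsigned int start = 0, next;

		while (start < max) {
			while (start < max && (bitmap >> start & 1))
				start++;		/* find_next_zero_bit() */
			if (start >= max)
				break;
			next = start;
			while (next < max && !(bitmap >> next & 1))
				next++;			/* find_next_bit() */
			if (next - start >= minbits)
				printf("trim clusters [%u, %u)\n", start, next);
			start = next + 1;
		}
	}

	int main(void)
	{
		scan_runs(0x73, 8, 2);	/* 01110011: prints [2, 4), skips bit 7 */
		return 0;
	}
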
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a03251c..ca381c584127 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
239 struct buffer_head **leaf_bh); 239 struct buffer_head **leaf_bh);
240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
241 241
242int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
242/* 243/*
243 * Helper function to look at the # of clusters in an extent record. 244 * Helper function to look at the # of clusters in an extent record.
244 */ 245 */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1f..a4b07730b2e1 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
61 kset_unregister(o2cb_kset); 60 kset_unregister(o2cb_kset);
62} 61}
63 62
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
69 if (!o2cb_kset) 68 if (!o2cb_kset)
70 return -ENOMEM; 69 return -ENOMEM;
71 70
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
81 if (ret) 72 if (ret)
82 goto error; 73 goto error;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4bdf7baee344..d602abb51b61 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -144,6 +144,7 @@ struct dlm_ctxt
144 wait_queue_head_t dlm_join_events; 144 wait_queue_head_t dlm_join_events;
145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 148 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
148 struct dlm_recovery_ctxt reco; 149 struct dlm_recovery_ctxt reco;
149 spinlock_t master_lock; 150 spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
401 return 1; 402 return 1;
402} 403}
403 404
405static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
406{
407 if (idx == DLM_GRANTED_LIST)
408 return "granted";
409 else if (idx == DLM_CONVERTING_LIST)
410 return "converting";
411 else if (idx == DLM_BLOCKED_LIST)
412 return "blocked";
413 else
414 return "unknown";
415}
416
404static inline struct list_head * 417static inline struct list_head *
405dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) 418dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
406{ 419{
@@ -448,6 +461,7 @@ enum {
448 DLM_FINALIZE_RECO_MSG = 518, 461 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION = 519, 462 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO = 520, 463 DLM_QUERY_NODEINFO = 520,
464 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
451}; 465};
452 466
453struct dlm_reco_node_data 467struct dlm_reco_node_data
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 04a32be0aeb9..56f82cb912e3 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
756 buf + out, len - out); 756 buf + out, len - out);
757 out += snprintf(buf + out, len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
758 758
759 /* Exit Domain Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Exit Domain Map: ");
761 out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
762 buf + out, len - out);
763 out += snprintf(buf + out, len - out, "\n");
764
759 /* Live Map: xx xx xx */ 765 /* Live Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Live Map: "); 766 out += snprintf(buf + out, len - out, "Live Map: ");
761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 767 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3b179d6cbde0..6ed6b95dcf93 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * New in version 1.1: 132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat 133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes 134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
135 * New in version 1.2:
136 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 */ 137 */
136static const struct dlm_protocol_version dlm_protocol = { 138static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 139 .pv_major = 1,
138 .pv_minor = 1, 140 .pv_minor = 2,
139}; 141};
140 142
141#define DLM_DOMAIN_BACKOFF_MS 200 143#define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
449 dropped = dlm_empty_lockres(dlm, res); 451 dropped = dlm_empty_lockres(dlm, res);
450 452
451 spin_lock(&res->spinlock); 453 spin_lock(&res->spinlock);
452 __dlm_lockres_calc_usage(dlm, res); 454 if (dropped)
453 iter = res->hash_node.next; 455 __dlm_lockres_calc_usage(dlm, res);
456 else
457 iter = res->hash_node.next;
454 spin_unlock(&res->spinlock); 458 spin_unlock(&res->spinlock);
455 459
456 dlm_lockres_put(res); 460 dlm_lockres_put(res);
457 461
458 if (dropped) 462 if (dropped) {
463 cond_resched_lock(&dlm->spinlock);
459 goto redo_bucket; 464 goto redo_bucket;
465 }
460 } 466 }
461 cond_resched_lock(&dlm->spinlock); 467 cond_resched_lock(&dlm->spinlock);
462 num += n; 468 num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
486 return ret; 492 return ret;
487} 493}
488 494
495static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
496 void *data, void **ret_data)
497{
498 struct dlm_ctxt *dlm = data;
499 unsigned int node;
500 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
501
502 if (!dlm_grab(dlm))
503 return 0;
504
505 node = exit_msg->node_idx;
506 mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
507
508 spin_lock(&dlm->spinlock);
509 set_bit(node, dlm->exit_domain_map);
510 spin_unlock(&dlm->spinlock);
511
512 dlm_put(dlm);
513
514 return 0;
515}
516
489static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) 517static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
490{ 518{
491 /* Yikes, a double spinlock! I need domain_lock for the dlm 519 /* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
542 570
543 spin_lock(&dlm->spinlock); 571 spin_lock(&dlm->spinlock);
544 clear_bit(node, dlm->domain_map); 572 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map);
545 __dlm_print_nodes(dlm); 574 __dlm_print_nodes(dlm);
546 575
547 /* notify anything attached to the heartbeat events */ 576 /* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
554 return 0; 583 return 0;
555} 584}
556 585
557static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, 586static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
558 unsigned int node) 587 unsigned int node)
559{ 588{
560 int status; 589 int status;
561 struct dlm_exit_domain leave_msg; 590 struct dlm_exit_domain leave_msg;
562 591
563 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", 592 mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
564 node, dlm->name, dlm->node_num); 593 msg_type, node);
565 594
566 memset(&leave_msg, 0, sizeof(leave_msg)); 595 memset(&leave_msg, 0, sizeof(leave_msg));
567 leave_msg.node_idx = dlm->node_num; 596 leave_msg.node_idx = dlm->node_num;
568 597
569 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 598 status = o2net_send_message(msg_type, dlm->key, &leave_msg,
570 &leave_msg, sizeof(leave_msg), node, 599 sizeof(leave_msg), node, NULL);
571 NULL);
572 if (status < 0) 600 if (status < 0)
573 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 601 mlog(ML_ERROR, "Error %d sending domain exit message %u "
574 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); 602 "to node %u on domain %s\n", status, msg_type, node,
575 mlog(0, "status return %d from o2net_send_message\n", status); 603 dlm->name);
576 604
577 return status; 605 return status;
578} 606}
579 607
608static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
609{
610 int node = -1;
611
612 /* Support for begin exit domain was added in 1.2 */
613 if (dlm->dlm_locking_proto.pv_major == 1 &&
614 dlm->dlm_locking_proto.pv_minor < 2)
615 return;
616
617 /*
618 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
619 * informational. Meaning if a node does not receive the message,
620 * so be it.
621 */
622 spin_lock(&dlm->spinlock);
623 while (1) {
624 node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
625 if (node >= O2NM_MAX_NODES)
626 break;
627 if (node == dlm->node_num)
628 continue;
629
630 spin_unlock(&dlm->spinlock);
631 dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
632 spin_lock(&dlm->spinlock);
633 }
634 spin_unlock(&dlm->spinlock);
635}
580 636
581static void dlm_leave_domain(struct dlm_ctxt *dlm) 637static void dlm_leave_domain(struct dlm_ctxt *dlm)
582{ 638{
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
602 658
603 clear_node = 1; 659 clear_node = 1;
604 660
605 status = dlm_send_one_domain_exit(dlm, node); 661 status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
662 node);
606 if (status < 0 && 663 if (status < 0 &&
607 status != -ENOPROTOOPT && 664 status != -ENOPROTOOPT &&
608 status != -ENOTCONN) { 665 status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
677 734
678 if (leave) { 735 if (leave) {
679 mlog(0, "shutting down domain %s\n", dlm->name); 736 mlog(0, "shutting down domain %s\n", dlm->name);
737 dlm_begin_exit_domain(dlm);
680 738
681 /* We changed dlm state, notify the thread */ 739 /* We changed dlm state, notify the thread */
682 dlm_kick_thread(dlm, NULL); 740 dlm_kick_thread(dlm, NULL);
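
With the begin-exit message in place, domain teardown becomes a two-phase affair: a best-effort advance notice so peers stop choosing this node as a migration target, followed by lock migration and the mandatory exit message. Roughly, and simplified from the order in dlm_unregister_domain() (example_leave_domain is illustrative only; error handling and the migration retry loop are omitted):

	static void example_leave_domain(struct dlm_ctxt *dlm)
	{
		dlm_begin_exit_domain(dlm);	/* best-effort advance notice */
		dlm_kick_thread(dlm, NULL);	/* we changed dlm state */
		dlm_migrate_all_locks(dlm);	/* hand off mastered resources */
		dlm_mark_domain_leaving(dlm);
		dlm_leave_domain(dlm);		/* mandatory DLM_EXIT_DOMAIN_MSG */
		dlm_complete_dlm_shutdown(dlm);
	}
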
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
909 * leftover join state. */ 967 * leftover join state. */
910 BUG_ON(dlm->joining_node != assert->node_idx); 968 BUG_ON(dlm->joining_node != assert->node_idx);
911 set_bit(assert->node_idx, dlm->domain_map); 969 set_bit(assert->node_idx, dlm->domain_map);
970 clear_bit(assert->node_idx, dlm->exit_domain_map);
912 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
913 972
914 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1793 if (status) 1852 if (status)
1794 goto bail; 1853 goto bail;
1795 1854
1855 status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
1856 sizeof(struct dlm_exit_domain),
1857 dlm_begin_exit_domain_handler,
1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1861
1796bail: 1862bail:
1797 if (status) 1863 if (status)
1798 dlm_unregister_domain_handlers(dlm); 1864 dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84d166328cf7..11eefb8c12e9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2339 dlm_lockres_put(res); 2339 dlm_lockres_put(res);
2340} 2340}
2341 2341
2342/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 2342/*
2343 * if not. If 0, numlocks is set to the number of locks in the lockres. 2343 * A migrateable resource is one that:
2344 * 1. is locally mastered, and
2345 * 2. has zero local locks, and
2346 * 3. has one or more non-local locks, or one or more references.
2347 * Returns 1 if yes, 0 if not.
2344 */ 2348 */
2345static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2349static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2346 struct dlm_lock_resource *res, 2350 struct dlm_lock_resource *res)
2347 int *numlocks,
2348 int *hasrefs)
2349{ 2351{
2350 int ret; 2352 enum dlm_lockres_list idx;
2351 int i; 2353 int nonlocal = 0, node_ref;
2352 int count = 0;
2353 struct list_head *queue; 2354 struct list_head *queue;
2354 struct dlm_lock *lock; 2355 struct dlm_lock *lock;
2356 u64 cookie;
2355 2357
2356 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2357 2359
2358 *numlocks = 0; 2360 if (res->owner != dlm->node_num)
2359 *hasrefs = 0; 2361 return 0;
2360
2361 ret = -EINVAL;
2362 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2363 mlog(0, "cannot migrate lockres with unknown owner!\n");
2364 goto leave;
2365 }
2366
2367 if (res->owner != dlm->node_num) {
2368 mlog(0, "cannot migrate lockres this node doesn't own!\n");
2369 goto leave;
2370 }
2371 2362
2372 ret = 0; 2363 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2373 queue = &res->granted; 2364 queue = dlm_list_idx_to_ptr(res, idx);
2374 for (i = 0; i < 3; i++) {
2375 list_for_each_entry(lock, queue, list) { 2365 list_for_each_entry(lock, queue, list) {
2376 ++count; 2366 if (lock->ml.node != dlm->node_num) {
2377 if (lock->ml.node == dlm->node_num) { 2367 nonlocal++;
2378 mlog(0, "found a lock owned by this node still " 2368 continue;
2379 "on the %s queue! will not migrate this "
2380 "lockres\n", (i == 0 ? "granted" :
2381 (i == 1 ? "converting" :
2382 "blocked")));
2383 ret = -ENOTEMPTY;
2384 goto leave;
2385 } 2369 }
2370 cookie = be64_to_cpu(lock->ml.cookie);
2371 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
2372 "%s list\n", dlm->name, res->lockname.len,
2373 res->lockname.name,
2374 dlm_get_lock_cookie_node(cookie),
2375 dlm_get_lock_cookie_seq(cookie),
2376 dlm_list_in_text(idx));
2377 return 0;
2386 } 2378 }
2387 queue++;
2388 } 2379 }
2389 2380
2390 *numlocks = count; 2381 if (!nonlocal) {
2391 2382 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2392 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 2383 if (node_ref >= O2NM_MAX_NODES)
2393 if (count < O2NM_MAX_NODES) 2384 return 0;
2394 *hasrefs = 1; 2385 }
2395 2386
2396 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, 2387 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
2397 res->lockname.len, res->lockname.name, *numlocks, *hasrefs); 2388 res->lockname.name);
2398 2389
2399leave: 2390 return 1;
2400 return ret;
2401} 2391}
2402 2392
2403/* 2393/*
@@ -2406,8 +2396,7 @@ leave:
2406 2396
2407 2397
2408static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2398static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2409 struct dlm_lock_resource *res, 2399 struct dlm_lock_resource *res, u8 target)
2410 u8 target)
2411{ 2400{
2412 struct dlm_master_list_entry *mle = NULL; 2401 struct dlm_master_list_entry *mle = NULL;
2413 struct dlm_master_list_entry *oldmle = NULL; 2402 struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2416 const char *name; 2405 const char *name;
2417 unsigned int namelen; 2406 unsigned int namelen;
2418 int mle_added = 0; 2407 int mle_added = 0;
2419 int numlocks, hasrefs;
2420 int wake = 0; 2408 int wake = 0;
2421 2409
2422 if (!dlm_grab(dlm)) 2410 if (!dlm_grab(dlm))
2423 return -EINVAL; 2411 return -EINVAL;
2424 2412
2413 BUG_ON(target == O2NM_MAX_NODES);
2414
2425 name = res->lockname.name; 2415 name = res->lockname.name;
2426 namelen = res->lockname.len; 2416 namelen = res->lockname.len;
2427 2417
2428 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); 2418 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2429 2419 target);
2430 /*
2431 * ensure this lockres is a proper candidate for migration
2432 */
2433 spin_lock(&res->spinlock);
2434 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2435 if (ret < 0) {
2436 spin_unlock(&res->spinlock);
2437 goto leave;
2438 }
2439 spin_unlock(&res->spinlock);
2440
2441 /* no work to do */
2442 if (numlocks == 0 && !hasrefs)
2443 goto leave;
2444
2445 /*
2446 * preallocate up front
2447 * if this fails, abort
2448 */
2449 2420
2421 /* preallocate up front. if this fails, abort */
2450 ret = -ENOMEM; 2422 ret = -ENOMEM;
2451 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 2423 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2452 if (!mres) { 2424 if (!mres) {
@@ -2462,35 +2434,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2462 ret = 0; 2434 ret = 0;
2463 2435
2464 /* 2436 /*
2465 * find a node to migrate the lockres to
2466 */
2467
2468 spin_lock(&dlm->spinlock);
2469 /* pick a new node */
2470 if (!test_bit(target, dlm->domain_map) ||
2471 target >= O2NM_MAX_NODES) {
2472 target = dlm_pick_migration_target(dlm, res);
2473 }
2474 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2475 namelen, name, target);
2476
2477 if (target >= O2NM_MAX_NODES ||
2478 !test_bit(target, dlm->domain_map)) {
2479 /* target chosen is not alive */
2480 ret = -EINVAL;
2481 }
2482
2483 if (ret) {
2484 spin_unlock(&dlm->spinlock);
2485 goto fail;
2486 }
2487
2488 mlog(0, "continuing with target = %u\n", target);
2489
2490 /*
2491 * clear any existing master requests and 2437 * clear any existing master requests and
2492 * add the migration mle to the list 2438 * add the migration mle to the list
2493 */ 2439 */
2440 spin_lock(&dlm->spinlock);
2494 spin_lock(&dlm->master_lock); 2441 spin_lock(&dlm->master_lock);
2495 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2442 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2496 namelen, target, dlm->node_num); 2443 namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
2531 dlm_put_mle(mle); 2478 dlm_put_mle(mle);
2532 } else if (mle) { 2479 } else if (mle) {
2533 kmem_cache_free(dlm_mle_cache, mle); 2480 kmem_cache_free(dlm_mle_cache, mle);
2481 mle = NULL;
2534 } 2482 }
2535 goto leave; 2483 goto leave;
2536 } 2484 }
@@ -2652,69 +2600,52 @@ leave:
2652 if (wake) 2600 if (wake)
2653 wake_up(&res->wq); 2601 wake_up(&res->wq);
2654 2602
2655 /* TODO: cleanup */
2656 if (mres) 2603 if (mres)
2657 free_page((unsigned long)mres); 2604 free_page((unsigned long)mres);
2658 2605
2659 dlm_put(dlm); 2606 dlm_put(dlm);
2660 2607
2661 mlog(0, "returning %d\n", ret); 2608 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2609 name, target, ret);
2662 return ret; 2610 return ret;
2663} 2611}
2664 2612
2665#define DLM_MIGRATION_RETRY_MS 100 2613#define DLM_MIGRATION_RETRY_MS 100
2666 2614
2667/* Should be called only after beginning the domain leave process. 2615/*
2616 * Should be called only after beginning the domain leave process.
2668 * There should not be any remaining locks on nonlocal lock resources, 2617 * There should not be any remaining locks on nonlocal lock resources,
2669 * and there should be no local locks left on locally mastered resources. 2618 * and there should be no local locks left on locally mastered resources.
2670 * 2619 *
2671 * Called with the dlm spinlock held, may drop it to do migration, but 2620 * Called with the dlm spinlock held, may drop it to do migration, but
2672 * will re-acquire before exit. 2621 * will re-acquire before exit.
2673 * 2622 *
2674 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ 2623 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2624 */
2675int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2625int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2676{ 2626{
2677 int ret; 2627 int ret;
2678 int lock_dropped = 0; 2628 int lock_dropped = 0;
2679 int numlocks, hasrefs; 2629 u8 target = O2NM_MAX_NODES;
2630
2631 assert_spin_locked(&dlm->spinlock);
2680 2632
2681 spin_lock(&res->spinlock); 2633 spin_lock(&res->spinlock);
2682 if (res->owner != dlm->node_num) { 2634 if (dlm_is_lockres_migrateable(dlm, res))
2683 if (!__dlm_lockres_unused(res)) { 2635 target = dlm_pick_migration_target(dlm, res);
2684 mlog(ML_ERROR, "%s:%.*s: this node is not master, " 2636 spin_unlock(&res->spinlock);
2685 "trying to free this but locks remain\n",
2686 dlm->name, res->lockname.len, res->lockname.name);
2687 }
2688 spin_unlock(&res->spinlock);
2689 goto leave;
2690 }
2691 2637
2692 /* No need to migrate a lockres having no locks */ 2638 if (target == O2NM_MAX_NODES)
2693 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2694 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2695 spin_unlock(&res->spinlock);
2696 goto leave; 2639 goto leave;
2697 }
2698 spin_unlock(&res->spinlock);
2699 2640
2700 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 2641 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2701 spin_unlock(&dlm->spinlock); 2642 spin_unlock(&dlm->spinlock);
2702 lock_dropped = 1; 2643 lock_dropped = 1;
2703 while (1) { 2644 ret = dlm_migrate_lockres(dlm, res, target);
2704 ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); 2645 if (ret)
2705 if (ret >= 0) 2646 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2706 break; 2647 dlm->name, res->lockname.len, res->lockname.name,
2707 if (ret == -ENOTEMPTY) { 2648 target, ret);
2708 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
2709 res->lockname.len, res->lockname.name);
2710 BUG();
2711 }
2712
2713 mlog(0, "lockres %.*s: migrate failed, "
2714 "retrying\n", res->lockname.len,
2715 res->lockname.name);
2716 msleep(DLM_MIGRATION_RETRY_MS);
2717 }
2718 spin_lock(&dlm->spinlock); 2649 spin_lock(&dlm->spinlock);
2719leave: 2650leave:
2720 return lock_dropped; 2651 return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2898 } 2829 }
2899} 2830}
2900 2831
2901/* for now this is not too intelligent. we will 2832/*
2902 * need stats to make this do the right thing. 2833 * Pick a node to migrate the lock resource to. This function selects a
2903 * this just finds the first lock on one of the 2834 * potential target based first on the locks and then on refmap. It skips
2904 * queues and uses that node as the target. */ 2835 * nodes that are in the process of exiting the domain.
2836 */
2905static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 2837static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2906 struct dlm_lock_resource *res) 2838 struct dlm_lock_resource *res)
2907{ 2839{
2908 int i; 2840 enum dlm_lockres_list idx;
2909 struct list_head *queue = &res->granted; 2841 struct list_head *queue = &res->granted;
2910 struct dlm_lock *lock; 2842 struct dlm_lock *lock;
2911 int nodenum; 2843 int noderef;
2844 u8 nodenum = O2NM_MAX_NODES;
2912 2845
2913 assert_spin_locked(&dlm->spinlock); 2846 assert_spin_locked(&dlm->spinlock);
2847 assert_spin_locked(&res->spinlock);
2914 2848
2915 spin_lock(&res->spinlock); 2849 /* Go through all the locks */
2916 for (i=0; i<3; i++) { 2850 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2851 queue = dlm_list_idx_to_ptr(res, idx);
2917 list_for_each_entry(lock, queue, list) { 2852 list_for_each_entry(lock, queue, list) {
2918 /* up to the caller to make sure this node 2853 if (lock->ml.node == dlm->node_num)
2919 * is alive */ 2854 continue;
2920 if (lock->ml.node != dlm->node_num) { 2855 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2921 spin_unlock(&res->spinlock); 2856 continue;
2922 return lock->ml.node; 2857 nodenum = lock->ml.node;
2923 } 2858 goto bail;
2924 } 2859 }
2925 queue++;
2926 }
2927
2928 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2929 if (nodenum < O2NM_MAX_NODES) {
2930 spin_unlock(&res->spinlock);
2931 return nodenum;
2932 } 2860 }
2933 spin_unlock(&res->spinlock);
2934 mlog(0, "have not found a suitable target yet! checking domain map\n");
2935 2861
2936 /* ok now we're getting desperate. pick anyone alive. */ 2862 /* Go through the refmap */
2937 nodenum = -1; 2863 noderef = -1;
2938 while (1) { 2864 while (1) {
2939 nodenum = find_next_bit(dlm->domain_map, 2865 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
2940 O2NM_MAX_NODES, nodenum+1); 2866 noderef + 1);
2941 mlog(0, "found %d in domain map\n", nodenum); 2867 if (noderef >= O2NM_MAX_NODES)
2942 if (nodenum >= O2NM_MAX_NODES)
2943 break; 2868 break;
2944 if (nodenum != dlm->node_num) { 2869 if (noderef == dlm->node_num)
2945 mlog(0, "picking %d\n", nodenum); 2870 continue;
2946 return nodenum; 2871 if (test_bit(noderef, dlm->exit_domain_map))
2947 } 2872 continue;
2873 nodenum = noderef;
2874 goto bail;
2948 } 2875 }
2949 2876
2950 mlog(0, "giving up. no master to migrate to\n"); 2877bail:
2951 return DLM_LOCK_RES_OWNER_UNKNOWN; 2878 return nodenum;
2952} 2879}
2953 2880
2954
2955
2956/* this is called by the new master once all lockres 2881/* this is called by the new master once all lockres
2957 * data has been received */ 2882 * data has been received */
2958static int dlm_do_migrate_request(struct dlm_ctxt *dlm, 2883static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f1beb6fc254d..7efab6d28a21 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2393 2393
2394 mlog(0, "node %u being removed from domain map!\n", idx); 2394 mlog(0, "node %u being removed from domain map!\n", idx);
2395 clear_bit(idx, dlm->domain_map); 2395 clear_bit(idx, dlm->domain_map);
2396 clear_bit(idx, dlm->exit_domain_map);
2396 /* wake up migration waiters if a node goes down. 2397 /* wake up migration waiters if a node goes down.
2397 * perhaps later we can genericize this for other waiters. */ 2398 * perhaps later we can genericize this for other waiters. */
2398 wake_up(&dlm->migration_wq); 2399 wake_up(&dlm->migration_wq);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8c5c0eddc365..b42076797049 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
88 * signifies a bast fired on the lock. 88 * signifies a bast fired on the lock.
89 */ 89 */
90#define DLMFS_CAPABILITIES "bast stackglue" 90#define DLMFS_CAPABILITIES "bast stackglue"
91extern int param_set_dlmfs_capabilities(const char *val, 91static int param_set_dlmfs_capabilities(const char *val,
92 struct kernel_param *kp) 92 struct kernel_param *kp)
93{ 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name); 94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6dc206..b1e35a392ca5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2670 .flock = ocfs2_flock, 2670 .flock = ocfs2_flock,
2671 .splice_read = ocfs2_file_splice_read, 2671 .splice_read = ocfs2_file_splice_read,
2672 .splice_write = ocfs2_file_splice_write, 2672 .splice_write = ocfs2_file_splice_write,
2673 .fallocate = ocfs2_fallocate,
2673}; 2674};
2674 2675
2675const struct file_operations ocfs2_dops_no_plocks = { 2676const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c5989eae..bc91072b7219 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h" 23#include "resize.h"
24#include "refcounttree.h" 24#include "refcounttree.h"
25#include "sysfile.h"
26#include "dir.h"
27#include "buffer_head_io.h"
28#include "suballoc.h"
29#include "move_extents.h"
25 30
26#include <linux/ext2_fs.h> 31#include <linux/ext2_fs.h>
27 32
@@ -35,31 +40,27 @@
35 * be -EFAULT. The error will be returned from the ioctl(2) call. It's 40 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
36 * just a best-effort to tell userspace that this request caused the error. 41 * just a best-effort to tell userspace that this request caused the error.
37 */ 42 */
38static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, 43static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
39 struct ocfs2_info_request __user *req) 44 struct ocfs2_info_request __user *req)
40{ 45{
41 kreq->ir_flags |= OCFS2_INFO_FL_ERROR; 46 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
42 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); 47 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
43} 48}
44 49
45#define o2info_set_request_error(a, b) \ 50static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
46 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
47
48static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
49{ 51{
50 req->ir_flags |= OCFS2_INFO_FL_FILLED; 52 req->ir_flags |= OCFS2_INFO_FL_FILLED;
51} 53}
52 54
53#define o2info_set_request_filled(a) \ 55static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
54 __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
55
56static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
57{ 56{
58 req->ir_flags &= ~OCFS2_INFO_FL_FILLED; 57 req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
59} 58}
60 59
61#define o2info_clear_request_filled(a) \ 60static inline int o2info_coherent(struct ocfs2_info_request *req)
62 __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) 61{
62 return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
63}
63 64
64static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 65static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
65{ 66{
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
153 154
154 oib.ib_blocksize = inode->i_sb->s_blocksize; 155 oib.ib_blocksize = inode->i_sb->s_blocksize;
155 156
156 o2info_set_request_filled(oib); 157 o2info_set_request_filled(&oib.ib_req);
157 158
158 if (o2info_to_user(oib, req)) 159 if (o2info_to_user(oib, req))
159 goto bail; 160 goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
161 status = 0; 162 status = 0;
162bail: 163bail:
163 if (status) 164 if (status)
164 o2info_set_request_error(oib, req); 165 o2info_set_request_error(&oib.ib_req, req);
165 166
166 return status; 167 return status;
167} 168}
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
178 179
179 oic.ic_clustersize = osb->s_clustersize; 180 oic.ic_clustersize = osb->s_clustersize;
180 181
181 o2info_set_request_filled(oic); 182 o2info_set_request_filled(&oic.ic_req);
182 183
183 if (o2info_to_user(oic, req)) 184 if (o2info_to_user(oic, req))
184 goto bail; 185 goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
186 status = 0; 187 status = 0;
187bail: 188bail:
188 if (status) 189 if (status)
189 o2info_set_request_error(oic, req); 190 o2info_set_request_error(&oic.ic_req, req);
190 191
191 return status; 192 return status;
192} 193}
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
203 204
204 oim.im_max_slots = osb->max_slots; 205 oim.im_max_slots = osb->max_slots;
205 206
206 o2info_set_request_filled(oim); 207 o2info_set_request_filled(&oim.im_req);
207 208
208 if (o2info_to_user(oim, req)) 209 if (o2info_to_user(oim, req))
209 goto bail; 210 goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
211 status = 0; 212 status = 0;
212bail: 213bail:
213 if (status) 214 if (status)
214 o2info_set_request_error(oim, req); 215 o2info_set_request_error(&oim.im_req, req);
215 216
216 return status; 217 return status;
217} 218}
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
228 229
229 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 230 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
230 231
231 o2info_set_request_filled(oil); 232 o2info_set_request_filled(&oil.il_req);
232 233
233 if (o2info_to_user(oil, req)) 234 if (o2info_to_user(oil, req))
234 goto bail; 235 goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
236 status = 0; 237 status = 0;
237bail: 238bail:
238 if (status) 239 if (status)
239 o2info_set_request_error(oil, req); 240 o2info_set_request_error(&oil.il_req, req);
240 241
241 return status; 242 return status;
242} 243}
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
253 254
254 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 255 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
255 256
256 o2info_set_request_filled(oiu); 257 o2info_set_request_filled(&oiu.iu_req);
257 258
258 if (o2info_to_user(oiu, req)) 259 if (o2info_to_user(oiu, req))
259 goto bail; 260 goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
261 status = 0; 262 status = 0;
262bail: 263bail:
263 if (status) 264 if (status)
264 o2info_set_request_error(oiu, req); 265 o2info_set_request_error(&oiu.iu_req, req);
265 266
266 return status; 267 return status;
267} 268}
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
280 oif.if_incompat_features = osb->s_feature_incompat; 281 oif.if_incompat_features = osb->s_feature_incompat;
281 oif.if_ro_compat_features = osb->s_feature_ro_compat; 282 oif.if_ro_compat_features = osb->s_feature_ro_compat;
282 283
283 o2info_set_request_filled(oif); 284 o2info_set_request_filled(&oif.if_req);
284 285
285 if (o2info_to_user(oif, req)) 286 if (o2info_to_user(oif, req))
286 goto bail; 287 goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
288 status = 0; 289 status = 0;
289bail: 290bail:
290 if (status) 291 if (status)
291 o2info_set_request_error(oif, req); 292 o2info_set_request_error(&oif.if_req, req);
292 293
293 return status; 294 return status;
294} 295}
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
305 306
306 oij.ij_journal_size = osb->journal->j_inode->i_size; 307 oij.ij_journal_size = osb->journal->j_inode->i_size;
307 308
308 o2info_set_request_filled(oij); 309 o2info_set_request_filled(&oij.ij_req);
309 310
310 if (o2info_to_user(oij, req)) 311 if (o2info_to_user(oij, req))
311 goto bail; 312 goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
313 status = 0; 314 status = 0;
314bail: 315bail:
315 if (status) 316 if (status)
316 o2info_set_request_error(oij, req); 317 o2info_set_request_error(&oij.ij_req, req);
318
319 return status;
320}
321
322int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
323 struct inode *inode_alloc, u64 blkno,
324 struct ocfs2_info_freeinode *fi, u32 slot)
325{
326 int status = 0, unlock = 0;
327
328 struct buffer_head *bh = NULL;
329 struct ocfs2_dinode *dinode_alloc = NULL;
330
331 if (inode_alloc)
332 mutex_lock(&inode_alloc->i_mutex);
333
334 if (o2info_coherent(&fi->ifi_req)) {
335 status = ocfs2_inode_lock(inode_alloc, &bh, 0);
336 if (status < 0) {
337 mlog_errno(status);
338 goto bail;
339 }
340 unlock = 1;
341 } else {
342 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
343 if (status < 0) {
344 mlog_errno(status);
345 goto bail;
346 }
347 }
348
349 dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
350
351 fi->ifi_stat[slot].lfi_total =
352 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
353 fi->ifi_stat[slot].lfi_free =
354 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
355 le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
356
357bail:
358 if (unlock)
359 ocfs2_inode_unlock(inode_alloc, 0);
360
361 if (inode_alloc)
362 mutex_unlock(&inode_alloc->i_mutex);
363
364 brelse(bh);
365
366 return status;
367}
368
369int ocfs2_info_handle_freeinode(struct inode *inode,
370 struct ocfs2_info_request __user *req)
371{
372 u32 i;
373 u64 blkno = -1;
374 char namebuf[40];
375 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
376 struct ocfs2_info_freeinode *oifi = NULL;
377 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
378 struct inode *inode_alloc = NULL;
379
380 oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
381 if (!oifi) {
382 status = -ENOMEM;
383 mlog_errno(status);
384 goto bail;
385 }
386
387 if (o2info_from_user(*oifi, req))
388 goto bail;
389
390 oifi->ifi_slotnum = osb->max_slots;
391
392 for (i = 0; i < oifi->ifi_slotnum; i++) {
393 if (o2info_coherent(&oifi->ifi_req)) {
394 inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
395 if (!inode_alloc) {
396 mlog(ML_ERROR, "unable to get alloc inode in "
397 "slot %u\n", i);
398 status = -EIO;
399 goto bail;
400 }
401 } else {
402 ocfs2_sprintf_system_inode_name(namebuf,
403 sizeof(namebuf),
404 type, i);
405 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
406 namebuf,
407 strlen(namebuf),
408 &blkno);
409 if (status < 0) {
410 status = -ENOENT;
411 goto bail;
412 }
413 }
414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418
419 iput(inode_alloc);
420 inode_alloc = NULL;
421 }
422
423 o2info_set_request_filled(&oifi->ifi_req);
424
425 if (o2info_to_user(*oifi, req))
426 goto bail;
427
428 status = 0;
429bail:
430 if (status)
431 o2info_set_request_error(&oifi->ifi_req, req);
432
433 kfree(oifi);
434
435 return status;
436}
437
438static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
439 unsigned int chunksize)
440{
441 int index;
442
443 index = __ilog2_u32(chunksize);
444 if (index >= OCFS2_INFO_MAX_HIST)
445 index = OCFS2_INFO_MAX_HIST - 1;
446
447 hist->fc_chunks[index]++;
448 hist->fc_clusters[index] += chunksize;
449}
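
o2ffg_update_histogram() buckets free chunks by power-of-two size: __ilog2_u32() maps a chunk of n clusters to bucket floor(log2(n)), saturating at the last bucket, so bucket k counts chunks of 2^k .. 2^(k+1)-1 clusters and fc_clusters[k] accumulates their total size. A standalone illustration (MAX_HIST stands in for OCFS2_INFO_MAX_HIST, an assumed value here):

	#include <stdio.h>

	#define MAX_HIST 32			/* assumed bucket count */

	static int ilog2_u32(unsigned int v)	/* floor(log2(v)), v > 0 */
	{
		int i = -1;

		while (v) {
			v >>= 1;
			i++;
		}
		return i;
	}

	int main(void)
	{
		unsigned int sizes[] = { 1, 7, 12, 200 };
		unsigned int i;

		for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
			int index = ilog2_u32(sizes[i]);

			if (index >= MAX_HIST)
				index = MAX_HIST - 1;
			/* 1 -> 0, 7 -> 2, 12 -> 3, 200 -> 7 */
			printf("%u clusters -> bucket %d\n", sizes[i], index);
		}
		return 0;
	}
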
450
451static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
452 unsigned int chunksize)
453{
454 if (chunksize > stats->ffs_max)
455 stats->ffs_max = chunksize;
456
457 if (chunksize < stats->ffs_min)
458 stats->ffs_min = chunksize;
459
460 stats->ffs_avg += chunksize;
461 stats->ffs_free_chunks_real++;
462}
463
464void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
465 unsigned int chunksize)
466{
467 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
468 o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
469}
470
471int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
472 struct inode *gb_inode,
473 struct ocfs2_dinode *gb_dinode,
474 struct ocfs2_chain_rec *rec,
475 struct ocfs2_info_freefrag *ffg,
476 u32 chunks_in_group)
477{
478 int status = 0, used;
479 u64 blkno;
480
481 struct buffer_head *bh = NULL;
482 struct ocfs2_group_desc *bg = NULL;
483
484 unsigned int max_bits, num_clusters;
485 unsigned int offset = 0, cluster, chunk;
486 unsigned int chunk_free, last_chunksize = 0;
487
488 if (!le32_to_cpu(rec->c_free))
489 goto bail;
490
491 do {
492 if (!bg)
493 blkno = le64_to_cpu(rec->c_blkno);
494 else
495 blkno = le64_to_cpu(bg->bg_next_group);
496
497 if (bh) {
498 brelse(bh);
499 bh = NULL;
500 }
501
502 if (o2info_coherent(&ffg->iff_req))
503 status = ocfs2_read_group_descriptor(gb_inode,
504 gb_dinode,
505 blkno, &bh);
506 else
507 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
508
509 if (status < 0) {
510 mlog(ML_ERROR, "Can't read the group descriptor # "
511 "%llu from device.", (unsigned long long)blkno);
512 status = -EIO;
513 goto bail;
514 }
515
516 bg = (struct ocfs2_group_desc *)bh->b_data;
517
518 if (!le16_to_cpu(bg->bg_free_bits_count))
519 continue;
520
521 max_bits = le16_to_cpu(bg->bg_bits);
522 offset = 0;
523
524 for (chunk = 0; chunk < chunks_in_group; chunk++) {
525 /*
 526 * the last chunk may not be a whole one.
527 */
528 if ((offset + ffg->iff_chunksize) > max_bits)
529 num_clusters = max_bits - offset;
530 else
531 num_clusters = ffg->iff_chunksize;
532
533 chunk_free = 0;
534 for (cluster = 0; cluster < num_clusters; cluster++) {
535 used = ocfs2_test_bit(offset,
536 (unsigned long *)bg->bg_bitmap);
537 /*
 538 * - chunk_free counts the free clusters in chunk #N.
 539 * - last_chunksize records the size (in clusters) of
 540 * the last real free chunk being counted.
541 */
542 if (!used) {
543 last_chunksize++;
544 chunk_free++;
545 }
546
547 if (used && last_chunksize) {
548 ocfs2_info_update_ffg(ffg,
549 last_chunksize);
550 last_chunksize = 0;
551 }
552
553 offset++;
554 }
555
556 if (chunk_free == ffg->iff_chunksize)
557 ffg->iff_ffs.ffs_free_chunks++;
558 }
559
560 /*
 561 * Update the stats for the trailing free chunk, if any.
562 */
563 if (last_chunksize)
564 ocfs2_info_update_ffg(ffg, last_chunksize);
565
566 } while (le64_to_cpu(bg->bg_next_group));
567
568bail:
569 brelse(bh);
570
571 return status;
572}
573
574int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
575 struct inode *gb_inode, u64 blkno,
576 struct ocfs2_info_freefrag *ffg)
577{
578 u32 chunks_in_group;
579 int status = 0, unlock = 0, i;
580
581 struct buffer_head *bh = NULL;
582 struct ocfs2_chain_list *cl = NULL;
583 struct ocfs2_chain_rec *rec = NULL;
584 struct ocfs2_dinode *gb_dinode = NULL;
585
586 if (gb_inode)
587 mutex_lock(&gb_inode->i_mutex);
588
589 if (o2info_coherent(&ffg->iff_req)) {
590 status = ocfs2_inode_lock(gb_inode, &bh, 0);
591 if (status < 0) {
592 mlog_errno(status);
593 goto bail;
594 }
595 unlock = 1;
596 } else {
597 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
598 if (status < 0) {
599 mlog_errno(status);
600 goto bail;
601 }
602 }
603
604 gb_dinode = (struct ocfs2_dinode *)bh->b_data;
605 cl = &(gb_dinode->id2.i_chain);
606
607 /*
 608 * The chunk size (in clusters) from userspace must not
 609 * exceed the number of clusters in a group.
610 */
611 if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
612 status = -EINVAL;
613 goto bail;
614 }
615
616 memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
617
618 ffg->iff_ffs.ffs_min = ~0U;
619 ffg->iff_ffs.ffs_clusters =
620 le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
621 ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
622 le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
623
624 chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
625
626 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
627 rec = &(cl->cl_recs[i]);
628 status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
629 gb_dinode,
630 rec, ffg,
631 chunks_in_group);
632 if (status)
633 goto bail;
634 }
635
636 if (ffg->iff_ffs.ffs_free_chunks_real)
637 ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
638 ffg->iff_ffs.ffs_free_chunks_real);
639bail:
640 if (unlock)
641 ocfs2_inode_unlock(gb_inode, 0);
642
643 if (gb_inode)
644 mutex_unlock(&gb_inode->i_mutex);
645
646 if (gb_inode)
647 iput(gb_inode);
648
649 brelse(bh);
650
651 return status;
652}
653
654int ocfs2_info_handle_freefrag(struct inode *inode,
655 struct ocfs2_info_request __user *req)
656{
657 u64 blkno = -1;
658 char namebuf[40];
659 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
660
661 struct ocfs2_info_freefrag *oiff;
662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
663 struct inode *gb_inode = NULL;
664
665 oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
666 if (!oiff) {
667 status = -ENOMEM;
668 mlog_errno(status);
669 goto bail;
670 }
671
672 if (o2info_from_user(*oiff, req))
673 goto bail;
674 /*
 675 * The chunk size from userspace must be a power of 2.
676 */
677 if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
678 (!oiff->iff_chunksize)) {
679 status = -EINVAL;
680 goto bail;
681 }
682
683 if (o2info_coherent(&oiff->iff_req)) {
684 gb_inode = ocfs2_get_system_file_inode(osb, type,
685 OCFS2_INVALID_SLOT);
686 if (!gb_inode) {
687 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
688 status = -EIO;
689 goto bail;
690 }
691 } else {
692 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
693 OCFS2_INVALID_SLOT);
694 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
695 namebuf,
696 strlen(namebuf),
697 &blkno);
698 if (status < 0) {
699 status = -ENOENT;
700 goto bail;
701 }
702 }
703
704 status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
705 if (status < 0)
706 goto bail;
707
708 o2info_set_request_filled(&oiff->iff_req);
709
710 if (o2info_to_user(*oiff, req))
711 goto bail;
712
713 status = 0;
714bail:
715 if (status)
716 o2info_set_request_error(&oiff->iff_req, req);
717
718 kfree(oiff);
317 719
318 return status; 720 return status;
319} 721}
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
327 if (o2info_from_user(oir, req)) 729 if (o2info_from_user(oir, req))
328 goto bail; 730 goto bail;
329 731
330 o2info_clear_request_filled(oir); 732 o2info_clear_request_filled(&oir);
331 733
332 if (o2info_to_user(oir, req)) 734 if (o2info_to_user(oir, req))
333 goto bail; 735 goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
335 status = 0; 737 status = 0;
336bail: 738bail:
337 if (status) 739 if (status)
338 o2info_set_request_error(oir, req); 740 o2info_set_request_error(&oir, req);
339 741
340 return status; 742 return status;
341} 743}
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
389 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) 791 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
390 status = ocfs2_info_handle_journal_size(inode, req); 792 status = ocfs2_info_handle_journal_size(inode, req);
391 break; 793 break;
794 case OCFS2_INFO_FREEINODE:
795 if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
796 status = ocfs2_info_handle_freeinode(inode, req);
797 break;
798 case OCFS2_INFO_FREEFRAG:
799 if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
800 status = ocfs2_info_handle_freefrag(inode, req);
801 break;
392 default: 802 default:
393 status = ocfs2_info_handle_unknown(inode, req); 803 status = ocfs2_info_handle_unknown(inode, req);
394 break; 804 break;
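The two request types wired up above ride on the existing OCFS2_IOC_INFO
batching mechanism. Below is a minimal userspace sketch of a free-space
fragmentation query; the ocfs2_info/ocfs2_info_request field names are
assumptions based on the pre-existing ioctl conventions (those structs are
not shown in this hunk), so treat it as illustrative rather than canonical.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>
	/* plus the ocfs2_ioctl.h definitions introduced by this patch */

	static int query_freefrag(int fd, __u32 chunksize_clusters)
	{
		struct ocfs2_info_freefrag ff;
		struct ocfs2_info info;
		__u64 req;

		memset(&ff, 0, sizeof(ff));
		ff.iff_req.ir_magic = OCFS2_INFO_MAGIC;
		ff.iff_req.ir_code = OCFS2_INFO_FREEFRAG;
		ff.iff_req.ir_size = sizeof(ff);
		ff.iff_chunksize = chunksize_clusters;	/* must be a power of 2 */

		req = (__u64)(unsigned long)&ff;

		memset(&info, 0, sizeof(info));
		info.oi_requests = (__u64)(unsigned long)&req;
		info.oi_count = 1;

		if (ioctl(fd, OCFS2_IOC_INFO, &info) < 0)
			return -1;

		/* on success, ff.iff_ffs carries the free-chunk histogram */
		return 0;
	}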
@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
542 return -EFAULT; 952 return -EFAULT;
543 953
544 return ocfs2_info_handle(inode, &info, 0); 954 return ocfs2_info_handle(inode, &info, 0);
955 case FITRIM:
956 {
957 struct super_block *sb = inode->i_sb;
958 struct fstrim_range range;
959 int ret = 0;
960
961 if (!capable(CAP_SYS_ADMIN))
962 return -EPERM;
963
964 if (copy_from_user(&range, (struct fstrim_range *)arg,
965 sizeof(range)))
966 return -EFAULT;
967
968 ret = ocfs2_trim_fs(sb, &range);
969 if (ret < 0)
970 return ret;
971
972 if (copy_to_user((struct fstrim_range *)arg, &range,
973 sizeof(range)))
974 return -EFAULT;
975
976 return 0;
977 }
978 case OCFS2_IOC_MOVE_EXT:
979 return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
545 default: 980 default:
546 return -ENOTTY; 981 return -ENOTTY;
547 } 982 }
@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
569 case OCFS2_IOC_GROUP_EXTEND: 1004 case OCFS2_IOC_GROUP_EXTEND:
570 case OCFS2_IOC_GROUP_ADD: 1005 case OCFS2_IOC_GROUP_ADD:
571 case OCFS2_IOC_GROUP_ADD64: 1006 case OCFS2_IOC_GROUP_ADD64:
1007 case FITRIM:
572 break; 1008 break;
573 case OCFS2_IOC_REFLINK: 1009 case OCFS2_IOC_REFLINK:
574 if (copy_from_user(&args, (struct reflink_arguments *)arg, 1010 if (copy_from_user(&args, (struct reflink_arguments *)arg,
@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
584 return -EFAULT; 1020 return -EFAULT;
585 1021
586 return ocfs2_info_handle(inode, &info, 1); 1022 return ocfs2_info_handle(inode, &info, 1);
1023 case OCFS2_IOC_MOVE_EXT:
1024 break;
587 default: 1025 default:
588 return -ENOIOCTLCMD; 1026 return -ENOIOCTLCMD;
589 } 1027 }
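For reference, a minimal userspace sketch of exercising the FITRIM path
added above; the mount point path is hypothetical, and on return the
kernel fills range.len with the number of bytes it trimmed.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range range = {
			.start = 0,
			.len = (__u64)-1,	/* whole filesystem */
			.minlen = 0,		/* no minimum extent length */
		};
		int fd = open("/mnt/ocfs2", O_RDONLY);	/* hypothetical mount */

		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}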
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000000..4c5488468c14
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1153 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.c
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#include <linux/fs.h>
18#include <linux/types.h>
19#include <linux/mount.h>
20#include <linux/swap.h>
21
22#include <cluster/masklog.h>
23
24#include "ocfs2.h"
25#include "ocfs2_ioctl.h"
26
27#include "alloc.h"
28#include "aops.h"
29#include "dlmglue.h"
30#include "extent_map.h"
31#include "inode.h"
32#include "journal.h"
33#include "suballoc.h"
34#include "uptodate.h"
35#include "super.h"
36#include "dir.h"
37#include "buffer_head_io.h"
38#include "sysfile.h"
39#include "suballoc.h"
40#include "refcounttree.h"
41#include "move_extents.h"
42
43struct ocfs2_move_extents_context {
44 struct inode *inode;
45 struct file *file;
46 int auto_defrag;
47 int partial;
48 int credits;
49 u32 new_phys_cpos;
50 u32 clusters_moved;
51 u64 refcount_loc;
52 struct ocfs2_move_extents *range;
53 struct ocfs2_extent_tree et;
54 struct ocfs2_alloc_context *meta_ac;
55 struct ocfs2_alloc_context *data_ac;
56 struct ocfs2_cached_dealloc_ctxt dealloc;
57};
58
59static int __ocfs2_move_extent(handle_t *handle,
60 struct ocfs2_move_extents_context *context,
61 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62 int ext_flags)
63{
64 int ret = 0, index;
65 struct inode *inode = context->inode;
66 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67 struct ocfs2_extent_rec *rec, replace_rec;
68 struct ocfs2_path *path = NULL;
69 struct ocfs2_extent_list *el;
70 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74 p_cpos, new_p_cpos, len);
75 if (ret) {
76 mlog_errno(ret);
77 goto out;
78 }
79
80 memset(&replace_rec, 0, sizeof(replace_rec));
81 replace_rec.e_cpos = cpu_to_le32(cpos);
82 replace_rec.e_leaf_clusters = cpu_to_le16(len);
83 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84 new_p_cpos));
85
86 path = ocfs2_new_path_from_et(&context->et);
87 if (!path) {
88 ret = -ENOMEM;
89 mlog_errno(ret);
90 goto out;
91 }
92
93 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94 if (ret) {
95 mlog_errno(ret);
96 goto out;
97 }
98
99 el = path_leaf_el(path);
100
101 index = ocfs2_search_extent_list(el, cpos);
102 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103 ocfs2_error(inode->i_sb,
104 "Inode %llu has an extent at cpos %u which can no "
105 "longer be found.\n",
106 (unsigned long long)ino, cpos);
107 ret = -EROFS;
108 goto out;
109 }
110
111 rec = &el->l_recs[index];
112
113 BUG_ON(ext_flags != rec->e_flags);
114 /*
115	 * After moving/defragging to the new location, the extent is not
116	 * going to be refcounted anymore.
117 */
118 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121 context->et.et_root_bh,
122 OCFS2_JOURNAL_ACCESS_WRITE);
123 if (ret) {
124 mlog_errno(ret);
125 goto out;
126 }
127
128 ret = ocfs2_split_extent(handle, &context->et, path, index,
129 &replace_rec, context->meta_ac,
130 &context->dealloc);
131 if (ret) {
132 mlog_errno(ret);
133 goto out;
134 }
135
136 ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138 context->new_phys_cpos = new_p_cpos;
139
140 /*
141	 * Do we need to append the old clusters to the truncate log?
142 */
143 if (old_blkno) {
144 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145 ret = ocfs2_decrease_refcount(inode, handle,
146 ocfs2_blocks_to_clusters(osb->sb,
147 old_blkno),
148 len, context->meta_ac,
149 &context->dealloc, 1);
150 else
151 ret = ocfs2_truncate_log_append(osb, handle,
152 old_blkno, len);
153 }
154
155out:
156 return ret;
157}
158
159/*
160 * Lock the allocators and reserve an appropriate number of bits for
161 * metadata blocks and data clusters.
162 *
163 * In some cases we don't need to reserve clusters; just pass a NULL
164 * data_ac.
165 */
166static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167 struct ocfs2_extent_tree *et,
168 u32 clusters_to_move,
169 u32 extents_to_split,
170 struct ocfs2_alloc_context **meta_ac,
171 struct ocfs2_alloc_context **data_ac,
172 int extra_blocks,
173 int *credits)
174{
175 int ret, num_free_extents;
176 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178
179 num_free_extents = ocfs2_num_free_extents(osb, et);
180 if (num_free_extents < 0) {
181 ret = num_free_extents;
182 mlog_errno(ret);
183 goto out;
184 }
185
186 if (!num_free_extents ||
187 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189
190 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191 if (ret) {
192 mlog_errno(ret);
193 goto out;
194 }
195
196 if (data_ac) {
197 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198 if (ret) {
199 mlog_errno(ret);
200 goto out;
201 }
202 }
203
204 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
205 clusters_to_move + 2);
206
207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
208 extra_blocks, clusters_to_move, *credits);
209out:
210 if (ret) {
211 if (*meta_ac) {
212 ocfs2_free_alloc_context(*meta_ac);
213 *meta_ac = NULL;
214 }
215 }
216
217 return ret;
218}
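/*
 * Worked example with assumed numbers: moving 8 clusters while splitting
 * one extent gives max_recs_needed = 2 * 1 + 8 = 10, so on a sparse-alloc
 * fs with fewer than 10 free extent records the function reserves extra
 * metadata blocks up front via ocfs2_extend_meta_needed().
 */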
219
220/*
221 * Use one journal handle to guarantee data consistency in case a
222 * crash happens anywhere.
223 *
224 * XXX: defrag can end up finishing only part of the requested extent,
225 * when not enough contiguous clusters can be found in the allocator.
226 */
227static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
229{
230 int ret, credits = 0, extra_blocks = 0, partial = context->partial;
231 handle_t *handle;
232 struct inode *inode = context->inode;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234 struct inode *tl_inode = osb->osb_tl_inode;
235 struct ocfs2_refcount_tree *ref_tree = NULL;
236 u32 new_phys_cpos, new_len;
237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
238
239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
240
241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
242 OCFS2_HAS_REFCOUNT_FL));
243
244 BUG_ON(!context->refcount_loc);
245
246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
247 &ref_tree, NULL);
248 if (ret) {
249 mlog_errno(ret);
250 return ret;
251 }
252
253 ret = ocfs2_prepare_refcount_change_for_del(inode,
254 context->refcount_loc,
255 phys_blkno,
256 *len,
257 &credits,
258 &extra_blocks);
259 if (ret) {
260 mlog_errno(ret);
261 goto out;
262 }
263 }
264
265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
266 &context->meta_ac,
267 &context->data_ac,
268 extra_blocks, &credits);
269 if (ret) {
270 mlog_errno(ret);
271 goto out;
272 }
273
274 /*
275	 * Should we be using the allocation reservation strategy here?
276 *
277 * if (context->data_ac)
278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
279 */
280
281 mutex_lock(&tl_inode->i_mutex);
282
283 if (ocfs2_truncate_log_needs_flush(osb)) {
284 ret = __ocfs2_flush_truncate_log(osb);
285 if (ret < 0) {
286 mlog_errno(ret);
287 goto out_unlock_mutex;
288 }
289 }
290
291 handle = ocfs2_start_trans(osb, credits);
292 if (IS_ERR(handle)) {
293 ret = PTR_ERR(handle);
294 mlog_errno(ret);
295 goto out_unlock_mutex;
296 }
297
298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
299 &new_phys_cpos, &new_len);
300 if (ret) {
301 mlog_errno(ret);
302 goto out_commit;
303 }
304
305 /*
306	 * Allowing partial extent moving has pros and cons: it makes the
307	 * whole defragmentation less likely to fail, but on the other hand
308	 * it may leave the fs even more fragmented afterwards. Let userspace
309	 * make the call here.
310 */
311 if (new_len != *len) {
312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
313 if (!partial) {
314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
315 ret = -ENOSPC;
316 goto out_commit;
317 }
318 }
319
320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
321 phys_cpos, new_phys_cpos);
322
323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
324 new_phys_cpos, ext_flags);
325 if (ret)
326 mlog_errno(ret);
327
328 if (partial && (new_len != *len))
329 *len = new_len;
330
331 /*
332 * Here we should write the new page out first if we are
333 * in write-back mode.
334 */
335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
336 if (ret)
337 mlog_errno(ret);
338
339out_commit:
340 ocfs2_commit_trans(osb, handle);
341
342out_unlock_mutex:
343 mutex_unlock(&tl_inode->i_mutex);
344
345 if (context->data_ac) {
346 ocfs2_free_alloc_context(context->data_ac);
347 context->data_ac = NULL;
348 }
349
350 if (context->meta_ac) {
351 ocfs2_free_alloc_context(context->meta_ac);
352 context->meta_ac = NULL;
353 }
354
355out:
356 if (ref_tree)
357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
358
359 return ret;
360}
361
362/*
363 * Find the victim alloc group, i.e. the group where #blkno fits.
364 */
365static int ocfs2_find_victim_alloc_group(struct inode *inode,
366 u64 vict_blkno,
367 int type, int slot,
368 int *vict_bit,
369 struct buffer_head **ret_bh)
370{
371 int ret, i, blocks_per_unit = 1;
372 u64 blkno;
373 char namebuf[40];
374
375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
377 struct ocfs2_chain_list *cl;
378 struct ocfs2_chain_rec *rec;
379 struct ocfs2_dinode *ac_dinode;
380 struct ocfs2_group_desc *bg;
381
382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
384 strlen(namebuf), &blkno);
385 if (ret) {
386 ret = -ENOENT;
387 goto out;
388 }
389
390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
391 if (ret) {
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
397 cl = &(ac_dinode->id2.i_chain);
398 rec = &(cl->cl_recs[0]);
399
400 if (type == GLOBAL_BITMAP_SYSTEM_INODE)
401 blocks_per_unit <<= (osb->s_clustersize_bits -
402 inode->i_sb->s_blocksize_bits);
403 /*
404	 * Bail out if 'vict_blkno' is out of the valid range.
405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
408 blocks_per_unit))) {
409 ret = -EINVAL;
410 goto out;
411 }
412
413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
414
415 rec = &(cl->cl_recs[i]);
416 if (!rec)
417 continue;
418
419 bg = NULL;
420
421 do {
422 if (!bg)
423 blkno = le64_to_cpu(rec->c_blkno);
424 else
425 blkno = le64_to_cpu(bg->bg_next_group);
426
427 if (gd_bh) {
428 brelse(gd_bh);
429 gd_bh = NULL;
430 }
431
432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
433 if (ret) {
434 mlog_errno(ret);
435 goto out;
436 }
437
438 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
439
440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
441 le16_to_cpu(bg->bg_bits))) {
442
443 *ret_bh = gd_bh;
444 *vict_bit = (vict_blkno - blkno) /
445 blocks_per_unit;
446				mlog(0, "found the victim group: #%llu, "
447 "total_bits: %u, vict_bit: %u\n",
448 blkno, le16_to_cpu(bg->bg_bits),
449 *vict_bit);
450 goto out;
451 }
452
453 } while (le64_to_cpu(bg->bg_next_group));
454 }
455
456 ret = -EINVAL;
457out:
458 brelse(ac_bh);
459
460 /*
461 * caller has to release the gd_bh properly.
462 */
463 return ret;
464}
465
466/*
467 * XXX: helper to validate and adjust moving goal.
468 */
469static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
470 struct ocfs2_move_extents *range)
471{
472 int ret, goal_bit = 0;
473
474 struct buffer_head *gd_bh = NULL;
475 struct ocfs2_group_desc *bg;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 int c_to_b = 1 << (osb->s_clustersize_bits -
478 inode->i_sb->s_blocksize_bits);
479
480 /*
481	 * Validate that the goal sits within the global_bitmap, and return
482	 * the victim group descriptor.
483 */
484 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
485 GLOBAL_BITMAP_SYSTEM_INODE,
486 OCFS2_INVALID_SLOT,
487 &goal_bit, &gd_bh);
488 if (ret)
489 goto out;
490
491 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
492
493 /*
494	 * Make the goal cluster-aligned.
495 */
496 if (range->me_goal % c_to_b)
497 range->me_goal = range->me_goal / c_to_b * c_to_b;
498
499 /*
500	 * The moving goal is not allowed to start at a group descriptor
501	 * block (blk #0); compromise and use the following cluster instead.
502 */
503 if (range->me_goal == le64_to_cpu(bg->bg_blkno))
504 range->me_goal += c_to_b;
505
506 /*
507	 * The movement is not allowed to cross group boundaries.
508 */
509 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
510 range->me_len) {
511 ret = -EINVAL;
512 goto out;
513 }
514 /*
515	 * More exact validations/adjustments will be performed later,
516	 * during the moving operation for each extent range.
517 */
518	mlog(0, "extents are ready to be moved to block #%llu\n",
519 range->me_goal);
520
521out:
522 brelse(gd_bh);
523
524 return ret;
525}
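/*
 * Illustration of the goal adjustment above, with assumed sizes: a 4K
 * block size and a 32K cluster size give c_to_b = 32K / 4K = 8 blocks
 * per cluster. A me_goal of block 21 is rounded down to 21 / 8 * 8 = 16;
 * if block 16 happens to be the group descriptor block, the goal is
 * bumped to the next cluster at 16 + 8 = 24.
 */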
526
527static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
528 int *goal_bit, u32 move_len, u32 max_hop,
529 u32 *phys_cpos)
530{
531 int i, used, last_free_bits = 0, base_bit = *goal_bit;
532 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
533 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
534 le64_to_cpu(gd->bg_blkno));
535
536 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
537
538 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
539 if (used) {
540 /*
541			 * We even tried searching for a free chunk by
542			 * jumping up to a 'max_hop' distance, but still failed.
543 */
544 if ((i - base_bit) > max_hop) {
545 *phys_cpos = 0;
546 break;
547 }
548
549 if (last_free_bits)
550 last_free_bits = 0;
551
552 continue;
553 } else
554 last_free_bits++;
555
556 if (last_free_bits == move_len) {
557 *goal_bit = i;
558 *phys_cpos = base_cpos + i;
559 break;
560 }
561 }
562
563 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
564}
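/*
 * Illustration of the probe above on an assumed bitmap (1 = used,
 * 0 = free), scanning from *goal_bit = 2 with move_len = 3:
 *
 *	bit:	0 1 2 3 4 5 6 7
 *	bitmap:	1 1 0 0 1 0 0 0
 *
 * The free run at bits 2-3 is cut short by the used bit 4, so
 * last_free_bits resets; the run at bits 5-7 reaches move_len at
 * i = 7, giving *goal_bit = 7 and *phys_cpos = base_cpos + 7. If a
 * used bit further than max_hop from the starting bit is hit first,
 * *phys_cpos is zeroed and the caller fails with -ENOSPC.
 */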
565
566static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
567 handle_t *handle,
568 struct buffer_head *di_bh,
569 u32 num_bits,
570 u16 chain)
571{
572 int ret;
573 u32 tmp_used;
574 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
575 struct ocfs2_chain_list *cl =
576 (struct ocfs2_chain_list *) &di->id2.i_chain;
577
578 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
579 OCFS2_JOURNAL_ACCESS_WRITE);
580 if (ret < 0) {
581 mlog_errno(ret);
582 goto out;
583 }
584
585 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
586 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
587 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
588 ocfs2_journal_dirty(handle, di_bh);
589
590out:
591 return ret;
592}
593
594static inline int ocfs2_block_group_set_bits(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_group_desc *bg,
597 struct buffer_head *group_bh,
598 unsigned int bit_off,
599 unsigned int num_bits)
600{
601 int status;
602 void *bitmap = bg->bg_bitmap;
603 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
604
605 /* All callers get the descriptor via
606 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
607 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
608 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
609
610 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
611 num_bits);
612
613 if (ocfs2_is_cluster_bitmap(alloc_inode))
614 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
615
616 status = ocfs2_journal_access_gd(handle,
617 INODE_CACHE(alloc_inode),
618 group_bh,
619 journal_type);
620 if (status < 0) {
621 mlog_errno(status);
622 goto bail;
623 }
624
625 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
626 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
627 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
628 " count %u but claims %u are freed. num_bits %d",
629 (unsigned long long)le64_to_cpu(bg->bg_blkno),
630 le16_to_cpu(bg->bg_bits),
631 le16_to_cpu(bg->bg_free_bits_count), num_bits);
632 return -EROFS;
633 }
634 while (num_bits--)
635 ocfs2_set_bit(bit_off++, bitmap);
636
637 ocfs2_journal_dirty(handle, group_bh);
638
639bail:
640 return status;
641}
642
643static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
644 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
645 u32 len, int ext_flags)
646{
647 int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
648 handle_t *handle;
649 struct inode *inode = context->inode;
650 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
651 struct inode *tl_inode = osb->osb_tl_inode;
652 struct inode *gb_inode = NULL;
653 struct buffer_head *gb_bh = NULL;
654 struct buffer_head *gd_bh = NULL;
655 struct ocfs2_group_desc *gd;
656 struct ocfs2_refcount_tree *ref_tree = NULL;
657 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
658 context->range->me_threshold);
659 u64 phys_blkno, new_phys_blkno;
660
661 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
662
663 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
664
665 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
666 OCFS2_HAS_REFCOUNT_FL));
667
668 BUG_ON(!context->refcount_loc);
669
670 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
671 &ref_tree, NULL);
672 if (ret) {
673 mlog_errno(ret);
674 return ret;
675 }
676
677 ret = ocfs2_prepare_refcount_change_for_del(inode,
678 context->refcount_loc,
679 phys_blkno,
680 len,
681 &credits,
682 &extra_blocks);
683 if (ret) {
684 mlog_errno(ret);
685 goto out;
686 }
687 }
688
689 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
690 &context->meta_ac,
691 NULL, extra_blocks, &credits);
692 if (ret) {
693 mlog_errno(ret);
694 goto out;
695 }
696
697 /*
698 * need to count 2 extra credits for global_bitmap inode and
699 * group descriptor.
700 */
701 credits += OCFS2_INODE_UPDATE_CREDITS + 1;
702
703 /*
704	 * ocfs2_move_extent() doesn't reserve any clusters in the
705	 * lock_allocators() logic, but we still need to lock the global_bitmap.
706 */
707 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
708 OCFS2_INVALID_SLOT);
709 if (!gb_inode) {
710 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
711 ret = -EIO;
712 goto out;
713 }
714
715 mutex_lock(&gb_inode->i_mutex);
716
717 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
718 if (ret) {
719 mlog_errno(ret);
720 goto out_unlock_gb_mutex;
721 }
722
723 mutex_lock(&tl_inode->i_mutex);
724
725 handle = ocfs2_start_trans(osb, credits);
726 if (IS_ERR(handle)) {
727 ret = PTR_ERR(handle);
728 mlog_errno(ret);
729 goto out_unlock_tl_inode;
730 }
731
732 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
733 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
734 GLOBAL_BITMAP_SYSTEM_INODE,
735 OCFS2_INVALID_SLOT,
736 &goal_bit, &gd_bh);
737 if (ret) {
738 mlog_errno(ret);
739 goto out_commit;
740 }
741
742 /*
743	 * Probe the victim cluster group to find a proper
744	 * region to fit the wanted movement; it will even perform
745	 * a best-effort attempt by compromising to a threshold
746	 * around the goal.
747 */
748 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
749 new_phys_cpos);
750	if (!*new_phys_cpos) {
751 ret = -ENOSPC;
752 goto out_commit;
753 }
754
755 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
756 *new_phys_cpos, ext_flags);
757 if (ret) {
758 mlog_errno(ret);
759 goto out_commit;
760 }
761
762 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
763 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
764 le16_to_cpu(gd->bg_chain));
765 if (ret) {
766 mlog_errno(ret);
767 goto out_commit;
768 }
769
770 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
771 goal_bit, len);
772 if (ret)
773 mlog_errno(ret);
774
775 /*
776 * Here we should write the new page out first if we are
777 * in write-back mode.
778 */
779 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
780 if (ret)
781 mlog_errno(ret);
782
783out_commit:
784 ocfs2_commit_trans(osb, handle);
785 brelse(gd_bh);
786
787out_unlock_tl_inode:
788 mutex_unlock(&tl_inode->i_mutex);
789
790 ocfs2_inode_unlock(gb_inode, 1);
791out_unlock_gb_mutex:
792 mutex_unlock(&gb_inode->i_mutex);
793 brelse(gb_bh);
794 iput(gb_inode);
795
796out:
797 if (context->meta_ac) {
798 ocfs2_free_alloc_context(context->meta_ac);
799 context->meta_ac = NULL;
800 }
801
802 if (ref_tree)
803 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
804
805 return ret;
806}
807
808/*
809 * Helper to calculate the defrag length in one run according to the threshold.
810 */
811static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
812 u32 threshold, int *skip)
813{
814 if ((*alloc_size + *len_defraged) < threshold) {
815 /*
816		 * Proceed with defragmentation until we meet the threshold.
817 */
818 *len_defraged += *alloc_size;
819 } else if (*len_defraged == 0) {
820 /*
821 * XXX: skip a large extent.
822 */
823 *skip = 1;
824 } else {
825 /*
826		 * Split this extent so that it coalesces with the former
827		 * pieces to reach the threshold.
828		 *
829		 * We're done here with one cycle of defragmentation of
830		 * size 'threshold'; resetting 'len_defraged' forces a
831		 * new cycle.
832 */
833 *alloc_size = threshold - *len_defraged;
834 *len_defraged = 0;
835 }
836}
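/*
 * Worked example with an assumed threshold of 8 clusters: extents of
 * 3, 4 and 5 clusters arrive in turn. The first two accumulate
 * (len_defraged becomes 3, then 7); the third is trimmed to
 * 8 - 7 = 1 cluster and len_defraged resets, starting a new cycle.
 * A lone 10-cluster extent (>= threshold with len_defraged == 0) is
 * simply skipped as already large enough.
 */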
837
838static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
839 struct ocfs2_move_extents_context *context)
840{
841 int ret = 0, flags, do_defrag, skip = 0;
842 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
843 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
844
845 struct inode *inode = context->inode;
846 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
847 struct ocfs2_move_extents *range = context->range;
848 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
849
850 if ((inode->i_size == 0) || (range->me_len == 0))
851 return 0;
852
853 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
854 return 0;
855
856 context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
857
858 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
859 ocfs2_init_dealloc_ctxt(&context->dealloc);
860
861 /*
862 * TO-DO XXX:
863 *
864 * - xattr extents.
865 */
866
867 do_defrag = context->auto_defrag;
868
869 /*
870	 * Extent moving happens in units of clusters; for the sake
871	 * of simplicity, we may ignore the two clusters that contain
872	 * 'byte_start' and 'byte_start + len'.
873 */
874 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
875 len_to_move = (range->me_start + range->me_len) >>
876 osb->s_clustersize_bits;
877 if (len_to_move >= move_start)
878 len_to_move -= move_start;
879 else
880 len_to_move = 0;
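	/*
	 * Example with assumed sizes: a 32K cluster size, me_start = 100000
	 * and me_len = 200000 give move_start = 4 (clusters_for_bytes rounds
	 * up) and len_to_move = (300000 >> 15) - move_start = 9 - 4 = 5.
	 */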
881
882 if (do_defrag) {
883 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
884 if (defrag_thresh <= 1)
885 goto done;
886 } else
887 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
888 range->me_goal);
889
890 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
891 "thresh: %u\n",
892 (unsigned long long)OCFS2_I(inode)->ip_blkno,
893 (unsigned long long)range->me_start,
894 (unsigned long long)range->me_len,
895 move_start, len_to_move, defrag_thresh);
896
897 cpos = move_start;
898 while (len_to_move) {
899 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
900 &flags);
901 if (ret) {
902 mlog_errno(ret);
903 goto out;
904 }
905
906 if (alloc_size > len_to_move)
907 alloc_size = len_to_move;
908
909 /*
910 * XXX: how to deal with a hole:
911 *
912 * - skip the hole of course
913 * - force a new defragmentation
914 */
915 if (!phys_cpos) {
916 if (do_defrag)
917 len_defraged = 0;
918
919 goto next;
920 }
921
922 if (do_defrag) {
923 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
924 defrag_thresh, &skip);
925 /*
926 * skip large extents
927 */
928 if (skip) {
929 skip = 0;
930 goto next;
931 }
932
933 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
934 "alloc_size: %u, len_defraged: %u\n",
935 cpos, phys_cpos, alloc_size, len_defraged);
936
937 ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
938 &alloc_size, flags);
939 } else {
940 ret = ocfs2_move_extent(context, cpos, phys_cpos,
941 &new_phys_cpos, alloc_size,
942 flags);
943
944 new_phys_cpos += alloc_size;
945 }
946
947 if (ret < 0) {
948 mlog_errno(ret);
949 goto out;
950 }
951
952 context->clusters_moved += alloc_size;
953next:
954 cpos += alloc_size;
955 len_to_move -= alloc_size;
956 }
957
958done:
959 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
960
961out:
962 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
963 context->clusters_moved);
964 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
965 context->new_phys_cpos);
966
967 ocfs2_schedule_truncate_log_flush(osb, 1);
968 ocfs2_run_deallocs(osb, &context->dealloc);
969
970 return ret;
971}
972
973static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
974{
975 int status;
976 handle_t *handle;
977 struct inode *inode = context->inode;
978 struct ocfs2_dinode *di;
979 struct buffer_head *di_bh = NULL;
980 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
981
982 if (!inode)
983 return -ENOENT;
984
985 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
986 return -EROFS;
987
988 mutex_lock(&inode->i_mutex);
989
990 /*
991 * This prevents concurrent writes from other nodes
992 */
993 status = ocfs2_rw_lock(inode, 1);
994 if (status) {
995 mlog_errno(status);
996 goto out;
997 }
998
999 status = ocfs2_inode_lock(inode, &di_bh, 1);
1000 if (status) {
1001 mlog_errno(status);
1002 goto out_rw_unlock;
1003 }
1004
1005 /*
1006	 * Remember that ip_xattr_sem also needs to be held if necessary.
1007 */
1008 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1009
1010 status = __ocfs2_move_extents_range(di_bh, context);
1011
1012 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1013 if (status) {
1014 mlog_errno(status);
1015 goto out_inode_unlock;
1016 }
1017
1018 /*
1019 * We update ctime for these changes
1020 */
1021 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1022 if (IS_ERR(handle)) {
1023 status = PTR_ERR(handle);
1024 mlog_errno(status);
1025 goto out_inode_unlock;
1026 }
1027
1028 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1029 OCFS2_JOURNAL_ACCESS_WRITE);
1030 if (status) {
1031 mlog_errno(status);
1032 goto out_commit;
1033 }
1034
1035 di = (struct ocfs2_dinode *)di_bh->b_data;
1036 inode->i_ctime = CURRENT_TIME;
1037 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1038 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1039
1040 ocfs2_journal_dirty(handle, di_bh);
1041
1042out_commit:
1043 ocfs2_commit_trans(osb, handle);
1044
1045out_inode_unlock:
1046 brelse(di_bh);
1047 ocfs2_inode_unlock(inode, 1);
1048out_rw_unlock:
1049 ocfs2_rw_unlock(inode, 1);
1050out:
1051 mutex_unlock(&inode->i_mutex);
1052
1053 return status;
1054}
1055
1056int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1057{
1058 int status;
1059
1060 struct inode *inode = filp->f_path.dentry->d_inode;
1061 struct ocfs2_move_extents range;
1062 struct ocfs2_move_extents_context *context = NULL;
1063
1064 status = mnt_want_write(filp->f_path.mnt);
1065 if (status)
1066 return status;
1067
1068 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1069 goto out;
1070
1071 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1072 status = -EPERM;
1073 goto out;
1074 }
1075
1076 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1077 if (!context) {
1078 status = -ENOMEM;
1079 mlog_errno(status);
1080 goto out;
1081 }
1082
1083 context->inode = inode;
1084 context->file = filp;
1085
1086 if (argp) {
1087 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1088 sizeof(range))) {
1089 status = -EFAULT;
1090 goto out;
1091 }
1092 } else {
1093 status = -EINVAL;
1094 goto out;
1095 }
1096
1097 if (range.me_start > i_size_read(inode))
1098 goto out;
1099
1100 if (range.me_start + range.me_len > i_size_read(inode))
1101 range.me_len = i_size_read(inode) - range.me_start;
1102
1103 context->range = &range;
1104
1105 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1106 context->auto_defrag = 1;
1107 /*
1108		 * OK, the default threshold for defragmentation
1109		 * is 1M, since our maximum clustersize is 1M as well.
1110		 * Any thoughts?
1111 */
1112 if (!range.me_threshold)
1113 range.me_threshold = 1024 * 1024;
1114
1115 if (range.me_threshold > i_size_read(inode))
1116 range.me_threshold = i_size_read(inode);
1117
1118 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1119 context->partial = 1;
1120 } else {
1121 /*
1122		 * Make a first, best-effort attempt to validate and adjust
1123		 * the goal (a physical address in blocks). It can't guarantee
1124		 * that the later operation will always succeed, since the
1125		 * global_bitmap may change a bit over time.
1126 */
1127
1128 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1129 if (status)
1130 goto out;
1131 }
1132
1133 status = ocfs2_move_extents(context);
1134 if (status)
1135 mlog_errno(status);
1136out:
1137 /*
1138	 * Movement/defragmentation may end up only partially completed;
1139	 * that's why we need to return the finished length and new_offset
1140	 * to userspace, even if a failure happens somewhere.
1141 */
1142 if (argp) {
1143 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1144 sizeof(range)))
1145 status = -EFAULT;
1146 }
1147
1148 kfree(context);
1149
1150 mnt_drop_write(filp->f_path.mnt);
1151
1152 return status;
1153}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000000..4e143e811441
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.h
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_MOVE_EXTENTS_H
18#define OCFS2_MOVE_EXTENTS_H
19
20int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
21
22#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39bf7438..5b27ff1fa577 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
142 __u64 ij_journal_size; 142 __u64 ij_journal_size;
143}; 143};
144 144
145struct ocfs2_info_freeinode {
146 struct ocfs2_info_request ifi_req;
147 struct ocfs2_info_local_freeinode {
148 __u64 lfi_total;
149 __u64 lfi_free;
150 } ifi_stat[OCFS2_MAX_SLOTS];
151 __u32 ifi_slotnum; /* out */
152 __u32 ifi_pad;
153};
154
155#define OCFS2_INFO_MAX_HIST (32)
156
157struct ocfs2_info_freefrag {
158 struct ocfs2_info_request iff_req;
159 struct ocfs2_info_freefrag_stats { /* (out) */
160 struct ocfs2_info_free_chunk_list {
161 __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
162 __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
163 } ffs_fc_hist;
164 __u32 ffs_clusters;
165 __u32 ffs_free_clusters;
166 __u32 ffs_free_chunks;
167 __u32 ffs_free_chunks_real;
168 __u32 ffs_min; /* Minimum free chunksize in clusters */
169 __u32 ffs_max;
170 __u32 ffs_avg;
171 __u32 ffs_pad;
172 } iff_ffs;
173	__u32 iff_chunksize;	/* chunksize in clusters (in) */
174 __u32 iff_pad;
175};
176
145/* Codes for ocfs2_info_request */ 177/* Codes for ocfs2_info_request */
146enum ocfs2_info_type { 178enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1, 179 OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
151 OCFS2_INFO_UUID, 183 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES, 184 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE, 185 OCFS2_INFO_JOURNAL_SIZE,
186 OCFS2_INFO_FREEINODE,
187 OCFS2_INFO_FREEFRAG,
154 OCFS2_INFO_NUM_TYPES 188 OCFS2_INFO_NUM_TYPES
155}; 189};
156 190
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
171 205
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) 206#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173 207
208struct ocfs2_move_extents {
209/* All values are in bytes */
210 /* in */
211 __u64 me_start; /* Virtual start in the file to move */
212 __u64 me_len; /* Length of the extents to be moved */
213	__u64 me_goal;		/* Physical offset of the goal,
214				   in block units */
215 __u64 me_threshold; /* Maximum distance from goal or threshold
216 for auto defragmentation */
217 __u64 me_flags; /* Flags for the operation:
218 * - auto defragmentation.
219 * - refcount,xattr cases.
220 */
221 /* out */
222	__u64 me_moved_len;	/* Moved/defragged length */
223 __u64 me_new_offset; /* Resulting physical location */
224	__u32 me_reserved[2];	/* Reserved for future use */
225};
226
227#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG	(0x00000001)	/* Kernel is free to
228							   claim new clusters
229							   as the goal for the
230							   extent move */
231#define OCFS2_MOVE_EXT_FL_PART_DEFRAG	(0x00000002)	/* Allow partial extent
232							   moving; makes movement
233							   less likely to fail,
234							   but may leave the fs
235							   even more fragmented */
236#define OCFS2_MOVE_EXT_FL_COMPLETE	(0x00000004)	/* Move or defragmentation
237							   completed in full.
238							 */
239
240#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
241
174#endif /* OCFS2_IOCTL_H */ 242#endif /* OCFS2_IOCTL_H */
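A minimal userspace sketch of driving the new OCFS2_IOC_MOVE_EXT interface
in auto-defrag mode follows. The file path comes from argv, error handling
is deliberately thin, and the include of the definitions above is assumed;
it is an illustration of the interface, not a reference tool.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/types.h>
	/* plus the struct ocfs2_move_extents and OCFS2_* definitions above */

	int main(int argc, char **argv)
	{
		struct ocfs2_move_extents me;
		int fd;

		if (argc < 2)
			return 1;

		fd = open(argv[1], O_RDWR);	/* a file on an ocfs2 mount */
		if (fd < 0) {
			perror("open");
			return 1;
		}

		memset(&me, 0, sizeof(me));
		me.me_start = 0;		/* virtual start, in bytes */
		me.me_len = (__u64)-1;		/* kernel clamps this to i_size */
		me.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
			      OCFS2_MOVE_EXT_FL_PART_DEFRAG;
		/* me_threshold left at 0: the kernel defaults it to 1M */

		if (ioctl(fd, OCFS2_IOC_MOVE_EXT, &me) < 0)
			perror("OCFS2_IOC_MOVE_EXT");

		printf("moved %llu bytes, complete=%d\n",
		       (unsigned long long)me.me_moved_len,
		       !!(me.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE));
		close(fd);
		return 0;
	}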
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5bb54ac..3b481f490633 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
688 __entry->blkno, __entry->bit) 688 __entry->blkno, __entry->bit)
689); 689);
690 690
691TRACE_EVENT(ocfs2_trim_extent,
692 TP_PROTO(struct super_block *sb, unsigned long long blk,
693 unsigned long long count),
694 TP_ARGS(sb, blk, count),
695 TP_STRUCT__entry(
696 __field(int, dev_major)
697 __field(int, dev_minor)
698 __field(unsigned long long, blk)
699 __field(__u64, count)
700 ),
701 TP_fast_assign(
702 __entry->dev_major = MAJOR(sb->s_dev);
703 __entry->dev_minor = MINOR(sb->s_dev);
704 __entry->blk = blk;
705 __entry->count = count;
706 ),
707 TP_printk("%d %d %llu %llu",
708 __entry->dev_major, __entry->dev_minor,
709 __entry->blk, __entry->count)
710);
711
712DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
713
714DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
715
691/* End of trace events for fs/ocfs2/alloc.c. */ 716/* End of trace events for fs/ocfs2/alloc.c. */
692 717
693/* Trace events for fs/ocfs2/localalloc.c. */ 718/* Trace events for fs/ocfs2/localalloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3c7606cff1ab..ebfd3825f12a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 66 u32 *num_clusters,
67 unsigned int *extent_flags); 67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context, 69 struct file *file,
70 u32 cpos, u32 old_cluster, 70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 71 u32 new_cluster, u32 new_len);
72}; 72};
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2921 return 0; 2921 return 0;
2922} 2922}
2923 2923
2924static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2924int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct ocfs2_cow_context *context, 2925 struct file *file,
2926 u32 cpos, u32 old_cluster, 2926 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2927 u32 new_cluster, u32 new_len)
2928{ 2928{
2929 int ret = 0, partial; 2929 int ret = 0, partial;
2930 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2930 struct inode *inode = file->f_path.dentry->d_inode;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2931 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to, readahead_pages; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = inode->i_mapping;
2938 2939
2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, 2940 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2940 new_cluster, new_len); 2941 new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2948 * We only duplicate pages until we reach the page contains i_size - 1. 2949 * We only duplicate pages until we reach the page contains i_size - 1.
2949 * So trim 'end' to i_size. 2950 * So trim 'end' to i_size.
2950 */ 2951 */
2951 if (end > i_size_read(context->inode)) 2952 if (end > i_size_read(inode))
2952 end = i_size_read(context->inode); 2953 end = i_size_read(inode);
2953 2954
2954 while (offset < end) { 2955 while (offset < end) {
2955 page_index = offset >> PAGE_CACHE_SHIFT; 2956 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2972 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2973 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2974 2975
2975 if (PageReadahead(page) && context->file) { 2976 if (PageReadahead(page)) {
2976 page_cache_async_readahead(mapping, 2977 page_cache_async_readahead(mapping,
2977 &context->file->f_ra, 2978 &file->f_ra, file,
2978 context->file,
2979 page, page_index, 2979 page, page_index,
2980 readahead_pages); 2980 readahead_pages);
2981 } 2981 }
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2999 }
3000 } 3000 }
3001 3001
3002 ocfs2_map_and_dirty_page(context->inode, 3002 ocfs2_map_and_dirty_page(inode, handle, from, to,
3003 handle, from, to,
3004 page, 0, &new_block); 3003 page, 0, &new_block);
3005 mark_page_accessed(page); 3004 mark_page_accessed(page);
3006unlock: 3005unlock:
@@ -3015,14 +3014,15 @@ unlock:
3015 return ret; 3014 return ret;
3016} 3015}
3017 3016
3018static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3019 struct ocfs2_cow_context *context, 3018 struct file *file,
3020 u32 cpos, u32 old_cluster, 3019 u32 cpos, u32 old_cluster,
3021 u32 new_cluster, u32 new_len) 3020 u32 new_cluster, u32 new_len)
3022{ 3021{
3023 int ret = 0; 3022 int ret = 0;
3024 struct super_block *sb = context->inode->i_sb; 3023 struct inode *inode = file->f_path.dentry->d_inode;
3025 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3024 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3145
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context, cpos, 3148 ret = context->cow_duplicate_clusters(handle, context->file,
3149 old, new, len); 3149 cpos, old, new, len);
3150 if (ret) { 3150 if (ret) {
3151 mlog_errno(ret); 3151 mlog_errno(ret);
3152 goto out; 3152 goto out;
@@ -3162,22 +3162,22 @@ out:
3162 return ret; 3162 return ret;
3163} 3163}
3164 3164
3165static int ocfs2_cow_sync_writeback(struct super_block *sb, 3165int ocfs2_cow_sync_writeback(struct super_block *sb,
3166 struct ocfs2_cow_context *context, 3166 struct inode *inode,
3167 u32 cpos, u32 num_clusters) 3167 u32 cpos, u32 num_clusters)
3168{ 3168{
3169 int ret = 0; 3169 int ret = 0;
3170 loff_t offset, end, map_end; 3170 loff_t offset, end, map_end;
3171 pgoff_t page_index; 3171 pgoff_t page_index;
3172 struct page *page; 3172 struct page *page;
3173 3173
3174 if (ocfs2_should_order_data(context->inode)) 3174 if (ocfs2_should_order_data(inode))
3175 return 0; 3175 return 0;
3176 3176
3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179 3179
3180 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3180 ret = filemap_fdatawrite_range(inode->i_mapping,
3181 offset, end - 1); 3181 offset, end - 1);
3182 if (ret < 0) { 3182 if (ret < 0) {
3183 mlog_errno(ret); 3183 mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3190 if (map_end > end) 3190 if (map_end > end)
3191 map_end = end; 3191 map_end = end;
3192 3192
3193 page = find_or_create_page(context->inode->i_mapping, 3193 page = find_or_create_page(inode->i_mapping,
3194 page_index, GFP_NOFS); 3194 page_index, GFP_NOFS);
3195 BUG_ON(!page); 3195 BUG_ON(!page);
3196 3196
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3349 * in write-back mode. 3349 * in write-back mode.
3350 */ 3350 */
3351 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos, 3352 ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353 orig_num_clusters); 3353 orig_num_clusters);
3354 if (ret) 3354 if (ret)
3355 mlog_errno(ret); 3355 mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7d8e3..7754608c83a4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
84 struct buffer_head *ref_root_bh, 84 struct buffer_head *ref_root_bh,
85 u32 cpos, u32 write_len, 85 u32 cpos, u32 write_len,
86 struct ocfs2_post_refcount *post); 86 struct ocfs2_post_refcount *post);
87int ocfs2_duplicate_clusters_by_page(handle_t *handle,
88 struct file *file,
89 u32 cpos, u32 old_cluster,
90 u32 new_cluster, u32 new_len);
91int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
92 struct file *file,
93 u32 cpos, u32 old_cluster,
94 u32 new_cluster, u32 new_len);
95int ocfs2_cow_sync_writeback(struct super_block *sb,
96 struct inode *inode,
97 u32 cpos, u32 num_clusters);
87int ocfs2_add_refcount_flag(struct inode *inode, 98int ocfs2_add_refcount_flag(struct inode *inode,
88 struct ocfs2_extent_tree *data_et, 99 struct ocfs2_extent_tree *data_et,
89 struct ocfs2_caching_info *ref_ci, 100 struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5a521c748859..cdbaf5e97308 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/cleancache.h>
44 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include "ocfs2_trace.h" 47#include "ocfs2_trace.h"
@@ -1566,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1566 if (osb->preferred_slot != OCFS2_INVALID_SLOT) 1567 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
1567 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); 1568 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
1568 1569
1569 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 1570 if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
1570 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 1571 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
1571 1572
1572 if (osb->osb_commit_interval) 1573 if (osb->osb_commit_interval)
@@ -2352,6 +2353,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2352 mlog_errno(status); 2353 mlog_errno(status);
2353 goto bail; 2354 goto bail;
2354 } 2355 }
2356 cleancache_init_shared_fs((char *)&uuid_net_key, sb);
2355 2357
2356bail: 2358bail:
2357 return status; 2359 return status;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index de4ff29f1e05..c368360c35a1 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
240 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret; 241 int ret;
242 242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) 243
244 return -ENOTEMPTY; 244 if (S_ISDIR(inode->i_mode)) {
245 dentry_unhash(dentry);
246 if (!omfs_dir_is_empty(inode))
247 return -ENOTEMPTY;
248 }
245 249
246 ret = omfs_delete_entry(dentry); 250 ret = omfs_delete_entry(dentry);
247 if (ret) 251 if (ret)
@@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
378 int err; 382 int err;
379 383
380 if (new_inode) { 384 if (new_inode) {
385 if (S_ISDIR(new_inode->i_mode))
386 dentry_unhash(new_dentry);
387
381 /* overwriting existing file/dir */ 388 /* overwriting existing file/dir */
382 err = omfs_remove(new_dir, new_dentry); 389 err = omfs_remove(new_dir, new_dentry);
383 if (err) 390 if (err)
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index df434c5f28fb..c1c729335924 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -20,6 +20,7 @@ proc-y += stat.o
20proc-y += uptime.o 20proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o
23proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
24proc-$(CONFIG_NET) += proc_net.o 25proc-$(CONFIG_NET) += proc_net.o
25proc-$(CONFIG_PROC_KCORE) += kcore.o 26proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa532730e55..dc8bca72b002 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode)
600 return allowed; 600 return allowed;
601} 601}
602 602
603static int proc_setattr(struct dentry *dentry, struct iattr *attr) 603int proc_setattr(struct dentry *dentry, struct iattr *attr)
604{ 604{
605 int error; 605 int error;
606 struct inode *inode = dentry->d_inode; 606 struct inode *inode = dentry->d_inode;
@@ -1736,8 +1736,7 @@ static int task_dumpable(struct task_struct *task)
1736 return 0; 1736 return 0;
1737} 1737}
1738 1738
1739 1739struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1740static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1741{ 1740{
1742 struct inode * inode; 1741 struct inode * inode;
1743 struct proc_inode *ei; 1742 struct proc_inode *ei;
@@ -1779,7 +1778,7 @@ out_unlock:
1779 return NULL; 1778 return NULL;
1780} 1779}
1781 1780
1782static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 1781int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1783{ 1782{
1784 struct inode *inode = dentry->d_inode; 1783 struct inode *inode = dentry->d_inode;
1785 struct task_struct *task; 1784 struct task_struct *task;
@@ -1820,7 +1819,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1820 * made this apply to all per process world readable and executable 1819 * made this apply to all per process world readable and executable
1821 * directories. 1820 * directories.
1822 */ 1821 */
1823static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1822int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1824{ 1823{
1825 struct inode *inode; 1824 struct inode *inode;
1826 struct task_struct *task; 1825 struct task_struct *task;
@@ -1862,7 +1861,7 @@ static int pid_delete_dentry(const struct dentry * dentry)
1862 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; 1861 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1863} 1862}
1864 1863
1865static const struct dentry_operations pid_dentry_operations = 1864const struct dentry_operations pid_dentry_operations =
1866{ 1865{
1867 .d_revalidate = pid_revalidate, 1866 .d_revalidate = pid_revalidate,
1868 .d_delete = pid_delete_dentry, 1867 .d_delete = pid_delete_dentry,
@@ -1870,9 +1869,6 @@ static const struct dentry_operations pid_dentry_operations =
1870 1869
1871/* Lookups */ 1870/* Lookups */
1872 1871
1873typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1874 struct task_struct *, const void *);
1875
1876/* 1872/*
1877 * Fill a directory entry. 1873 * Fill a directory entry.
1878 * 1874 *
@@ -1885,8 +1881,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1885 * reported by readdir in sync with the inode numbers reported 1881 * reported by readdir in sync with the inode numbers reported
1886 * by stat. 1882 * by stat.
1887 */ 1883 */
1888static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1884int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1889 char *name, int len, 1885 const char *name, int len,
1890 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1886 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1891{ 1887{
1892 struct dentry *child, *dir = filp->f_path.dentry; 1888 struct dentry *child, *dir = filp->f_path.dentry;
@@ -2820,6 +2816,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2820 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 2816 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2821 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2817 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2822 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2818 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2819 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2823#ifdef CONFIG_NET 2820#ifdef CONFIG_NET
2824 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2821 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2825#endif 2822#endif
@@ -3168,6 +3165,7 @@ out_no_task:
3168static const struct pid_entry tid_base_stuff[] = { 3165static const struct pid_entry tid_base_stuff[] = {
3169 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 3166 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3170 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 3167 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3168 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3171 REG("environ", S_IRUSR, proc_environ_operations), 3169 REG("environ", S_IRUSR, proc_environ_operations),
3172 INF("auxv", S_IRUSR, proc_pid_auxv), 3170 INF("auxv", S_IRUSR, proc_pid_auxv),
3173 ONE("status", S_IRUGO, proc_pid_status), 3171 ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d15aa1b1cc8f..74b48cfa1bb2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head; 30 struct ctl_table_header *head;
31 const struct proc_ns_operations *ns_ops;
31 32
32 truncate_inode_pages(&inode->i_data, 0); 33 truncate_inode_pages(&inode->i_data, 0);
33 end_writeback(inode); 34 end_writeback(inode);
@@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode)
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); 45 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head); 46 sysctl_head_put(head);
46 } 47 }
48 /* Release any associated namespace */
49 ns_ops = PROC_I(inode)->ns_ops;
50 if (ns_ops && ns_ops->put)
51 ns_ops->put(PROC_I(inode)->ns);
47} 52}
48 53
49static struct kmem_cache * proc_inode_cachep; 54static struct kmem_cache * proc_inode_cachep;
@@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
62 ei->pde = NULL; 67 ei->pde = NULL;
63 ei->sysctl = NULL; 68 ei->sysctl = NULL;
64 ei->sysctl_entry = NULL; 69 ei->sysctl_entry = NULL;
70 ei->ns = NULL;
71 ei->ns_ops = NULL;
65 inode = &ei->vfs_inode; 72 inode = &ei->vfs_inode;
66 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 73 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
67 return inode; 74 return inode;
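
The release added to proc_evict_inode() above is the mirror image of proc_ns_instantiate() in fs/proc/namespaces.c, which takes the reference via ns_ops->get() and stores it in the proc inode. A sketch of the operations table this pairing assumes — the field names follow this era's include/linux/proc_fs.h but are reproduced from memory, so treat them as illustrative:

struct task_struct;
struct nsproxy;

struct proc_ns_operations {
	const char *name;	/* entry name under /proc/<pid>/ns */
	int type;		/* matching CLONE_NEW* flag */
	void *(*get)(struct task_struct *task);	/* counted reference, or NULL */
	void (*put)(void *ns);			/* drops the reference from ->get() */
	int (*install)(struct nsproxy *nsproxy, void *ns);
};
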
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3763b436e69d..7838e5cfec14 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -127,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
127 */ 127 */
128int proc_readdir(struct file *, void *, filldir_t); 128int proc_readdir(struct file *, void *, filldir_t);
129struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); 129struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
130
131
132
133/* Lookups */
134typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
135 struct task_struct *, const void *);
136int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
137 const char *name, int len,
138 instantiate_t instantiate, struct task_struct *task, const void *ptr);
139int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
140struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
141extern const struct dentry_operations pid_dentry_operations;
142int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
143int proc_setattr(struct dentry *dentry, struct iattr *attr);
144
145extern const struct inode_operations proc_ns_dir_inode_operations;
146extern const struct file_operations proc_ns_dir_operations;
147
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 000000000000..781dec5bd682
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,198 @@
1#include <linux/proc_fs.h>
2#include <linux/nsproxy.h>
3#include <linux/sched.h>
4#include <linux/ptrace.h>
5#include <linux/fs_struct.h>
6#include <linux/mount.h>
7#include <linux/path.h>
8#include <linux/namei.h>
9#include <linux/file.h>
10#include <linux/utsname.h>
11#include <net/net_namespace.h>
12#include <linux/mnt_namespace.h>
13#include <linux/ipc_namespace.h>
14#include <linux/pid_namespace.h>
15#include "internal.h"
16
17
18static const struct proc_ns_operations *ns_entries[] = {
19#ifdef CONFIG_NET_NS
20 &netns_operations,
21#endif
22#ifdef CONFIG_UTS_NS
23 &utsns_operations,
24#endif
25#ifdef CONFIG_IPC_NS
26 &ipcns_operations,
27#endif
28};
29
30static const struct file_operations ns_file_operations = {
31 .llseek = no_llseek,
32};
33
34static struct dentry *proc_ns_instantiate(struct inode *dir,
35 struct dentry *dentry, struct task_struct *task, const void *ptr)
36{
37 const struct proc_ns_operations *ns_ops = ptr;
38 struct inode *inode;
39 struct proc_inode *ei;
40 struct dentry *error = ERR_PTR(-ENOENT);
41
42 inode = proc_pid_make_inode(dir->i_sb, task);
43 if (!inode)
44 goto out;
45
46 ei = PROC_I(inode);
47 inode->i_mode = S_IFREG|S_IRUSR;
48 inode->i_fop = &ns_file_operations;
49 ei->ns_ops = ns_ops;
50 ei->ns = ns_ops->get(task);
51 if (!ei->ns)
52 goto out_iput;
53
54 dentry->d_op = &pid_dentry_operations;
55 d_add(dentry, inode);
 56 /* Close the race where the process dies before we return the dentry */
57 if (pid_revalidate(dentry, NULL))
58 error = NULL;
59out:
60 return error;
61out_iput:
62 iput(inode);
63 goto out;
64}
65
66static int proc_ns_fill_cache(struct file *filp, void *dirent,
67 filldir_t filldir, struct task_struct *task,
68 const struct proc_ns_operations *ops)
69{
70 return proc_fill_cache(filp, dirent, filldir,
71 ops->name, strlen(ops->name),
72 proc_ns_instantiate, task, ops);
73}
74
75static int proc_ns_dir_readdir(struct file *filp, void *dirent,
76 filldir_t filldir)
77{
78 int i;
79 struct dentry *dentry = filp->f_path.dentry;
80 struct inode *inode = dentry->d_inode;
81 struct task_struct *task = get_proc_task(inode);
82 const struct proc_ns_operations **entry, **last;
83 ino_t ino;
84 int ret;
85
86 ret = -ENOENT;
87 if (!task)
88 goto out_no_task;
89
90 ret = -EPERM;
91 if (!ptrace_may_access(task, PTRACE_MODE_READ))
92 goto out;
93
94 ret = 0;
95 i = filp->f_pos;
96 switch (i) {
97 case 0:
98 ino = inode->i_ino;
99 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
100 goto out;
101 i++;
102 filp->f_pos++;
103 /* fall through */
104 case 1:
105 ino = parent_ino(dentry);
106 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
107 goto out;
108 i++;
109 filp->f_pos++;
110 /* fall through */
111 default:
112 i -= 2;
113 if (i >= ARRAY_SIZE(ns_entries)) {
114 ret = 1;
115 goto out;
116 }
117 entry = ns_entries + i;
118 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
119 while (entry <= last) {
120 if (proc_ns_fill_cache(filp, dirent, filldir,
121 task, *entry) < 0)
122 goto out;
123 filp->f_pos++;
124 entry++;
125 }
126 }
127
128 ret = 1;
129out:
130 put_task_struct(task);
131out_no_task:
132 return ret;
133}
134
135const struct file_operations proc_ns_dir_operations = {
136 .read = generic_read_dir,
137 .readdir = proc_ns_dir_readdir,
138};
139
140static struct dentry *proc_ns_dir_lookup(struct inode *dir,
141 struct dentry *dentry, struct nameidata *nd)
142{
143 struct dentry *error;
144 struct task_struct *task = get_proc_task(dir);
145 const struct proc_ns_operations **entry, **last;
146 unsigned int len = dentry->d_name.len;
147
148 error = ERR_PTR(-ENOENT);
149
150 if (!task)
151 goto out_no_task;
152
153 error = ERR_PTR(-EPERM);
154 if (!ptrace_may_access(task, PTRACE_MODE_READ))
155 goto out;
156
157 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
158 for (entry = ns_entries; entry <= last; entry++) {
159 if (strlen((*entry)->name) != len)
160 continue;
161 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
162 break;
163 }
164 error = ERR_PTR(-ENOENT);
165 if (entry > last)
166 goto out;
167
168 error = proc_ns_instantiate(dir, dentry, task, *entry);
169out:
170 put_task_struct(task);
171out_no_task:
172 return error;
173}
174
175const struct inode_operations proc_ns_dir_inode_operations = {
176 .lookup = proc_ns_dir_lookup,
177 .getattr = pid_getattr,
178 .setattr = proc_setattr,
179};
180
181struct file *proc_ns_fget(int fd)
182{
183 struct file *file;
184
185 file = fget(fd);
186 if (!file)
187 return ERR_PTR(-EBADF);
188
189 if (file->f_op != &ns_file_operations)
190 goto out_invalid;
191
192 return file;
193
194out_invalid:
195 fput(file);
196 return ERR_PTR(-EINVAL);
197}
198
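
proc_ns_fget() above is the validation hook for the companion setns() system call merged around the same time: setns() fetches the descriptor, checks that its f_op is ns_file_operations, and only then installs the referenced namespace. A hedged userspace sketch of the intended use — glibc grew a setns() wrapper only later, so a raw syscall is used, and the syscall number below is the x86-64 one:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_setns
#define __NR_setns 308	/* x86-64; adjust for other architectures */
#endif

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s/ns/uts", argv[1]);
	fd = open(path, O_RDONLY);	/* needs ptrace-read access to the task */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (syscall(__NR_setns, fd, 0) < 0)	/* 0 = accept any namespace type */
		perror("setns");
	close(fd);
	return 0;
}
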
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2c9db29ea358..db15935fa757 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211{ 211{
212 struct mm_struct *mm = vma->vm_mm; 212 struct mm_struct *mm = vma->vm_mm;
213 struct file *file = vma->vm_file; 213 struct file *file = vma->vm_file;
214 int flags = vma->vm_flags; 214 vm_flags_t flags = vma->vm_flags;
215 unsigned long ino = 0; 215 unsigned long ino = 0;
216 unsigned long long pgoff = 0; 216 unsigned long long pgoff = 0;
217 unsigned long start, end; 217 unsigned long start, end;
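
The one-line type change above is not cosmetic: vm_flags_t is an unsigned long, so on 64-bit kernels an int copy would silently drop any VM flag bits at position 32 or above. A trivial standalone illustration of the truncation (assumes an LP64 system):

#include <stdio.h>

int main(void)
{
	unsigned long vm_flags = 1UL << 32;	/* hypothetical high flag bit */
	int as_int = vm_flags;			/* the old pattern: bit lost */
	unsigned long as_ulong = vm_flags;	/* the vm_flags_t pattern */

	printf("as int: %d, as unsigned long: %#lx\n", as_int, as_ulong);
	return 0;
}
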
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 118662690cdf..76c8164d5651 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
831 INITIALIZE_PATH(path); 831 INITIALIZE_PATH(path);
832 struct reiserfs_dir_entry de; 832 struct reiserfs_dir_entry de;
833 833
834 dentry_unhash(dentry);
835
834 /* we will be doing 2 balancings and update 2 stat data, we change quotas 836 /* we will be doing 2 balancings and update 2 stat data, we change quotas
835 * of the owner of the directory and of the owner of the parent directory. 837 * of the owner of the directory and of the owner of the parent directory.
836 * The quota structure is possibly deleted only on last iput => outside 838 * The quota structure is possibly deleted only on last iput => outside
@@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1225 unsigned long savelink = 1; 1227 unsigned long savelink = 1;
1226 struct timespec ctime; 1228 struct timespec ctime;
1227 1229
1230 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1231 dentry_unhash(new_dentry);
1232
1228 /* three balancings: (1) old name removal, (2) new name insertion 1233 /* three balancings: (1) old name removal, (2) new name insertion
1229 and (3) maybe "save" link insertion 1234 and (3) maybe "save" link insertion
1230 stat data updates: (1) old directory, 1235 stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 47d2a4498b03..50f1abccd1cd 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
105 mutex_unlock(&dentry->d_inode->i_mutex); 105 mutex_unlock(&dentry->d_inode->i_mutex);
106 if (!error) 106 if (!error)
107 d_delete(dentry); 107 d_delete(dentry);
108 dput(dentry);
109 108
110 return error; 109 return error;
111} 110}
diff --git a/fs/super.c b/fs/super.c
index c04f7e0b7ed2..c75593953c52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -31,6 +31,7 @@
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h>
34#include "internal.h" 35#include "internal.h"
35 36
36 37
@@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
112 s->s_maxbytes = MAX_NON_LFS; 113 s->s_maxbytes = MAX_NON_LFS;
113 s->s_op = &default_op; 114 s->s_op = &default_op;
114 s->s_time_gran = 1000000000; 115 s->s_time_gran = 1000000000;
116 s->cleancache_poolid = -1;
115 } 117 }
116out: 118out:
117 return s; 119 return s;
@@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s)
177{ 179{
178 struct file_system_type *fs = s->s_type; 180 struct file_system_type *fs = s->s_type;
179 if (atomic_dec_and_test(&s->s_active)) { 181 if (atomic_dec_and_test(&s->s_active)) {
182 cleancache_flush_fs(s);
180 fs->kill_sb(s); 183 fs->kill_sb(s);
181 /* 184 /*
182 * We need to call rcu_barrier so all the delayed rcu free 185 * We need to call rcu_barrier so all the delayed rcu free
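
The two hooks above bracket a superblock's cleancache lifetime: alloc_super() starts the pool id out invalid, and cleancache_flush_fs() tears the pool down on final deactivation. How a backend plugs into this is defined by the ops table described in Documentation/vm/cleancache.txt from this same series; the sketch below reproduces its shape from memory, so the field names should be checked against that file:

/* my_* callbacks are hypothetical backend functions, not a real driver. */
static struct cleancache_ops my_backend_ops = {
	.init_fs	= my_init_fs,		/* hands out a pool id per sb */
	.init_shared_fs	= my_init_shared_fs,	/* ocfs2-style shared pools */
	.get_page	= my_get_page,		/* a miss is fine; caller reads disk */
	.put_page	= my_put_page,		/* ephemeral: backend may drop pages */
	.flush_page	= my_flush_page,
	.flush_inode	= my_flush_inode,
	.flush_fs	= my_flush_fs,		/* reached via cleancache_flush_fs() */
};

Registration goes through cleancache_register_ops(), which hands back the previously installed ops so backends can chain.
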
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..e2cc6756f3b1 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
196 struct inode *inode = dentry->d_inode; 196 struct inode *inode = dentry->d_inode;
197 int err = -ENOTEMPTY; 197 int err = -ENOTEMPTY;
198 198
199 dentry_unhash(dentry);
200
199 if (sysv_empty_dir(inode)) { 201 if (sysv_empty_dir(inode)) {
200 err = sysv_unlink(dir, dentry); 202 err = sysv_unlink(dir, dentry);
201 if (!err) { 203 if (!err) {
@@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
222 struct sysv_dir_entry * old_de; 224 struct sysv_dir_entry * old_de;
223 int err = -ENOENT; 225 int err = -ENOENT;
224 226
227 if (new_inode && S_ISDIR(new_inode->i_mode))
228 dentry_unhash(new_dentry);
229
225 old_de = sysv_find_entry(old_dentry, &old_page); 230 old_de = sysv_find_entry(old_dentry, &old_page);
226 if (!old_de) 231 if (!old_de)
227 goto out; 232 goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef5abd38f0bf..c2b80943560d 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
656 struct ubifs_inode *dir_ui = ubifs_inode(dir); 656 struct ubifs_inode *dir_ui = ubifs_inode(dir);
657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
658 658
659 dentry_unhash(dentry);
660
659 /* 661 /*
660 * Budget request settings: deletion direntry, deletion inode and 662 * Budget request settings: deletion direntry, deletion inode and
661 * changing the parent inode. If budgeting fails, go ahead anyway 663 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
976 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
977 struct timespec time; 979 struct timespec time;
978 980
981 if (new_inode && S_ISDIR(new_inode->i_mode))
982 dentry_unhash(new_dentry);
983
979 /* 984 /*
980 * Budget request settings: deletion direntry, new direntry, removing 985 * Budget request settings: deletion direntry, new direntry, removing
981 * the old inode, and changing old and new parent directory inodes. 986 * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef96..4d76594c2a8f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
785 785
786 dentry_unhash(dentry);
787
786 retval = -ENOENT; 788 retval = -ENOENT;
787 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 789 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
788 if (!fi) 790 if (!fi)
@@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1081 struct kernel_lb_addr tloc; 1083 struct kernel_lb_addr tloc;
1082 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1084 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1083 1085
1086 if (new_inode && S_ISDIR(new_inode->i_mode))
1087 dentry_unhash(new_dentry);
1088
1084 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1089 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1085 if (ofi) { 1090 if (ofi) {
1086 if (ofibh.sbh != ofibh.ebh) 1091 if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 29309e25417f..953ebdfc5bf7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
258 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
259 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
260 260
261 dentry_unhash(dentry);
262
261 lock_ufs(dir->i_sb); 263 lock_ufs(dir->i_sb);
262 if (ufs_empty_dir (inode)) { 264 if (ufs_empty_dir (inode)) {
263 err = ufs_unlink(dir, dentry); 265 err = ufs_unlink(dir, dentry);
@@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
282 struct ufs_dir_entry *old_de; 284 struct ufs_dir_entry *old_de;
283 int err = -ENOENT; 285 int err = -ENOENT;
284 286
287 if (new_inode && S_ISDIR(new_inode->i_mode))
288 dentry_unhash(new_dentry);
289
285 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 290 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
286 if (!old_de) 291 if (!old_de)
287 goto out; 292 goto out;
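
The same two-part idiom recurs across this series' namei changes (reiserfs, sysv, ubifs, udf, ufs above, and others earlier in the diff): rmdir unhashes its own dentry unconditionally, while rename unhashes the target only when it is about to replace a directory. Condensed into one place, with hypothetical foofs_* names standing in for the per-filesystem callbacks:

/* Sketch only: foofs_* are stand-ins and the bodies elide the real work. */
static int foofs_rmdir(struct inode *dir, struct dentry *dentry)
{
	dentry_unhash(dentry);		/* always, before tearing the dir down */
	/* ... per-filesystem emptiness check and removal ... */
	return 0;
}

static int foofs_rename(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *new_inode = new_dentry->d_inode;

	if (new_inode && S_ISDIR(new_inode->i_mode))
		dentry_unhash(new_dentry);	/* only when replacing a directory */
	/* ... per-filesystem rename ... */
	return 0;
}
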
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index d61611c88012..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -191,3 +191,32 @@ xfs_ioc_trim(
191 return -XFS_ERROR(EFAULT); 191 return -XFS_ERROR(EFAULT);
192 return 0; 192 return 0;
193} 193}
194
195int
196xfs_discard_extents(
197 struct xfs_mount *mp,
198 struct list_head *list)
199{
200 struct xfs_busy_extent *busyp;
201 int error = 0;
202
203 list_for_each_entry(busyp, list, list) {
204 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
205 busyp->length);
206
207 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
208 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
209 XFS_FSB_TO_BB(mp, busyp->length),
210 GFP_NOFS, 0);
211 if (error && error != EOPNOTSUPP) {
212 xfs_info(mp,
213 "discard failed for extent [0x%llu,%u], error %d",
214 (unsigned long long)busyp->bno,
215 busyp->length,
216 error);
217 return error;
218 }
219 }
220
221 return 0;
222}
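
xfs_discard_extents() above is the online-discard path, driven from log I/O completion; xfs_ioc_trim() at the top of this file is the batched counterpart behind the FITRIM ioctl. A minimal FITRIM caller, runnable against any filesystem that implements the ioctl:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* any path on the target filesystem */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = ~0ULL;		/* trim the whole filesystem */
	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}
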
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
index e82b6dd3e127..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -2,7 +2,9 @@
2#define XFS_DISCARD_H 1 2#define XFS_DISCARD_H 1
3 3
4struct fstrim_range; 4struct fstrim_range;
5struct list_head;
5 6
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); 7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
8extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
7 9
8#endif /* XFS_DISCARD_H */ 10#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b0aa59e51fd0..98b9c91fcdf1 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool;
110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
113#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ 113#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ 114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
115#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
116#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
115 117
116/* 118/*
117 * Table driven mount option parser. 119 * Table driven mount option parser.
@@ -355,6 +357,10 @@ xfs_parseargs(
355 mp->m_flags |= XFS_MOUNT_DELAYLOG; 357 mp->m_flags |= XFS_MOUNT_DELAYLOG;
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 358 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 359 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
360 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
361 mp->m_flags |= XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
363 mp->m_flags &= ~XFS_MOUNT_DISCARD;
358 } else if (!strcmp(this_char, "ihashsize")) { 364 } else if (!strcmp(this_char, "ihashsize")) {
359 xfs_warn(mp, 365 xfs_warn(mp,
360 "ihashsize no longer used, option is deprecated."); 366 "ihashsize no longer used, option is deprecated.");
@@ -388,6 +394,13 @@ xfs_parseargs(
388 return EINVAL; 394 return EINVAL;
389 } 395 }
390 396
397 if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
398 !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
399 xfs_warn(mp,
400 "the discard option is incompatible with the nodelaylog option");
401 return EINVAL;
402 }
403
391#ifndef CONFIG_XFS_QUOTA 404#ifndef CONFIG_XFS_QUOTA
392 if (XFS_IS_QUOTA_RUNNING(mp)) { 405 if (XFS_IS_QUOTA_RUNNING(mp)) {
393 xfs_warn(mp, "quota support not available in this kernel."); 406 xfs_warn(mp, "quota support not available in this kernel.");
@@ -488,6 +501,7 @@ xfs_showargs(
488 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 501 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
489 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 502 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
490 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, 503 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
504 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
491 { 0, NULL } 505 { 0, NULL }
492 }; 506 };
493 static struct proc_xfs_info xfs_info_unset[] = { 507 static struct proc_xfs_info xfs_info_unset[] = {
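
With the parsing above, online discard becomes an ordinary mount option, valid (per the new check in xfs_parseargs()) only together with delayed logging. Enabling it from C rather than mount(8) — device and mount point below are placeholders:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt/scratch", "xfs", 0, "discard") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
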
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index da0a561ffba2..6530769a999b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,6 +187,9 @@ struct xfs_busy_extent {
187 xfs_agnumber_t agno; 187 xfs_agnumber_t agno;
188 xfs_agblock_t bno; 188 xfs_agblock_t bno;
189 xfs_extlen_t length; 189 xfs_extlen_t length;
190 unsigned int flags;
191#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
192#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
190}; 193};
191 194
192/* 195/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index acdced86413c..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2469,7 +2469,7 @@ xfs_free_extent(
2469 2469
2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471 if (!error) 2471 if (!error)
2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len); 2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
2473error0: 2473error0:
2474 xfs_perag_put(args.pag); 2474 xfs_perag_put(args.pag);
2475 return error; 2475 return error;
@@ -2480,7 +2480,8 @@ xfs_alloc_busy_insert(
2480 struct xfs_trans *tp, 2480 struct xfs_trans *tp,
2481 xfs_agnumber_t agno, 2481 xfs_agnumber_t agno,
2482 xfs_agblock_t bno, 2482 xfs_agblock_t bno,
2483 xfs_extlen_t len) 2483 xfs_extlen_t len,
2484 unsigned int flags)
2484{ 2485{
2485 struct xfs_busy_extent *new; 2486 struct xfs_busy_extent *new;
2486 struct xfs_busy_extent *busyp; 2487 struct xfs_busy_extent *busyp;
@@ -2504,6 +2505,7 @@ xfs_alloc_busy_insert(
2504 new->bno = bno; 2505 new->bno = bno;
2505 new->length = len; 2506 new->length = len;
2506 INIT_LIST_HEAD(&new->list); 2507 INIT_LIST_HEAD(&new->list);
2508 new->flags = flags;
2507 2509
2508 /* trace before insert to be able to see failed inserts */ 2510 /* trace before insert to be able to see failed inserts */
2509 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); 2511 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
@@ -2609,6 +2611,18 @@ xfs_alloc_busy_update_extent(
2609 xfs_agblock_t bend = bbno + busyp->length; 2611 xfs_agblock_t bend = bbno + busyp->length;
2610 2612
2611 /* 2613 /*
2614 * This extent is currently being discarded. Give the thread
2615 * performing the discard a chance to mark the extent unbusy
2616 * and retry.
2617 */
2618 if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
2619 spin_unlock(&pag->pagb_lock);
2620 delay(1);
2621 spin_lock(&pag->pagb_lock);
2622 return false;
2623 }
2624
2625 /*
2612 * If there is a busy extent overlapping a user allocation, we have 2626 * If there is a busy extent overlapping a user allocation, we have
2613 * no choice but to force the log and retry the search. 2627 * no choice but to force the log and retry the search.
2614 * 2628 *
@@ -2813,7 +2827,8 @@ restart:
2813 * If this is a metadata allocation, try to reuse the busy 2827 * If this is a metadata allocation, try to reuse the busy
2814 * extent instead of trimming the allocation. 2828 * extent instead of trimming the allocation.
2815 */ 2829 */
2816 if (!args->userdata) { 2830 if (!args->userdata &&
2831 !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
2817 if (!xfs_alloc_busy_update_extent(args->mp, args->pag, 2832 if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
2818 busyp, fbno, flen, 2833 busyp, fbno, flen,
2819 false)) 2834 false))
@@ -2979,10 +2994,16 @@ xfs_alloc_busy_clear_one(
2979 kmem_free(busyp); 2994 kmem_free(busyp);
2980} 2995}
2981 2996
2997/*
 2998 * Remove all extents on the passed-in list from the busy extents tree.
 2999 * If do_discard is set, skip extents that need to be discarded and mark
 3000 * them as undergoing a discard operation instead.
3001 */
2982void 3002void
2983xfs_alloc_busy_clear( 3003xfs_alloc_busy_clear(
2984 struct xfs_mount *mp, 3004 struct xfs_mount *mp,
2985 struct list_head *list) 3005 struct list_head *list,
3006 bool do_discard)
2986{ 3007{
2987 struct xfs_busy_extent *busyp, *n; 3008 struct xfs_busy_extent *busyp, *n;
2988 struct xfs_perag *pag = NULL; 3009 struct xfs_perag *pag = NULL;
@@ -2999,7 +3020,11 @@ xfs_alloc_busy_clear(
2999 agno = busyp->agno; 3020 agno = busyp->agno;
3000 } 3021 }
3001 3022
3002 xfs_alloc_busy_clear_one(mp, pag, busyp); 3023 if (do_discard && busyp->length &&
3024 !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
3025 busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
3026 else
3027 xfs_alloc_busy_clear_one(mp, pag, busyp);
3003 } 3028 }
3004 3029
3005 if (pag) { 3030 if (pag) {
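
Taken together, the changes above implement a handoff: xfs_alloc_busy_clear() called with do_discard set leaves qualifying extents in the tree flagged XFS_ALLOC_BUSY_DISCARDED, xfs_alloc_busy_update_extent() makes the allocator back off such extents, and the discard issuer removes them in a second pass. The caller-side sequence, paraphrased from this series' xfs_log_cil.c change rather than quoted from it:

/* Log I/O completion, paraphrased; ctx->busy_extents is the CIL's busy list. */
xfs_alloc_busy_clear(mp, &ctx->busy_extents,
		     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
if (!list_empty(&ctx->busy_extents)) {
	/* whatever survived the first pass is now marked BUSY_DISCARDED */
	xfs_discard_extents(mp, &ctx->busy_extents);
	/* the second pass actually frees the busy-tree entries */
	xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
}
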
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 240ad288f2f9..2f52b924be79 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -137,10 +137,11 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
137#ifdef __KERNEL__ 137#ifdef __KERNEL__
138void 138void
139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
140 xfs_agblock_t bno, xfs_extlen_t len); 140 xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
141 141
142void 142void
143xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
144 bool do_discard);
144 145
145int 146int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, 147xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 8b469d53599f..2b3518826a69 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -120,7 +120,8 @@ xfs_allocbt_free_block(
120 if (error) 120 if (error)
121 return error; 121 return error;
122 122
123 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 123 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
124 XFS_ALLOC_BUSY_SKIP_DISCARD);
124 xfs_trans_agbtree_delta(cur->bc_tp, -1); 125 xfs_trans_agbtree_delta(cur->bc_tp, -1);
125 return 0; 126 return 0;
126} 127}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index fa00788de2f5..e546a33214c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
89 int *flags); /* inode logging flags */ 89 int *flags); /* inode logging flags */
90 90
91/* 91/*
92 * Called by xfs_bmapi to update file extent records and the btree
93 * after allocating space (or doing a delayed allocation).
94 */
95STATIC int /* error */
96xfs_bmap_add_extent(
97 xfs_inode_t *ip, /* incore inode pointer */
98 xfs_extnum_t idx, /* extent number to update/insert */
99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
101 xfs_fsblock_t *first, /* pointer to firstblock variable */
102 xfs_bmap_free_t *flist, /* list of extents to be freed */
103 int *logflagsp, /* inode logging flags */
104 int whichfork, /* data or attr fork */
105 int rsvd); /* OK to allocate reserved blocks */
106
107/*
108 * Called by xfs_bmap_add_extent to handle cases converting a delayed 92 * Called by xfs_bmap_add_extent to handle cases converting a delayed
109 * allocation to a real allocation. 93 * allocation to a real allocation.
110 */ 94 */
111STATIC int /* error */ 95STATIC int /* error */
112xfs_bmap_add_extent_delay_real( 96xfs_bmap_add_extent_delay_real(
113 xfs_inode_t *ip, /* incore inode pointer */ 97 xfs_inode_t *ip, /* incore inode pointer */
114 xfs_extnum_t idx, /* extent number to update/insert */ 98 xfs_extnum_t *idx, /* extent number to update/insert */
115 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
116 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
117 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 101 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
118 xfs_fsblock_t *first, /* pointer to firstblock variable */ 102 xfs_fsblock_t *first, /* pointer to firstblock variable */
119 xfs_bmap_free_t *flist, /* list of extents to be freed */ 103 xfs_bmap_free_t *flist, /* list of extents to be freed */
120 int *logflagsp, /* inode logging flags */ 104 int *logflagsp); /* inode logging flags */
121 int rsvd); /* OK to allocate reserved blocks */
122 105
123/* 106/*
124 * Called by xfs_bmap_add_extent to handle cases converting a hole 107 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
127STATIC int /* error */ 110STATIC int /* error */
128xfs_bmap_add_extent_hole_delay( 111xfs_bmap_add_extent_hole_delay(
129 xfs_inode_t *ip, /* incore inode pointer */ 112 xfs_inode_t *ip, /* incore inode pointer */
130 xfs_extnum_t idx, /* extent number to update/insert */ 113 xfs_extnum_t *idx, /* extent number to update/insert */
131 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 114 xfs_bmbt_irec_t *new, /* new data to add to file extents */
132 int *logflagsp,/* inode logging flags */ 115 int *logflagsp); /* inode logging flags */
133 int rsvd); /* OK to allocate reserved blocks */
134 116
135/* 117/*
136 * Called by xfs_bmap_add_extent to handle cases converting a hole 118 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
139STATIC int /* error */ 121STATIC int /* error */
140xfs_bmap_add_extent_hole_real( 122xfs_bmap_add_extent_hole_real(
141 xfs_inode_t *ip, /* incore inode pointer */ 123 xfs_inode_t *ip, /* incore inode pointer */
142 xfs_extnum_t idx, /* extent number to update/insert */ 124 xfs_extnum_t *idx, /* extent number to update/insert */
143 xfs_btree_cur_t *cur, /* if null, not a btree */ 125 xfs_btree_cur_t *cur, /* if null, not a btree */
144 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 126 xfs_bmbt_irec_t *new, /* new data to add to file extents */
145 int *logflagsp, /* inode logging flags */ 127 int *logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
152STATIC int /* error */ 134STATIC int /* error */
153xfs_bmap_add_extent_unwritten_real( 135xfs_bmap_add_extent_unwritten_real(
154 xfs_inode_t *ip, /* incore inode pointer */ 136 xfs_inode_t *ip, /* incore inode pointer */
155 xfs_extnum_t idx, /* extent number to update/insert */ 137 xfs_extnum_t *idx, /* extent number to update/insert */
156 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 138 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
157 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 139 xfs_bmbt_irec_t *new, /* new data to add to file extents */
158 int *logflagsp); /* inode logging flags */ 140 int *logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
180 int whichfork); /* data or attr fork */ 162 int whichfork); /* data or attr fork */
181 163
182/* 164/*
183 * Called by xfs_bmapi to update file extent records and the btree
184 * after removing space (or undoing a delayed allocation).
185 */
186STATIC int /* error */
187xfs_bmap_del_extent(
188 xfs_inode_t *ip, /* incore inode pointer */
189 xfs_trans_t *tp, /* current trans pointer */
190 xfs_extnum_t idx, /* extent number to update/insert */
191 xfs_bmap_free_t *flist, /* list of extents to be freed */
192 xfs_btree_cur_t *cur, /* if null, not a btree */
193 xfs_bmbt_irec_t *new, /* new data to add to file extents */
194 int *logflagsp,/* inode logging flags */
195 int whichfork, /* data or attr fork */
196 int rsvd); /* OK to allocate reserved blocks */
197
198/*
199 * Remove the entry "free" from the free item list. Prev points to the 165 * Remove the entry "free" from the free item list. Prev points to the
200 * previous entry, unless "free" is the head of the list. 166 * previous entry, unless "free" is the head of the list.
201 */ 167 */
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
474STATIC int /* error */ 440STATIC int /* error */
475xfs_bmap_add_extent( 441xfs_bmap_add_extent(
476 xfs_inode_t *ip, /* incore inode pointer */ 442 xfs_inode_t *ip, /* incore inode pointer */
477 xfs_extnum_t idx, /* extent number to update/insert */ 443 xfs_extnum_t *idx, /* extent number to update/insert */
478 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 444 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
479 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 445 xfs_bmbt_irec_t *new, /* new data to add to file extents */
480 xfs_fsblock_t *first, /* pointer to firstblock variable */ 446 xfs_fsblock_t *first, /* pointer to firstblock variable */
481 xfs_bmap_free_t *flist, /* list of extents to be freed */ 447 xfs_bmap_free_t *flist, /* list of extents to be freed */
482 int *logflagsp, /* inode logging flags */ 448 int *logflagsp, /* inode logging flags */
483 int whichfork, /* data or attr fork */ 449 int whichfork) /* data or attr fork */
484 int rsvd) /* OK to use reserved data blocks */
485{ 450{
486 xfs_btree_cur_t *cur; /* btree cursor or null */ 451 xfs_btree_cur_t *cur; /* btree cursor or null */
487 xfs_filblks_t da_new; /* new count del alloc blocks used */ 452 xfs_filblks_t da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
492 xfs_extnum_t nextents; /* number of extents in file now */ 457 xfs_extnum_t nextents; /* number of extents in file now */
493 458
494 XFS_STATS_INC(xs_add_exlist); 459 XFS_STATS_INC(xs_add_exlist);
460
495 cur = *curp; 461 cur = *curp;
496 ifp = XFS_IFORK_PTR(ip, whichfork); 462 ifp = XFS_IFORK_PTR(ip, whichfork);
497 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 463 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
498 ASSERT(idx <= nextents);
499 da_old = da_new = 0; 464 da_old = da_new = 0;
500 error = 0; 465 error = 0;
466
467 ASSERT(*idx >= 0);
468 ASSERT(*idx <= nextents);
469
501 /* 470 /*
502 * This is the first extent added to a new/empty file. 471 * This is the first extent added to a new/empty file.
503 * Special case this one, so other routines get to assume there are 472 * Special case this one, so other routines get to assume there are
504 * already extents in the list. 473 * already extents in the list.
505 */ 474 */
506 if (nextents == 0) { 475 if (nextents == 0) {
507 xfs_iext_insert(ip, 0, 1, new, 476 xfs_iext_insert(ip, *idx, 1, new,
508 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 477 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
509 478
510 ASSERT(cur == NULL); 479 ASSERT(cur == NULL);
511 ifp->if_lastex = 0; 480
512 if (!isnullstartblock(new->br_startblock)) { 481 if (!isnullstartblock(new->br_startblock)) {
513 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 482 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
514 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 483 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
522 if (cur) 491 if (cur)
523 ASSERT((cur->bc_private.b.flags & 492 ASSERT((cur->bc_private.b.flags &
524 XFS_BTCUR_BPRV_WASDEL) == 0); 493 XFS_BTCUR_BPRV_WASDEL) == 0);
525 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, 494 error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
526 &logflags, rsvd))) 495 &logflags);
527 goto done;
528 } 496 }
529 /* 497 /*
530 * Real allocation off the end of the file. 498 * Real allocation off the end of the file.
531 */ 499 */
532 else if (idx == nextents) { 500 else if (*idx == nextents) {
533 if (cur) 501 if (cur)
534 ASSERT((cur->bc_private.b.flags & 502 ASSERT((cur->bc_private.b.flags &
535 XFS_BTCUR_BPRV_WASDEL) == 0); 503 XFS_BTCUR_BPRV_WASDEL) == 0);
536 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, 504 error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
537 &logflags, whichfork))) 505 &logflags, whichfork);
538 goto done;
539 } else { 506 } else {
540 xfs_bmbt_irec_t prev; /* old extent at offset idx */ 507 xfs_bmbt_irec_t prev; /* old extent at offset idx */
541 508
542 /* 509 /*
543 * Get the record referred to by idx. 510 * Get the record referred to by idx.
544 */ 511 */
545 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); 512 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
546 /* 513 /*
547 * If it's a real allocation record, and the new allocation ends 514 * If it's a real allocation record, and the new allocation ends
548 * after the start of the referred to record, then we're filling 515 * after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
557 if (cur) 524 if (cur)
558 ASSERT(cur->bc_private.b.flags & 525 ASSERT(cur->bc_private.b.flags &
559 XFS_BTCUR_BPRV_WASDEL); 526 XFS_BTCUR_BPRV_WASDEL);
560 if ((error = xfs_bmap_add_extent_delay_real(ip, 527 error = xfs_bmap_add_extent_delay_real(ip,
561 idx, &cur, new, &da_new, first, flist, 528 idx, &cur, new, &da_new,
562 &logflags, rsvd))) 529 first, flist, &logflags);
563 goto done;
564 } else if (new->br_state == XFS_EXT_NORM) {
565 ASSERT(new->br_state == XFS_EXT_NORM);
566 if ((error = xfs_bmap_add_extent_unwritten_real(
567 ip, idx, &cur, new, &logflags)))
568 goto done;
569 } else { 530 } else {
570 ASSERT(new->br_state == XFS_EXT_UNWRITTEN); 531 ASSERT(new->br_state == XFS_EXT_NORM ||
571 if ((error = xfs_bmap_add_extent_unwritten_real( 532 new->br_state == XFS_EXT_UNWRITTEN);
572 ip, idx, &cur, new, &logflags))) 533
534 error = xfs_bmap_add_extent_unwritten_real(ip,
535 idx, &cur, new, &logflags);
536 if (error)
573 goto done; 537 goto done;
574 } 538 }
575 ASSERT(*curp == cur || *curp == NULL);
576 } 539 }
577 /* 540 /*
578 * Otherwise we're filling in a hole with an allocation. 541 * Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
581 if (cur) 544 if (cur)
582 ASSERT((cur->bc_private.b.flags & 545 ASSERT((cur->bc_private.b.flags &
583 XFS_BTCUR_BPRV_WASDEL) == 0); 546 XFS_BTCUR_BPRV_WASDEL) == 0);
584 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, 547 error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
585 new, &logflags, whichfork))) 548 new, &logflags, whichfork);
586 goto done;
587 } 549 }
588 } 550 }
589 551
552 if (error)
553 goto done;
590 ASSERT(*curp == cur || *curp == NULL); 554 ASSERT(*curp == cur || *curp == NULL);
555
591 /* 556 /*
592 * Convert to a btree if necessary. 557 * Convert to a btree if necessary.
593 */ 558 */
@@ -615,7 +580,7 @@ xfs_bmap_add_extent(
615 ASSERT(nblks <= da_old); 580 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 581 if (nblks < da_old)
617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 582 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 583 (int64_t)(da_old - nblks), 0);
619 } 584 }
620 /* 585 /*
621 * Clear out the allocated field, done with it now in any case. 586 * Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
640STATIC int /* error */ 605STATIC int /* error */
641xfs_bmap_add_extent_delay_real( 606xfs_bmap_add_extent_delay_real(
642 xfs_inode_t *ip, /* incore inode pointer */ 607 xfs_inode_t *ip, /* incore inode pointer */
643 xfs_extnum_t idx, /* extent number to update/insert */ 608 xfs_extnum_t *idx, /* extent number to update/insert */
644 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 609 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
645 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 610 xfs_bmbt_irec_t *new, /* new data to add to file extents */
646 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 611 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
647 xfs_fsblock_t *first, /* pointer to firstblock variable */ 612 xfs_fsblock_t *first, /* pointer to firstblock variable */
648 xfs_bmap_free_t *flist, /* list of extents to be freed */ 613 xfs_bmap_free_t *flist, /* list of extents to be freed */
649 int *logflagsp, /* inode logging flags */ 614 int *logflagsp) /* inode logging flags */
650 int rsvd) /* OK to use reserved data block allocation */
651{ 615{
652 xfs_btree_cur_t *cur; /* btree cursor */ 616 xfs_btree_cur_t *cur; /* btree cursor */
653 int diff; /* temp value */ 617 int diff; /* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
673 */ 637 */
674 cur = *curp; 638 cur = *curp;
675 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 639 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
676 ep = xfs_iext_get_ext(ifp, idx); 640 ep = xfs_iext_get_ext(ifp, *idx);
677 xfs_bmbt_get_all(ep, &PREV); 641 xfs_bmbt_get_all(ep, &PREV);
678 new_endoff = new->br_startoff + new->br_blockcount; 642 new_endoff = new->br_startoff + new->br_blockcount;
679 ASSERT(PREV.br_startoff <= new->br_startoff); 643 ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
692 * Check and set flags if this segment has a left neighbor. 656 * Check and set flags if this segment has a left neighbor.
693 * Don't set contiguous if the combined extent would be too large. 657 * Don't set contiguous if the combined extent would be too large.
694 */ 658 */
695 if (idx > 0) { 659 if (*idx > 0) {
696 state |= BMAP_LEFT_VALID; 660 state |= BMAP_LEFT_VALID;
697 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
698 662
699 if (isnullstartblock(LEFT.br_startblock)) 663 if (isnullstartblock(LEFT.br_startblock))
700 state |= BMAP_LEFT_DELAY; 664 state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
712 * Don't set contiguous if the combined extent would be too large. 676 * Don't set contiguous if the combined extent would be too large.
713 * Also check for all-three-contiguous being too large. 677 * Also check for all-three-contiguous being too large.
714 */ 678 */
715 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 679 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
716 state |= BMAP_RIGHT_VALID; 680 state |= BMAP_RIGHT_VALID;
717 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 681 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
718 682
719 if (isnullstartblock(RIGHT.br_startblock)) 683 if (isnullstartblock(RIGHT.br_startblock))
720 state |= BMAP_RIGHT_DELAY; 684 state |= BMAP_RIGHT_DELAY;
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
745 * Filling in all of a previously delayed allocation extent. 709 * Filling in all of a previously delayed allocation extent.
746 * The left and right neighbors are both contiguous with new. 710 * The left and right neighbors are both contiguous with new.
747 */ 711 */
748 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 712 --*idx;
749 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 713 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
714 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
750 LEFT.br_blockcount + PREV.br_blockcount + 715 LEFT.br_blockcount + PREV.br_blockcount +
751 RIGHT.br_blockcount); 716 RIGHT.br_blockcount);
752 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 717 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
753 718
754 xfs_iext_remove(ip, idx, 2, state); 719 xfs_iext_remove(ip, *idx + 1, 2, state);
755 ip->i_df.if_lastex = idx - 1;
756 ip->i_d.di_nextents--; 720 ip->i_d.di_nextents--;
757 if (cur == NULL) 721 if (cur == NULL)
758 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 722 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
784 * Filling in all of a previously delayed allocation extent. 748 * Filling in all of a previously delayed allocation extent.
785 * The left neighbor is contiguous, the right is not. 749 * The left neighbor is contiguous, the right is not.
786 */ 750 */
787 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 751 --*idx;
788 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 752
753 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
754 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
789 LEFT.br_blockcount + PREV.br_blockcount); 755 LEFT.br_blockcount + PREV.br_blockcount);
790 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 756 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
791 757
792 ip->i_df.if_lastex = idx - 1; 758 xfs_iext_remove(ip, *idx + 1, 1, state);
793 xfs_iext_remove(ip, idx, 1, state);
794 if (cur == NULL) 759 if (cur == NULL)
795 rval = XFS_ILOG_DEXT; 760 rval = XFS_ILOG_DEXT;
796 else { 761 else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
814 * Filling in all of a previously delayed allocation extent. 779 * Filling in all of a previously delayed allocation extent.
815 * The right neighbor is contiguous, the left is not. 780 * The right neighbor is contiguous, the left is not.
816 */ 781 */
817 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 782 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
818 xfs_bmbt_set_startblock(ep, new->br_startblock); 783 xfs_bmbt_set_startblock(ep, new->br_startblock);
819 xfs_bmbt_set_blockcount(ep, 784 xfs_bmbt_set_blockcount(ep,
820 PREV.br_blockcount + RIGHT.br_blockcount); 785 PREV.br_blockcount + RIGHT.br_blockcount);
821 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 786 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
822 787
823 ip->i_df.if_lastex = idx; 788 xfs_iext_remove(ip, *idx + 1, 1, state);
824 xfs_iext_remove(ip, idx + 1, 1, state);
825 if (cur == NULL) 789 if (cur == NULL)
826 rval = XFS_ILOG_DEXT; 790 rval = XFS_ILOG_DEXT;
827 else { 791 else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
837 RIGHT.br_blockcount, PREV.br_state))) 801 RIGHT.br_blockcount, PREV.br_state)))
838 goto done; 802 goto done;
839 } 803 }
804
840 *dnew = 0; 805 *dnew = 0;
841 break; 806 break;
842 807
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
846 * Neither the left nor right neighbors are contiguous with 811 * Neither the left nor right neighbors are contiguous with
847 * the new one. 812 * the new one.
848 */ 813 */
849 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 814 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
850 xfs_bmbt_set_startblock(ep, new->br_startblock); 815 xfs_bmbt_set_startblock(ep, new->br_startblock);
851 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 816 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
852 817
853 ip->i_df.if_lastex = idx;
854 ip->i_d.di_nextents++; 818 ip->i_d.di_nextents++;
855 if (cur == NULL) 819 if (cur == NULL)
856 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 820 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
866 goto done; 830 goto done;
867 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 831 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
868 } 832 }
833
869 *dnew = 0; 834 *dnew = 0;
870 break; 835 break;
871 836
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
874 * Filling in the first part of a previous delayed allocation. 839 * Filling in the first part of a previous delayed allocation.
875 * The left neighbor is contiguous. 840 * The left neighbor is contiguous.
876 */ 841 */
877 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 842 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
878 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 843 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
879 LEFT.br_blockcount + new->br_blockcount); 844 LEFT.br_blockcount + new->br_blockcount);
880 xfs_bmbt_set_startoff(ep, 845 xfs_bmbt_set_startoff(ep,
881 PREV.br_startoff + new->br_blockcount); 846 PREV.br_startoff + new->br_blockcount);
882 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 847 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
883 848
884 temp = PREV.br_blockcount - new->br_blockcount; 849 temp = PREV.br_blockcount - new->br_blockcount;
885 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 850 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
886 xfs_bmbt_set_blockcount(ep, temp); 851 xfs_bmbt_set_blockcount(ep, temp);
887 ip->i_df.if_lastex = idx - 1;
888 if (cur == NULL) 852 if (cur == NULL)
889 rval = XFS_ILOG_DEXT; 853 rval = XFS_ILOG_DEXT;
890 else { 854 else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
904 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 868 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
905 startblockval(PREV.br_startblock)); 869 startblockval(PREV.br_startblock));
906 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 870 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
907 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 871 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
872
873 --*idx;
908 *dnew = temp; 874 *dnew = temp;
909 break; 875 break;
910 876
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
913 * Filling in the first part of a previous delayed allocation. 879 * Filling in the first part of a previous delayed allocation.
914 * The left neighbor is not contiguous. 880 * The left neighbor is not contiguous.
915 */ 881 */
916 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 882 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
917 xfs_bmbt_set_startoff(ep, new_endoff); 883 xfs_bmbt_set_startoff(ep, new_endoff);
918 temp = PREV.br_blockcount - new->br_blockcount; 884 temp = PREV.br_blockcount - new->br_blockcount;
919 xfs_bmbt_set_blockcount(ep, temp); 885 xfs_bmbt_set_blockcount(ep, temp);
920 xfs_iext_insert(ip, idx, 1, new, state); 886 xfs_iext_insert(ip, *idx, 1, new, state);
921 ip->i_df.if_lastex = idx;
922 ip->i_d.di_nextents++; 887 ip->i_d.di_nextents++;
923 if (cur == NULL) 888 if (cur == NULL)
924 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 889 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
946 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 911 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
947 startblockval(PREV.br_startblock) - 912 startblockval(PREV.br_startblock) -
948 (cur ? cur->bc_private.b.allocated : 0)); 913 (cur ? cur->bc_private.b.allocated : 0));
949 ep = xfs_iext_get_ext(ifp, idx + 1); 914 ep = xfs_iext_get_ext(ifp, *idx + 1);
950 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 915 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
951 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 916 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
917
952 *dnew = temp; 918 *dnew = temp;
953 break; 919 break;
954 920
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
958 * The right neighbor is contiguous with the new allocation. 924 * The right neighbor is contiguous with the new allocation.
959 */ 925 */
960 temp = PREV.br_blockcount - new->br_blockcount; 926 temp = PREV.br_blockcount - new->br_blockcount;
961 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 927 trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
962 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
963 xfs_bmbt_set_blockcount(ep, temp); 928 xfs_bmbt_set_blockcount(ep, temp);
964 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 929 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
965 new->br_startoff, new->br_startblock, 930 new->br_startoff, new->br_startblock,
966 new->br_blockcount + RIGHT.br_blockcount, 931 new->br_blockcount + RIGHT.br_blockcount,
967 RIGHT.br_state); 932 RIGHT.br_state);
968 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 933 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
969 ip->i_df.if_lastex = idx + 1;
970 if (cur == NULL) 934 if (cur == NULL)
971 rval = XFS_ILOG_DEXT; 935 rval = XFS_ILOG_DEXT;
972 else { 936 else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
983 RIGHT.br_state))) 947 RIGHT.br_state)))
984 goto done; 948 goto done;
985 } 949 }
950
986 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 951 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
987 startblockval(PREV.br_startblock)); 952 startblockval(PREV.br_startblock));
953 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
988 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 954 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
989 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 955 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
956
957 ++*idx;
990 *dnew = temp; 958 *dnew = temp;
991 break; 959 break;
992 960
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
996 * The right neighbor is not contiguous. 964 * The right neighbor is not contiguous.
997 */ 965 */
998 temp = PREV.br_blockcount - new->br_blockcount; 966 temp = PREV.br_blockcount - new->br_blockcount;
999 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 967 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1000 xfs_bmbt_set_blockcount(ep, temp); 968 xfs_bmbt_set_blockcount(ep, temp);
1001 xfs_iext_insert(ip, idx + 1, 1, new, state); 969 xfs_iext_insert(ip, *idx + 1, 1, new, state);
1002 ip->i_df.if_lastex = idx + 1;
1003 ip->i_d.di_nextents++; 970 ip->i_d.di_nextents++;
1004 if (cur == NULL) 971 if (cur == NULL)
1005 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 972 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
1027 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 994 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1028 startblockval(PREV.br_startblock) - 995 startblockval(PREV.br_startblock) -
1029 (cur ? cur->bc_private.b.allocated : 0)); 996 (cur ? cur->bc_private.b.allocated : 0));
1030 ep = xfs_iext_get_ext(ifp, idx); 997 ep = xfs_iext_get_ext(ifp, *idx);
1031 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 998 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1032 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 999 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1000
1001 ++*idx;
1033 *dnew = temp; 1002 *dnew = temp;
1034 break; 1003 break;
1035 1004
@@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real(
1056 */ 1025 */
1057 temp = new->br_startoff - PREV.br_startoff; 1026 temp = new->br_startoff - PREV.br_startoff;
1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1027 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); 1028 trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ 1029 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new; 1030 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state; 1031 RIGHT.br_state = PREV.br_state;
@@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real(
1065 RIGHT.br_startoff = new_endoff; 1034 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2; 1035 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ 1036 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); 1037 xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
1069 ip->i_df.if_lastex = idx + 1;
1070 ip->i_d.di_nextents++; 1038 ip->i_d.di_nextents++;
1071 if (cur == NULL) 1039 if (cur == NULL)
1072 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1040 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real(
1097 (cur ? cur->bc_private.b.allocated : 0)); 1065 (cur ? cur->bc_private.b.allocated : 0));
1098 if (diff > 0 && 1066 if (diff > 0 &&
1099 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 1067 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1100 -((int64_t)diff), rsvd)) { 1068 -((int64_t)diff), 0)) {
1101 /* 1069 /*
1102 * Ick gross gag me with a spoon. 1070 * Ick gross gag me with a spoon.
1103 */ 1071 */
@@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real(
1109 if (!diff || 1077 if (!diff ||
1110 !xfs_icsb_modify_counters(ip->i_mount, 1078 !xfs_icsb_modify_counters(ip->i_mount,
1111 XFS_SBS_FDBLOCKS, 1079 XFS_SBS_FDBLOCKS,
1112 -((int64_t)diff), rsvd)) 1080 -((int64_t)diff), 0))
1113 break; 1081 break;
1114 } 1082 }
1115 if (temp2) { 1083 if (temp2) {
@@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real(
1118 if (!diff || 1086 if (!diff ||
1119 !xfs_icsb_modify_counters(ip->i_mount, 1087 !xfs_icsb_modify_counters(ip->i_mount,
1120 XFS_SBS_FDBLOCKS, 1088 XFS_SBS_FDBLOCKS,
1121 -((int64_t)diff), rsvd)) 1089 -((int64_t)diff), 0))
1122 break; 1090 break;
1123 } 1091 }
1124 } 1092 }
1125 } 1093 }
1126 ep = xfs_iext_get_ext(ifp, idx); 1094 ep = xfs_iext_get_ext(ifp, *idx);
1127 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 1095 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1128 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1096 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1129 trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); 1097 trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
1130 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), 1098 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
1131 nullstartblock((int)temp2)); 1099 nullstartblock((int)temp2));
1132 trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); 1100 trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
1101
1102 ++*idx;
1133 *dnew = temp + temp2; 1103 *dnew = temp + temp2;
1134 break; 1104 break;
1135 1105
@@ -1161,7 +1131,7 @@ done:
1161STATIC int /* error */ 1131STATIC int /* error */
1162xfs_bmap_add_extent_unwritten_real( 1132xfs_bmap_add_extent_unwritten_real(
1163 xfs_inode_t *ip, /* incore inode pointer */ 1133 xfs_inode_t *ip, /* incore inode pointer */
1164 xfs_extnum_t idx, /* extent number to update/insert */ 1134 xfs_extnum_t *idx, /* extent number to update/insert */
1165 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 1135 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1166 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1136 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1167 int *logflagsp) /* inode logging flags */ 1137 int *logflagsp) /* inode logging flags */
@@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
1188 error = 0; 1158 error = 0;
1189 cur = *curp; 1159 cur = *curp;
1190 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1160 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1191 ep = xfs_iext_get_ext(ifp, idx); 1161 ep = xfs_iext_get_ext(ifp, *idx);
1192 xfs_bmbt_get_all(ep, &PREV); 1162 xfs_bmbt_get_all(ep, &PREV);
1193 newext = new->br_state; 1163 newext = new->br_state;
1194 oldext = (newext == XFS_EXT_UNWRITTEN) ? 1164 oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
1211 * Check and set flags if this segment has a left neighbor. 1181 * Check and set flags if this segment has a left neighbor.
1212 * Don't set contiguous if the combined extent would be too large. 1182 * Don't set contiguous if the combined extent would be too large.
1213 */ 1183 */
1214 if (idx > 0) { 1184 if (*idx > 0) {
1215 state |= BMAP_LEFT_VALID; 1185 state |= BMAP_LEFT_VALID;
1216 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 1186 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
1217 1187
1218 if (isnullstartblock(LEFT.br_startblock)) 1188 if (isnullstartblock(LEFT.br_startblock))
1219 state |= BMAP_LEFT_DELAY; 1189 state |= BMAP_LEFT_DELAY;
@@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
1231 * Don't set contiguous if the combined extent would be too large. 1201 * Don't set contiguous if the combined extent would be too large.
1232 * Also check for all-three-contiguous being too large. 1202 * Also check for all-three-contiguous being too large.
1233 */ 1203 */
1234 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 1204 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
1235 state |= BMAP_RIGHT_VALID; 1205 state |= BMAP_RIGHT_VALID;
1236 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 1206 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
1237 if (isnullstartblock(RIGHT.br_startblock)) 1207 if (isnullstartblock(RIGHT.br_startblock))
1238 state |= BMAP_RIGHT_DELAY; 1208 state |= BMAP_RIGHT_DELAY;
1239 } 1209 }
@@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
1262 * Setting all of a previous oldext extent to newext. 1232 * Setting all of a previous oldext extent to newext.
1263 * The left and right neighbors are both contiguous with new. 1233 * The left and right neighbors are both contiguous with new.
1264 */ 1234 */
1265 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1235 --*idx;
1266 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1236
1237 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1238 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1267 LEFT.br_blockcount + PREV.br_blockcount + 1239 LEFT.br_blockcount + PREV.br_blockcount +
1268 RIGHT.br_blockcount); 1240 RIGHT.br_blockcount);
1269 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1241 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1270 1242
1271 xfs_iext_remove(ip, idx, 2, state); 1243 xfs_iext_remove(ip, *idx + 1, 2, state);
1272 ip->i_df.if_lastex = idx - 1;
1273 ip->i_d.di_nextents -= 2; 1244 ip->i_d.di_nextents -= 2;
1274 if (cur == NULL) 1245 if (cur == NULL)
1275 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1246 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
1305 * Setting all of a previous oldext extent to newext. 1276 * Setting all of a previous oldext extent to newext.
1306 * The left neighbor is contiguous, the right is not. 1277 * The left neighbor is contiguous, the right is not.
1307 */ 1278 */
1308 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1279 --*idx;
1309 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1280
1281 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1282 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1310 LEFT.br_blockcount + PREV.br_blockcount); 1283 LEFT.br_blockcount + PREV.br_blockcount);
1311 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1284 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1312 1285
1313 ip->i_df.if_lastex = idx - 1; 1286 xfs_iext_remove(ip, *idx + 1, 1, state);
1314 xfs_iext_remove(ip, idx, 1, state);
1315 ip->i_d.di_nextents--; 1287 ip->i_d.di_nextents--;
1316 if (cur == NULL) 1288 if (cur == NULL)
1317 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1289 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
1341 * Setting all of a previous oldext extent to newext. 1313 * Setting all of a previous oldext extent to newext.
1342 * The right neighbor is contiguous, the left is not. 1314 * The right neighbor is contiguous, the left is not.
1343 */ 1315 */
1344 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1316 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1345 xfs_bmbt_set_blockcount(ep, 1317 xfs_bmbt_set_blockcount(ep,
1346 PREV.br_blockcount + RIGHT.br_blockcount); 1318 PREV.br_blockcount + RIGHT.br_blockcount);
1347 xfs_bmbt_set_state(ep, newext); 1319 xfs_bmbt_set_state(ep, newext);
1348 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1320 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1349 ip->i_df.if_lastex = idx; 1321 xfs_iext_remove(ip, *idx + 1, 1, state);
1350 xfs_iext_remove(ip, idx + 1, 1, state);
1351 ip->i_d.di_nextents--; 1322 ip->i_d.di_nextents--;
1352 if (cur == NULL) 1323 if (cur == NULL)
1353 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1324 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
1378 * Neither the left nor right neighbors are contiguous with 1349 * Neither the left nor right neighbors are contiguous with
1379 * the new one. 1350 * the new one.
1380 */ 1351 */
1381 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1352 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1382 xfs_bmbt_set_state(ep, newext); 1353 xfs_bmbt_set_state(ep, newext);
1383 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1354 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1384 1355
1385 ip->i_df.if_lastex = idx;
1386 if (cur == NULL) 1356 if (cur == NULL)
1387 rval = XFS_ILOG_DEXT; 1357 rval = XFS_ILOG_DEXT;
1388 else { 1358 else {
@@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
1404 * Setting the first part of a previous oldext extent to newext. 1374 * Setting the first part of a previous oldext extent to newext.
1405 * The left neighbor is contiguous. 1375 * The left neighbor is contiguous.
1406 */ 1376 */
1407 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1377 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
1408 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1378 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
1409 LEFT.br_blockcount + new->br_blockcount); 1379 LEFT.br_blockcount + new->br_blockcount);
1410 xfs_bmbt_set_startoff(ep, 1380 xfs_bmbt_set_startoff(ep,
1411 PREV.br_startoff + new->br_blockcount); 1381 PREV.br_startoff + new->br_blockcount);
1412 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1382 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
1413 1383
1414 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1384 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1415 xfs_bmbt_set_startblock(ep, 1385 xfs_bmbt_set_startblock(ep,
1416 new->br_startblock + new->br_blockcount); 1386 new->br_startblock + new->br_blockcount);
1417 xfs_bmbt_set_blockcount(ep, 1387 xfs_bmbt_set_blockcount(ep,
1418 PREV.br_blockcount - new->br_blockcount); 1388 PREV.br_blockcount - new->br_blockcount);
1419 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1389 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1390
1391 --*idx;
1420 1392
1421 ip->i_df.if_lastex = idx - 1;
1422 if (cur == NULL) 1393 if (cur == NULL)
1423 rval = XFS_ILOG_DEXT; 1394 rval = XFS_ILOG_DEXT;
1424 else { 1395 else {
@@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
1449 * Setting the first part of a previous oldext extent to newext. 1420 * Setting the first part of a previous oldext extent to newext.
1450 * The left neighbor is not contiguous. 1421 * The left neighbor is not contiguous.
1451 */ 1422 */
1452 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1423 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1453 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); 1424 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
1454 xfs_bmbt_set_startoff(ep, new_endoff); 1425 xfs_bmbt_set_startoff(ep, new_endoff);
1455 xfs_bmbt_set_blockcount(ep, 1426 xfs_bmbt_set_blockcount(ep,
1456 PREV.br_blockcount - new->br_blockcount); 1427 PREV.br_blockcount - new->br_blockcount);
1457 xfs_bmbt_set_startblock(ep, 1428 xfs_bmbt_set_startblock(ep,
1458 new->br_startblock + new->br_blockcount); 1429 new->br_startblock + new->br_blockcount);
1459 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1430 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1460 1431
1461 xfs_iext_insert(ip, idx, 1, new, state); 1432 xfs_iext_insert(ip, *idx, 1, new, state);
1462 ip->i_df.if_lastex = idx;
1463 ip->i_d.di_nextents++; 1433 ip->i_d.di_nextents++;
1464 if (cur == NULL) 1434 if (cur == NULL)
1465 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1435 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
1488 * Setting the last part of a previous oldext extent to newext. 1458 * Setting the last part of a previous oldext extent to newext.
1489 * The right neighbor is contiguous with the new allocation. 1459 * The right neighbor is contiguous with the new allocation.
1490 */ 1460 */
1491 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1461 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1492 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
1493 xfs_bmbt_set_blockcount(ep, 1462 xfs_bmbt_set_blockcount(ep,
1494 PREV.br_blockcount - new->br_blockcount); 1463 PREV.br_blockcount - new->br_blockcount);
1495 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1464 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1496 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 1465
1466 ++*idx;
1467
1468 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1469 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1497 new->br_startoff, new->br_startblock, 1470 new->br_startoff, new->br_startblock,
1498 new->br_blockcount + RIGHT.br_blockcount, newext); 1471 new->br_blockcount + RIGHT.br_blockcount, newext);
1499 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 1472 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1500 1473
1501 ip->i_df.if_lastex = idx + 1;
1502 if (cur == NULL) 1474 if (cur == NULL)
1503 rval = XFS_ILOG_DEXT; 1475 rval = XFS_ILOG_DEXT;
1504 else { 1476 else {
@@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
1528 * Setting the last part of a previous oldext extent to newext. 1500 * Setting the last part of a previous oldext extent to newext.
1529 * The right neighbor is not contiguous. 1501 * The right neighbor is not contiguous.
1530 */ 1502 */
1531 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1503 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1532 xfs_bmbt_set_blockcount(ep, 1504 xfs_bmbt_set_blockcount(ep,
1533 PREV.br_blockcount - new->br_blockcount); 1505 PREV.br_blockcount - new->br_blockcount);
1534 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1506 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1507
1508 ++*idx;
1509 xfs_iext_insert(ip, *idx, 1, new, state);
1535 1510
1536 xfs_iext_insert(ip, idx + 1, 1, new, state);
1537 ip->i_df.if_lastex = idx + 1;
1538 ip->i_d.di_nextents++; 1511 ip->i_d.di_nextents++;
1539 if (cur == NULL) 1512 if (cur == NULL)
1540 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1513 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
1568 * newext. Contiguity is impossible here. 1541 * newext. Contiguity is impossible here.
1569 * One extent becomes three extents. 1542 * One extent becomes three extents.
1570 */ 1543 */
1571 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1544 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1572 xfs_bmbt_set_blockcount(ep, 1545 xfs_bmbt_set_blockcount(ep,
1573 new->br_startoff - PREV.br_startoff); 1546 new->br_startoff - PREV.br_startoff);
1574 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1547 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1575 1548
1576 r[0] = *new; 1549 r[0] = *new;
1577 r[1].br_startoff = new_endoff; 1550 r[1].br_startoff = new_endoff;
@@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
1579 PREV.br_startoff + PREV.br_blockcount - new_endoff; 1552 PREV.br_startoff + PREV.br_blockcount - new_endoff;
1580 r[1].br_startblock = new->br_startblock + new->br_blockcount; 1553 r[1].br_startblock = new->br_startblock + new->br_blockcount;
1581 r[1].br_state = oldext; 1554 r[1].br_state = oldext;
1582 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1555
1583 ip->i_df.if_lastex = idx + 1; 1556 ++*idx;
1557 xfs_iext_insert(ip, *idx, 2, &r[0], state);
1558
1584 ip->i_d.di_nextents += 2; 1559 ip->i_d.di_nextents += 2;
1585 if (cur == NULL) 1560 if (cur == NULL)
1586 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1561 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1650,12 +1625,10 @@ done:
1650STATIC int /* error */ 1625STATIC int /* error */
1651xfs_bmap_add_extent_hole_delay( 1626xfs_bmap_add_extent_hole_delay(
1652 xfs_inode_t *ip, /* incore inode pointer */ 1627 xfs_inode_t *ip, /* incore inode pointer */
1653 xfs_extnum_t idx, /* extent number to update/insert */ 1628 xfs_extnum_t *idx, /* extent number to update/insert */
1654 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1629 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1655 int *logflagsp, /* inode logging flags */ 1630 int *logflagsp) /* inode logging flags */
1656 int rsvd) /* OK to allocate reserved blocks */
1657{ 1631{
1658 xfs_bmbt_rec_host_t *ep; /* extent record for idx */
1659 xfs_ifork_t *ifp; /* inode fork pointer */ 1632 xfs_ifork_t *ifp; /* inode fork pointer */
1660 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1633 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1661 xfs_filblks_t newlen=0; /* new indirect size */ 1634 xfs_filblks_t newlen=0; /* new indirect size */
@@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
1665 xfs_filblks_t temp=0; /* temp for indirect calculations */ 1638 xfs_filblks_t temp=0; /* temp for indirect calculations */
1666 1639
1667 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1640 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1668 ep = xfs_iext_get_ext(ifp, idx);
1669 state = 0; 1641 state = 0;
1670 ASSERT(isnullstartblock(new->br_startblock)); 1642 ASSERT(isnullstartblock(new->br_startblock));
1671 1643
1672 /* 1644 /*
1673 * Check and set flags if this segment has a left neighbor 1645 * Check and set flags if this segment has a left neighbor
1674 */ 1646 */
1675 if (idx > 0) { 1647 if (*idx > 0) {
1676 state |= BMAP_LEFT_VALID; 1648 state |= BMAP_LEFT_VALID;
1677 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1649 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1678 1650
1679 if (isnullstartblock(left.br_startblock)) 1651 if (isnullstartblock(left.br_startblock))
1680 state |= BMAP_LEFT_DELAY; 1652 state |= BMAP_LEFT_DELAY;
@@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
1684 * Check and set flags if the current (right) segment exists. 1656 * Check and set flags if the current (right) segment exists.
1685 * If it doesn't exist, we're converting the hole at end-of-file. 1657 * If it doesn't exist, we're converting the hole at end-of-file.
1686 */ 1658 */
1687 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1659 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1688 state |= BMAP_RIGHT_VALID; 1660 state |= BMAP_RIGHT_VALID;
1689 xfs_bmbt_get_all(ep, &right); 1661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1690 1662
1691 if (isnullstartblock(right.br_startblock)) 1663 if (isnullstartblock(right.br_startblock))
1692 state |= BMAP_RIGHT_DELAY; 1664 state |= BMAP_RIGHT_DELAY;
@@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
1719 * on the left and on the right. 1691 * on the left and on the right.
1720 * Merge all three into a single extent record. 1692 * Merge all three into a single extent record.
1721 */ 1693 */
1694 --*idx;
1722 temp = left.br_blockcount + new->br_blockcount + 1695 temp = left.br_blockcount + new->br_blockcount +
1723 right.br_blockcount; 1696 right.br_blockcount;
1724 1697
1725 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1698 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1726 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1699 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1727 oldlen = startblockval(left.br_startblock) + 1700 oldlen = startblockval(left.br_startblock) +
1728 startblockval(new->br_startblock) + 1701 startblockval(new->br_startblock) +
1729 startblockval(right.br_startblock); 1702 startblockval(right.br_startblock);
1730 newlen = xfs_bmap_worst_indlen(ip, temp); 1703 newlen = xfs_bmap_worst_indlen(ip, temp);
1731 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1704 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1732 nullstartblock((int)newlen)); 1705 nullstartblock((int)newlen));
1733 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1706 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1734 1707
1735 xfs_iext_remove(ip, idx, 1, state); 1708 xfs_iext_remove(ip, *idx + 1, 1, state);
1736 ip->i_df.if_lastex = idx - 1;
1737 break; 1709 break;
1738 1710
1739 case BMAP_LEFT_CONTIG: 1711 case BMAP_LEFT_CONTIG:
@@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
1742 * on the left. 1714 * on the left.
1743 * Merge the new allocation with the left neighbor. 1715 * Merge the new allocation with the left neighbor.
1744 */ 1716 */
1717 --*idx;
1745 temp = left.br_blockcount + new->br_blockcount; 1718 temp = left.br_blockcount + new->br_blockcount;
1746 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1719
1747 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1720 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1721 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1748 oldlen = startblockval(left.br_startblock) + 1722 oldlen = startblockval(left.br_startblock) +
1749 startblockval(new->br_startblock); 1723 startblockval(new->br_startblock);
1750 newlen = xfs_bmap_worst_indlen(ip, temp); 1724 newlen = xfs_bmap_worst_indlen(ip, temp);
1751 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1725 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1752 nullstartblock((int)newlen)); 1726 nullstartblock((int)newlen));
1753 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1727 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1754
1755 ip->i_df.if_lastex = idx - 1;
1756 break; 1728 break;
1757 1729
1758 case BMAP_RIGHT_CONTIG: 1730 case BMAP_RIGHT_CONTIG:
@@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
1761 * on the right. 1733 * on the right.
1762 * Merge the new allocation with the right neighbor. 1734 * Merge the new allocation with the right neighbor.
1763 */ 1735 */
1764 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1736 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1765 temp = new->br_blockcount + right.br_blockcount; 1737 temp = new->br_blockcount + right.br_blockcount;
1766 oldlen = startblockval(new->br_startblock) + 1738 oldlen = startblockval(new->br_startblock) +
1767 startblockval(right.br_startblock); 1739 startblockval(right.br_startblock);
1768 newlen = xfs_bmap_worst_indlen(ip, temp); 1740 newlen = xfs_bmap_worst_indlen(ip, temp);
1769 xfs_bmbt_set_allf(ep, new->br_startoff, 1741 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1742 new->br_startoff,
1770 nullstartblock((int)newlen), temp, right.br_state); 1743 nullstartblock((int)newlen), temp, right.br_state);
1771 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1744 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1772
1773 ip->i_df.if_lastex = idx;
1774 break; 1745 break;
1775 1746
1776 case 0: 1747 case 0:
@@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
1780 * Insert a new entry. 1751 * Insert a new entry.
1781 */ 1752 */
1782 oldlen = newlen = 0; 1753 oldlen = newlen = 0;
1783 xfs_iext_insert(ip, idx, 1, new, state); 1754 xfs_iext_insert(ip, *idx, 1, new, state);
1784 ip->i_df.if_lastex = idx;
1785 break; 1755 break;
1786 } 1756 }
1787 if (oldlen != newlen) { 1757 if (oldlen != newlen) {
1788 ASSERT(oldlen > newlen); 1758 ASSERT(oldlen > newlen);
1789 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 1759 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1790 (int64_t)(oldlen - newlen), rsvd); 1760 (int64_t)(oldlen - newlen), 0);
1791 /* 1761 /*
1792 * Nothing to do for disk quota accounting here. 1762 * Nothing to do for disk quota accounting here.
1793 */ 1763 */
@@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
1803STATIC int /* error */ 1773STATIC int /* error */
1804xfs_bmap_add_extent_hole_real( 1774xfs_bmap_add_extent_hole_real(
1805 xfs_inode_t *ip, /* incore inode pointer */ 1775 xfs_inode_t *ip, /* incore inode pointer */
1806 xfs_extnum_t idx, /* extent number to update/insert */ 1776 xfs_extnum_t *idx, /* extent number to update/insert */
1807 xfs_btree_cur_t *cur, /* if null, not a btree */ 1777 xfs_btree_cur_t *cur, /* if null, not a btree */
1808 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1778 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1809 int *logflagsp, /* inode logging flags */ 1779 int *logflagsp, /* inode logging flags */
1810 int whichfork) /* data or attr fork */ 1780 int whichfork) /* data or attr fork */
1811{ 1781{
1812 xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
1813 int error; /* error return value */ 1782 int error; /* error return value */
1814 int i; /* temp state */ 1783 int i; /* temp state */
1815 xfs_ifork_t *ifp; /* inode fork pointer */ 1784 xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
1819 int state; /* state bits, accessed thru macros */ 1788 int state; /* state bits, accessed thru macros */
1820 1789
1821 ifp = XFS_IFORK_PTR(ip, whichfork); 1790 ifp = XFS_IFORK_PTR(ip, whichfork);
1822 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1791 ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
1823 ep = xfs_iext_get_ext(ifp, idx);
1824 state = 0; 1792 state = 0;
1825 1793
1826 if (whichfork == XFS_ATTR_FORK) 1794 if (whichfork == XFS_ATTR_FORK)
@@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
1829 /* 1797 /*
1830 * Check and set flags if this segment has a left neighbor. 1798 * Check and set flags if this segment has a left neighbor.
1831 */ 1799 */
1832 if (idx > 0) { 1800 if (*idx > 0) {
1833 state |= BMAP_LEFT_VALID; 1801 state |= BMAP_LEFT_VALID;
1834 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1802 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1835 if (isnullstartblock(left.br_startblock)) 1803 if (isnullstartblock(left.br_startblock))
1836 state |= BMAP_LEFT_DELAY; 1804 state |= BMAP_LEFT_DELAY;
1837 } 1805 }
@@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
1840 * Check and set flags if this segment has a current value. 1808 * Check and set flags if this segment has a current value.
1841 * Not true if we're inserting into the "hole" at eof. 1809 * Not true if we're inserting into the "hole" at eof.
1842 */ 1810 */
1843 if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1811 if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1844 state |= BMAP_RIGHT_VALID; 1812 state |= BMAP_RIGHT_VALID;
1845 xfs_bmbt_get_all(ep, &right); 1813 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1846 if (isnullstartblock(right.br_startblock)) 1814 if (isnullstartblock(right.br_startblock))
1847 state |= BMAP_RIGHT_DELAY; 1815 state |= BMAP_RIGHT_DELAY;
1848 } 1816 }
@@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
1879 * left and on the right. 1847 * left and on the right.
1880 * Merge all three into a single extent record. 1848 * Merge all three into a single extent record.
1881 */ 1849 */
1882 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1850 --*idx;
1883 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1851 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1852 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1884 left.br_blockcount + new->br_blockcount + 1853 left.br_blockcount + new->br_blockcount +
1885 right.br_blockcount); 1854 right.br_blockcount);
1886 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1855 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1856
1857 xfs_iext_remove(ip, *idx + 1, 1, state);
1887 1858
1888 xfs_iext_remove(ip, idx, 1, state);
1889 ifp->if_lastex = idx - 1;
1890 XFS_IFORK_NEXT_SET(ip, whichfork, 1859 XFS_IFORK_NEXT_SET(ip, whichfork,
1891 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 1860 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
1892 if (cur == NULL) { 1861 if (cur == NULL) {
@@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
1921 * on the left. 1890 * on the left.
1922 * Merge the new allocation with the left neighbor. 1891 * Merge the new allocation with the left neighbor.
1923 */ 1892 */
1924 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1893 --*idx;
1925 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1894 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1895 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1926 left.br_blockcount + new->br_blockcount); 1896 left.br_blockcount + new->br_blockcount);
1927 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1897 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1928 1898
1929 ifp->if_lastex = idx - 1;
1930 if (cur == NULL) { 1899 if (cur == NULL) {
1931 rval = xfs_ilog_fext(whichfork); 1900 rval = xfs_ilog_fext(whichfork);
1932 } else { 1901 } else {
@@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
1952 * on the right. 1921 * on the right.
1953 * Merge the new allocation with the right neighbor. 1922 * Merge the new allocation with the right neighbor.
1954 */ 1923 */
1955 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1924 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1956 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, 1925 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1926 new->br_startoff, new->br_startblock,
1957 new->br_blockcount + right.br_blockcount, 1927 new->br_blockcount + right.br_blockcount,
1958 right.br_state); 1928 right.br_state);
1959 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1929 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1960 1930
1961 ifp->if_lastex = idx;
1962 if (cur == NULL) { 1931 if (cur == NULL) {
1963 rval = xfs_ilog_fext(whichfork); 1932 rval = xfs_ilog_fext(whichfork);
1964 } else { 1933 } else {
@@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
1984 * real allocation. 1953 * real allocation.
1985 * Insert a new entry. 1954 * Insert a new entry.
1986 */ 1955 */
1987 xfs_iext_insert(ip, idx, 1, new, state); 1956 xfs_iext_insert(ip, *idx, 1, new, state);
1988 ifp->if_lastex = idx;
1989 XFS_IFORK_NEXT_SET(ip, whichfork, 1957 XFS_IFORK_NEXT_SET(ip, whichfork,
1990 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 1958 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
1991 if (cur == NULL) { 1959 if (cur == NULL) {
@@ -2833,13 +2801,12 @@ STATIC int /* error */
2833xfs_bmap_del_extent( 2801xfs_bmap_del_extent(
2834 xfs_inode_t *ip, /* incore inode pointer */ 2802 xfs_inode_t *ip, /* incore inode pointer */
2835 xfs_trans_t *tp, /* current transaction pointer */ 2803 xfs_trans_t *tp, /* current transaction pointer */
2836 xfs_extnum_t idx, /* extent number to update/delete */ 2804 xfs_extnum_t *idx, /* extent number to update/delete */
2837 xfs_bmap_free_t *flist, /* list of extents to be freed */ 2805 xfs_bmap_free_t *flist, /* list of extents to be freed */
2838 xfs_btree_cur_t *cur, /* if null, not a btree */ 2806 xfs_btree_cur_t *cur, /* if null, not a btree */
2839 xfs_bmbt_irec_t *del, /* data to remove from extents */ 2807 xfs_bmbt_irec_t *del, /* data to remove from extents */
2840 int *logflagsp, /* inode logging flags */ 2808 int *logflagsp, /* inode logging flags */
2841 int whichfork, /* data or attr fork */ 2809 int whichfork) /* data or attr fork */
2842 int rsvd) /* OK to allocate reserved blocks */
2843{ 2810{
2844 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ 2811 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
2845 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ 2812 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -2870,10 +2837,10 @@ xfs_bmap_del_extent(
2870 2837
2871 mp = ip->i_mount; 2838 mp = ip->i_mount;
2872 ifp = XFS_IFORK_PTR(ip, whichfork); 2839 ifp = XFS_IFORK_PTR(ip, whichfork);
2873 ASSERT((idx >= 0) && (idx < ifp->if_bytes / 2840 ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
2874 (uint)sizeof(xfs_bmbt_rec_t))); 2841 (uint)sizeof(xfs_bmbt_rec_t)));
2875 ASSERT(del->br_blockcount > 0); 2842 ASSERT(del->br_blockcount > 0);
2876 ep = xfs_iext_get_ext(ifp, idx); 2843 ep = xfs_iext_get_ext(ifp, *idx);
2877 xfs_bmbt_get_all(ep, &got); 2844 xfs_bmbt_get_all(ep, &got);
2878 ASSERT(got.br_startoff <= del->br_startoff); 2845 ASSERT(got.br_startoff <= del->br_startoff);
2879 del_endoff = del->br_startoff + del->br_blockcount; 2846 del_endoff = del->br_startoff + del->br_blockcount;
@@ -2947,11 +2914,12 @@ xfs_bmap_del_extent(
2947 /* 2914 /*
2948 * Matches the whole extent. Delete the entry. 2915 * Matches the whole extent. Delete the entry.
2949 */ 2916 */
2950 xfs_iext_remove(ip, idx, 1, 2917 xfs_iext_remove(ip, *idx, 1,
2951 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 2918 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
2952 ifp->if_lastex = idx; 2919 --*idx;
2953 if (delay) 2920 if (delay)
2954 break; 2921 break;
2922
2955 XFS_IFORK_NEXT_SET(ip, whichfork, 2923 XFS_IFORK_NEXT_SET(ip, whichfork,
2956 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 2924 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2957 flags |= XFS_ILOG_CORE; 2925 flags |= XFS_ILOG_CORE;
@@ -2968,21 +2936,20 @@ xfs_bmap_del_extent(
2968 /* 2936 /*
2969 * Deleting the first part of the extent. 2937 * Deleting the first part of the extent.
2970 */ 2938 */
2971 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2939 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2972 xfs_bmbt_set_startoff(ep, del_endoff); 2940 xfs_bmbt_set_startoff(ep, del_endoff);
2973 temp = got.br_blockcount - del->br_blockcount; 2941 temp = got.br_blockcount - del->br_blockcount;
2974 xfs_bmbt_set_blockcount(ep, temp); 2942 xfs_bmbt_set_blockcount(ep, temp);
2975 ifp->if_lastex = idx;
2976 if (delay) { 2943 if (delay) {
2977 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2944 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2978 da_old); 2945 da_old);
2979 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2946 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
2980 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2947 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2981 da_new = temp; 2948 da_new = temp;
2982 break; 2949 break;
2983 } 2950 }
2984 xfs_bmbt_set_startblock(ep, del_endblock); 2951 xfs_bmbt_set_startblock(ep, del_endblock);
2985 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2952 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2986 if (!cur) { 2953 if (!cur) {
2987 flags |= xfs_ilog_fext(whichfork); 2954 flags |= xfs_ilog_fext(whichfork);
2988 break; 2955 break;
@@ -2998,18 +2965,17 @@ xfs_bmap_del_extent(
2998 * Deleting the last part of the extent. 2965 * Deleting the last part of the extent.
2999 */ 2966 */
3000 temp = got.br_blockcount - del->br_blockcount; 2967 temp = got.br_blockcount - del->br_blockcount;
3001 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2968 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3002 xfs_bmbt_set_blockcount(ep, temp); 2969 xfs_bmbt_set_blockcount(ep, temp);
3003 ifp->if_lastex = idx;
3004 if (delay) { 2970 if (delay) {
3005 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2971 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3006 da_old); 2972 da_old);
3007 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2973 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3008 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2974 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3009 da_new = temp; 2975 da_new = temp;
3010 break; 2976 break;
3011 } 2977 }
3012 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2978 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3013 if (!cur) { 2979 if (!cur) {
3014 flags |= xfs_ilog_fext(whichfork); 2980 flags |= xfs_ilog_fext(whichfork);
3015 break; 2981 break;
@@ -3026,7 +2992,7 @@ xfs_bmap_del_extent(
3026 * Deleting the middle of the extent. 2992 * Deleting the middle of the extent.
3027 */ 2993 */
3028 temp = del->br_startoff - got.br_startoff; 2994 temp = del->br_startoff - got.br_startoff;
3029 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2995 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3030 xfs_bmbt_set_blockcount(ep, temp); 2996 xfs_bmbt_set_blockcount(ep, temp);
3031 new.br_startoff = del_endoff; 2997 new.br_startoff = del_endoff;
3032 temp2 = got_endoff - del_endoff; 2998 temp2 = got_endoff - del_endoff;
@@ -3113,9 +3079,9 @@ xfs_bmap_del_extent(
3113 } 3079 }
3114 } 3080 }
3115 } 3081 }
3116 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 3082 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3117 xfs_iext_insert(ip, idx + 1, 1, &new, state); 3083 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
3118 ifp->if_lastex = idx + 1; 3084 ++*idx;
3119 break; 3085 break;
3120 } 3086 }
3121 /* 3087 /*
@@ -3142,7 +3108,7 @@ xfs_bmap_del_extent(
3142 ASSERT(da_old >= da_new); 3108 ASSERT(da_old >= da_new);
3143 if (da_old > da_new) { 3109 if (da_old > da_new) {
3144 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 3110 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3145 (int64_t)(da_old - da_new), rsvd); 3111 (int64_t)(da_old - da_new), 0);
3146 } 3112 }
3147done: 3113done:
3148 *logflagsp = flags; 3114 *logflagsp = flags;
@@ -4562,29 +4528,24 @@ xfs_bmapi(
4562 if (rt) { 4528 if (rt) {
4563 error = xfs_mod_incore_sb(mp, 4529 error = xfs_mod_incore_sb(mp,
4564 XFS_SBS_FREXTENTS, 4530 XFS_SBS_FREXTENTS,
4565 -((int64_t)extsz), (flags & 4531 -((int64_t)extsz), 0);
4566 XFS_BMAPI_RSVBLOCKS));
4567 } else { 4532 } else {
4568 error = xfs_icsb_modify_counters(mp, 4533 error = xfs_icsb_modify_counters(mp,
4569 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4570 -((int64_t)alen), (flags & 4535 -((int64_t)alen), 0);
4571 XFS_BMAPI_RSVBLOCKS));
4572 } 4536 }
4573 if (!error) { 4537 if (!error) {
4574 error = xfs_icsb_modify_counters(mp, 4538 error = xfs_icsb_modify_counters(mp,
4575 XFS_SBS_FDBLOCKS, 4539 XFS_SBS_FDBLOCKS,
4576 -((int64_t)indlen), (flags & 4540 -((int64_t)indlen), 0);
4577 XFS_BMAPI_RSVBLOCKS));
4578 if (error && rt) 4541 if (error && rt)
4579 xfs_mod_incore_sb(mp, 4542 xfs_mod_incore_sb(mp,
4580 XFS_SBS_FREXTENTS, 4543 XFS_SBS_FREXTENTS,
4581 (int64_t)extsz, (flags & 4544 (int64_t)extsz, 0);
4582 XFS_BMAPI_RSVBLOCKS));
4583 else if (error) 4545 else if (error)
4584 xfs_icsb_modify_counters(mp, 4546 xfs_icsb_modify_counters(mp,
4585 XFS_SBS_FDBLOCKS, 4547 XFS_SBS_FDBLOCKS,
4586 (int64_t)alen, (flags & 4548 (int64_t)alen, 0);
4587 XFS_BMAPI_RSVBLOCKS));
4588 } 4549 }
4589 4550
4590 if (error) { 4551 if (error) {
@@ -4701,13 +4662,12 @@ xfs_bmapi(
4701 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) 4662 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4702 got.br_state = XFS_EXT_UNWRITTEN; 4663 got.br_state = XFS_EXT_UNWRITTEN;
4703 } 4664 }
4704 error = xfs_bmap_add_extent(ip, lastx, &cur, &got, 4665 error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
4705 firstblock, flist, &tmp_logflags, 4666 firstblock, flist, &tmp_logflags,
4706 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4667 whichfork);
4707 logflags |= tmp_logflags; 4668 logflags |= tmp_logflags;
4708 if (error) 4669 if (error)
4709 goto error0; 4670 goto error0;
4710 lastx = ifp->if_lastex;
4711 ep = xfs_iext_get_ext(ifp, lastx); 4671 ep = xfs_iext_get_ext(ifp, lastx);
4712 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4672 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4713 xfs_bmbt_get_all(ep, &got); 4673 xfs_bmbt_get_all(ep, &got);
@@ -4803,13 +4763,12 @@ xfs_bmapi(
4803 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) 4763 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4804 ? XFS_EXT_NORM 4764 ? XFS_EXT_NORM
4805 : XFS_EXT_UNWRITTEN; 4765 : XFS_EXT_UNWRITTEN;
4806 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4766 error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
4807 firstblock, flist, &tmp_logflags, 4767 firstblock, flist, &tmp_logflags,
4808 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4768 whichfork);
4809 logflags |= tmp_logflags; 4769 logflags |= tmp_logflags;
4810 if (error) 4770 if (error)
4811 goto error0; 4771 goto error0;
4812 lastx = ifp->if_lastex;
4813 ep = xfs_iext_get_ext(ifp, lastx); 4772 ep = xfs_iext_get_ext(ifp, lastx);
4814 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4773 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4815 xfs_bmbt_get_all(ep, &got); 4774 xfs_bmbt_get_all(ep, &got);
@@ -4868,14 +4827,14 @@ xfs_bmapi(
4868 /* 4827 /*
4869 * Else go on to the next record. 4828 * Else go on to the next record.
4870 */ 4829 */
4871 ep = xfs_iext_get_ext(ifp, ++lastx);
4872 prev = got; 4830 prev = got;
4873 if (lastx >= nextents) 4831 if (++lastx < nextents) {
4874 eof = 1; 4832 ep = xfs_iext_get_ext(ifp, lastx);
4875 else
4876 xfs_bmbt_get_all(ep, &got); 4833 xfs_bmbt_get_all(ep, &got);
4834 } else {
4835 eof = 1;
4836 }
4877 } 4837 }
4878 ifp->if_lastex = lastx;
4879 *nmap = n; 4838 *nmap = n;
4880 /* 4839 /*
4881 * Transform from btree to extents, give it cur. 4840 * Transform from btree to extents, give it cur.
@@ -4984,7 +4943,6 @@ xfs_bmapi_single(
4984 ASSERT(!isnullstartblock(got.br_startblock)); 4943 ASSERT(!isnullstartblock(got.br_startblock));
4985 ASSERT(bno < got.br_startoff + got.br_blockcount); 4944 ASSERT(bno < got.br_startoff + got.br_blockcount);
4986 *fsb = got.br_startblock + (bno - got.br_startoff); 4945 *fsb = got.br_startblock + (bno - got.br_startoff);
4987 ifp->if_lastex = lastx;
4988 return 0; 4946 return 0;
4989} 4947}
4990 4948
@@ -5026,7 +4984,6 @@ xfs_bunmapi(
5026 int tmp_logflags; /* partial logging flags */ 4984 int tmp_logflags; /* partial logging flags */
5027 int wasdel; /* was a delayed alloc extent */ 4985 int wasdel; /* was a delayed alloc extent */
5028 int whichfork; /* data or attribute fork */ 4986 int whichfork; /* data or attribute fork */
5029 int rsvd; /* OK to allocate reserved blocks */
5030 xfs_fsblock_t sum; 4987 xfs_fsblock_t sum;
5031 4988
5032 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); 4989 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5044,7 +5001,7 @@ xfs_bunmapi(
5044 mp = ip->i_mount; 5001 mp = ip->i_mount;
5045 if (XFS_FORCED_SHUTDOWN(mp)) 5002 if (XFS_FORCED_SHUTDOWN(mp))
5046 return XFS_ERROR(EIO); 5003 return XFS_ERROR(EIO);
5047 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; 5004
5048 ASSERT(len > 0); 5005 ASSERT(len > 0);
5049 ASSERT(nexts >= 0); 5006 ASSERT(nexts >= 0);
5050 ASSERT(ifp->if_ext_max == 5007 ASSERT(ifp->if_ext_max ==
@@ -5160,9 +5117,9 @@ xfs_bunmapi(
5160 del.br_blockcount = mod; 5117 del.br_blockcount = mod;
5161 } 5118 }
5162 del.br_state = XFS_EXT_UNWRITTEN; 5119 del.br_state = XFS_EXT_UNWRITTEN;
5163 error = xfs_bmap_add_extent(ip, lastx, &cur, &del, 5120 error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
5164 firstblock, flist, &logflags, 5121 firstblock, flist, &logflags,
5165 XFS_DATA_FORK, 0); 5122 XFS_DATA_FORK);
5166 if (error) 5123 if (error)
5167 goto error0; 5124 goto error0;
5168 goto nodelete; 5125 goto nodelete;
@@ -5188,9 +5145,12 @@ xfs_bunmapi(
5188 */ 5145 */
5189 ASSERT(bno >= del.br_blockcount); 5146 ASSERT(bno >= del.br_blockcount);
5190 bno -= del.br_blockcount; 5147 bno -= del.br_blockcount;
5191 if (bno < got.br_startoff) { 5148 if (got.br_startoff > bno) {
5192 if (--lastx >= 0) 5149 if (--lastx >= 0) {
5193 xfs_bmbt_get_all(--ep, &got); 5150 ep = xfs_iext_get_ext(ifp,
5151 lastx);
5152 xfs_bmbt_get_all(ep, &got);
5153 }
5194 } 5154 }
5195 continue; 5155 continue;
5196 } else if (del.br_state == XFS_EXT_UNWRITTEN) { 5156 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5214,18 +5174,19 @@ xfs_bunmapi(
5214 prev.br_startoff = start; 5174 prev.br_startoff = start;
5215 } 5175 }
5216 prev.br_state = XFS_EXT_UNWRITTEN; 5176 prev.br_state = XFS_EXT_UNWRITTEN;
5217 error = xfs_bmap_add_extent(ip, lastx - 1, &cur, 5177 lastx--;
5178 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5218 &prev, firstblock, flist, &logflags, 5179 &prev, firstblock, flist, &logflags,
5219 XFS_DATA_FORK, 0); 5180 XFS_DATA_FORK);
5220 if (error) 5181 if (error)
5221 goto error0; 5182 goto error0;
5222 goto nodelete; 5183 goto nodelete;
5223 } else { 5184 } else {
5224 ASSERT(del.br_state == XFS_EXT_NORM); 5185 ASSERT(del.br_state == XFS_EXT_NORM);
5225 del.br_state = XFS_EXT_UNWRITTEN; 5186 del.br_state = XFS_EXT_UNWRITTEN;
5226 error = xfs_bmap_add_extent(ip, lastx, &cur, 5187 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5227 &del, firstblock, flist, &logflags, 5188 &del, firstblock, flist, &logflags,
5228 XFS_DATA_FORK, 0); 5189 XFS_DATA_FORK);
5229 if (error) 5190 if (error)
5230 goto error0; 5191 goto error0;
5231 goto nodelete; 5192 goto nodelete;
@@ -5240,13 +5201,13 @@ xfs_bunmapi(
5240 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5201 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5241 do_div(rtexts, mp->m_sb.sb_rextsize); 5202 do_div(rtexts, mp->m_sb.sb_rextsize);
5242 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5203 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
5243 (int64_t)rtexts, rsvd); 5204 (int64_t)rtexts, 0);
5244 (void)xfs_trans_reserve_quota_nblks(NULL, 5205 (void)xfs_trans_reserve_quota_nblks(NULL,
5245 ip, -((long)del.br_blockcount), 0, 5206 ip, -((long)del.br_blockcount), 0,
5246 XFS_QMOPT_RES_RTBLKS); 5207 XFS_QMOPT_RES_RTBLKS);
5247 } else { 5208 } else {
5248 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 5209 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5249 (int64_t)del.br_blockcount, rsvd); 5210 (int64_t)del.br_blockcount, 0);
5250 (void)xfs_trans_reserve_quota_nblks(NULL, 5211 (void)xfs_trans_reserve_quota_nblks(NULL,
5251 ip, -((long)del.br_blockcount), 0, 5212 ip, -((long)del.br_blockcount), 0,
5252 XFS_QMOPT_RES_REGBLKS); 5213 XFS_QMOPT_RES_REGBLKS);
@@ -5277,31 +5238,29 @@ xfs_bunmapi(
5277 error = XFS_ERROR(ENOSPC); 5238 error = XFS_ERROR(ENOSPC);
5278 goto error0; 5239 goto error0;
5279 } 5240 }
5280 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, 5241 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5281 &tmp_logflags, whichfork, rsvd); 5242 &tmp_logflags, whichfork);
5282 logflags |= tmp_logflags; 5243 logflags |= tmp_logflags;
5283 if (error) 5244 if (error)
5284 goto error0; 5245 goto error0;
5285 bno = del.br_startoff - 1; 5246 bno = del.br_startoff - 1;
5286nodelete: 5247nodelete:
5287 lastx = ifp->if_lastex;
5288 /* 5248 /*
5289 * If not done go on to the next (previous) record. 5249 * If not done go on to the next (previous) record.
5290 * Reset ep in case the extents array was re-alloced.
5291 */ 5250 */
5292 ep = xfs_iext_get_ext(ifp, lastx);
5293 if (bno != (xfs_fileoff_t)-1 && bno >= start) { 5251 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5294 if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || 5252 if (lastx >= 0) {
5295 xfs_bmbt_get_startoff(ep) > bno) { 5253 ep = xfs_iext_get_ext(ifp, lastx);
5296 if (--lastx >= 0) 5254 if (xfs_bmbt_get_startoff(ep) > bno) {
5297 ep = xfs_iext_get_ext(ifp, lastx); 5255 if (--lastx >= 0)
5298 } 5256 ep = xfs_iext_get_ext(ifp,
5299 if (lastx >= 0) 5257 lastx);
5258 }
5300 xfs_bmbt_get_all(ep, &got); 5259 xfs_bmbt_get_all(ep, &got);
5260 }
5301 extno++; 5261 extno++;
5302 } 5262 }
5303 } 5263 }
5304 ifp->if_lastex = lastx;
5305 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5264 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5306 ASSERT(ifp->if_ext_max == 5265 ASSERT(ifp->if_ext_max ==
5307 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 5266 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
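
The xfs_bmap.c hunks above all serve one conversion: the extent index moves out of the fork's cached if_lastex field and into an xfs_extnum_t *idx parameter that each helper updates in place, so callers read the cursor back directly instead of reloading ifp->if_lastex afterwards. A condensed sketch of the resulting caller pattern, using only names visible in the xfs_bmapi() hunks above (the enclosing loop and declarations are elided):

	xfs_extnum_t	lastx;	/* extent cursor, owned by the caller */

	error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
			firstblock, flist, &tmp_logflags, whichfork);
	if (error)
		goto error0;
	/* lastx already reflects the callee's final position. */
	ep = xfs_iext_get_ext(ifp, lastx);
	xfs_bmbt_get_all(ep, &got);
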
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 3651191daea1..c62234bde053 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,7 +69,6 @@ typedef struct xfs_bmap_free
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ 72#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 73#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 74 /* combine contig. space */
@@ -87,7 +86,6 @@ typedef struct xfs_bmap_free
87 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 86 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
88 { XFS_BMAPI_METADATA, "METADATA" }, \ 87 { XFS_BMAPI_METADATA, "METADATA" }, \
89 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 88 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
90 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
91 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 89 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
92 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 90 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
93 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 91 { XFS_BMAPI_CONTIG, "CONTIG" }, \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8e3349c287c..a098a20ca63e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -920,7 +920,6 @@ xfs_iread_extents(
920 /* 920 /*
921 * We know that the size is valid (it's checked in iformat_btree) 921 * We know that the size is valid (it's checked in iformat_btree)
922 */ 922 */
923 ifp->if_lastex = NULLEXTNUM;
924 ifp->if_bytes = ifp->if_real_bytes = 0; 923 ifp->if_bytes = ifp->if_real_bytes = 0;
925 ifp->if_flags |= XFS_IFEXTENTS; 924 ifp->if_flags |= XFS_IFEXTENTS;
926 xfs_iext_add(ifp, 0, nextents); 925 xfs_iext_add(ifp, 0, nextents);
@@ -2558,12 +2557,9 @@ xfs_iflush_fork(
2558 case XFS_DINODE_FMT_EXTENTS: 2557 case XFS_DINODE_FMT_EXTENTS:
2559 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2558 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2560 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2559 !(iip->ili_format.ilf_fields & extflag[whichfork]));
2561 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
2562 (ifp->if_bytes == 0));
2563 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
2564 (ifp->if_bytes > 0));
2565 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2560 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2566 (ifp->if_bytes > 0)) { 2561 (ifp->if_bytes > 0)) {
2562 ASSERT(xfs_iext_get_ext(ifp, 0));
2567 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2563 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2568 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2564 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2569 whichfork); 2565 whichfork);
@@ -3112,6 +3108,8 @@ xfs_iext_get_ext(
3112 xfs_extnum_t idx) /* index of target extent */ 3108 xfs_extnum_t idx) /* index of target extent */
3113{ 3109{
3114 ASSERT(idx >= 0); 3110 ASSERT(idx >= 0);
3111 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3112
3115 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3113 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3116 return ifp->if_u1.if_ext_irec->er_extbuf; 3114 return ifp->if_u1.if_ext_irec->er_extbuf;
3117 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3115 } else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3191,7 +3189,6 @@ xfs_iext_add(
3191 } 3189 }
3192 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3190 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3193 ifp->if_real_bytes = 0; 3191 ifp->if_real_bytes = 0;
3194 ifp->if_lastex = nextents + ext_diff;
3195 } 3192 }
3196 /* 3193 /*
3197 * Otherwise use a linear (direct) extent list. 3194 * Otherwise use a linear (direct) extent list.
@@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec(
3886 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3883 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
3887 3884
3888 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3885 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3889 ASSERT(page_idx >= 0 && page_idx <= 3886 ASSERT(page_idx >= 0);
3890 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 3887 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3888 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3889
3891 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3890 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3892 erp_idx = 0; 3891 erp_idx = 0;
3893 low = 0; 3892 low = 0;
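
The assertions added to xfs_iext_get_ext() and xfs_iext_idx_to_irec() encode the bound that an extent index must lie below the record count derived from if_bytes. As a sketch, the new check amounts to idx >= 0 && idx < xfs_iext_count(ifp), where xfs_iext_count() is a hypothetical helper name (this patch open-codes the division):

	/* Hypothetical helper, not part of this patch: the number of
	 * in-core extent records backing a fork's extent list. */
	static inline xfs_extnum_t
	xfs_iext_count(xfs_ifork_t *ifp)
	{
		return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	}
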
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff4e2a30227d..3ae6d58e5473 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 69 unsigned char if_ext_max; /* max # of extent records */
70 xfs_extnum_t if_lastex; /* last if_extents used */
71 union { 70 union {
72 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
73 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7d56e88a3f0e..c7755d5a5fbe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
29#include "xfs_mount.h" 29#include "xfs_mount.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_alloc.h" 31#include "xfs_alloc.h"
32#include "xfs_discard.h"
32 33
33/* 34/*
34 * Perform initial CIL structure initialisation. If the CIL is not 35 * Perform initial CIL structure initialisation. If the CIL is not
@@ -361,18 +362,28 @@ xlog_cil_committed(
361 int abort) 362 int abort)
362{ 363{
363 struct xfs_cil_ctx *ctx = args; 364 struct xfs_cil_ctx *ctx = args;
365 struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
364 366
365 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, 367 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
366 ctx->start_lsn, abort); 368 ctx->start_lsn, abort);
367 369
368 xfs_alloc_busy_sort(&ctx->busy_extents); 370 xfs_alloc_busy_sort(&ctx->busy_extents);
369 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents); 371 xfs_alloc_busy_clear(mp, &ctx->busy_extents,
372 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
370 373
371 spin_lock(&ctx->cil->xc_cil_lock); 374 spin_lock(&ctx->cil->xc_cil_lock);
372 list_del(&ctx->committing); 375 list_del(&ctx->committing);
373 spin_unlock(&ctx->cil->xc_cil_lock); 376 spin_unlock(&ctx->cil->xc_cil_lock);
374 377
375 xlog_cil_free_logvec(ctx->lv_chain); 378 xlog_cil_free_logvec(ctx->lv_chain);
379
380 if (!list_empty(&ctx->busy_extents)) {
381 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
382
383 xfs_discard_extents(mp, &ctx->busy_extents);
384 xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
385 }
386
376 kmem_free(ctx); 387 kmem_free(ctx);
377} 388}
378 389
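
The reworked commit callback tears busy extents down in two passes: on a clean commit with XFS_MOUNT_DISCARD set, xfs_alloc_busy_clear() is asked to keep discard candidates on the list, and whatever remains is then trimmed and cleared for good. A condensed sketch of that control flow, assuming (per this diff) that the new third argument means "retain extents that still need a discard":

	bool do_discard = (mp->m_flags & XFS_MOUNT_DISCARD) && !abort;

	xfs_alloc_busy_clear(mp, &ctx->busy_extents, do_discard);
	if (!list_empty(&ctx->busy_extents)) {
		/* Entries left behind are the discard candidates. */
		xfs_discard_extents(mp, &ctx->busy_extents);
		xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
	}
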
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19af0ab0d0c6..3d68bb267c5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
225 operations, typically for 225 operations, typically for
226 disk errors in metadata */ 226 disk errors in metadata */
227#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
227#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to 228#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
228 user */ 229 user */
229#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d1f24858ccc4..7c7bc2b786bd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -609,7 +609,7 @@ xfs_trans_free(
609 struct xfs_trans *tp) 609 struct xfs_trans *tp)
610{ 610{
611 xfs_alloc_busy_sort(&tp->t_busy); 611 xfs_alloc_busy_sort(&tp->t_busy);
612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy); 612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
613 613
614 atomic_dec(&tp->t_mountp->m_active_trans); 614 atomic_dec(&tp->t_mountp->m_active_trans);
615 xfs_trans_free_dqinfo(tp); 615 xfs_trans_free_dqinfo(tp);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index f5df23561b96..503c8a6b3079 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -217,8 +217,24 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
217 get_block_t *, loff_t *); 217 get_block_t *, loff_t *);
218int generic_cont_expand_simple(struct inode *inode, loff_t size); 218int generic_cont_expand_simple(struct inode *inode, loff_t size);
219int block_commit_write(struct page *page, unsigned from, unsigned to); 219int block_commit_write(struct page *page, unsigned from, unsigned to);
220int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
221 get_block_t get_block);
220int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 222int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
221 get_block_t get_block); 223 get_block_t get_block);
224/* Convert errno to return value from ->page_mkwrite() call */
225static inline int block_page_mkwrite_return(int err)
226{
227 if (err == 0)
228 return VM_FAULT_LOCKED;
229 if (err == -EFAULT)
230 return VM_FAULT_NOPAGE;
231 if (err == -ENOMEM)
232 return VM_FAULT_OOM;
233 if (err == -EAGAIN)
234 return VM_FAULT_RETRY;
235 /* -ENOSPC, -EDQUOT, -EIO ... */
236 return VM_FAULT_SIGBUS;
237}
222sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); 238sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
223int block_truncate_page(struct address_space *, loff_t, get_block_t *); 239int block_truncate_page(struct address_space *, loff_t, get_block_t *);
224int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, 240int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
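
__block_page_mkwrite() reports a plain errno and leaves the page locked on success, while block_page_mkwrite_return() maps that errno onto the VM_FAULT_* codes a fault handler must return. A minimal sketch of a filesystem ->page_mkwrite built from the pair; myfs_page_mkwrite and myfs_get_block are hypothetical names, and a real handler might take locks or start a transaction between the two calls:

	static int
	myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		int err;

		/* Returns 0 with the page locked and dirty, else -errno. */
		err = __block_page_mkwrite(vma, vmf, myfs_get_block);
		return block_page_mkwrite_return(err);
	}
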
diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h
new file mode 100644
index 000000000000..04ffb2e6c9d0
--- /dev/null
+++ b/include/linux/cleancache.h
@@ -0,0 +1,122 @@
1#ifndef _LINUX_CLEANCACHE_H
2#define _LINUX_CLEANCACHE_H
3
4#include <linux/fs.h>
5#include <linux/exportfs.h>
6#include <linux/mm.h>
7
8#define CLEANCACHE_KEY_MAX 6
9
10/*
11 * cleancache requires every file with a page in cleancache to have a
12 * unique key unless/until the file is removed/truncated. For some
13 * filesystems, the inode number is unique, but for "modern" filesystems
14 * an exportable filehandle is required (see exportfs.h)
15 */
16struct cleancache_filekey {
17 union {
18 ino_t ino;
19 __u32 fh[CLEANCACHE_KEY_MAX];
20 u32 key[CLEANCACHE_KEY_MAX];
21 } u;
22};
23
24struct cleancache_ops {
25 int (*init_fs)(size_t);
26 int (*init_shared_fs)(char *uuid, size_t);
27 int (*get_page)(int, struct cleancache_filekey,
28 pgoff_t, struct page *);
29 void (*put_page)(int, struct cleancache_filekey,
30 pgoff_t, struct page *);
31 void (*flush_page)(int, struct cleancache_filekey, pgoff_t);
32 void (*flush_inode)(int, struct cleancache_filekey);
33 void (*flush_fs)(int);
34};
35
36extern struct cleancache_ops
37 cleancache_register_ops(struct cleancache_ops *ops);
38extern void __cleancache_init_fs(struct super_block *);
39extern void __cleancache_init_shared_fs(char *, struct super_block *);
40extern int __cleancache_get_page(struct page *);
41extern void __cleancache_put_page(struct page *);
42extern void __cleancache_flush_page(struct address_space *, struct page *);
43extern void __cleancache_flush_inode(struct address_space *);
44extern void __cleancache_flush_fs(struct super_block *);
45extern int cleancache_enabled;
46
47#ifdef CONFIG_CLEANCACHE
48static inline bool cleancache_fs_enabled(struct page *page)
49{
50 return page->mapping->host->i_sb->cleancache_poolid >= 0;
51}
52static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
53{
54 return mapping->host->i_sb->cleancache_poolid >= 0;
55}
56#else
57#define cleancache_enabled (0)
58#define cleancache_fs_enabled(_page) (0)
59#define cleancache_fs_enabled_mapping(_page) (0)
60#endif
61
62/*
63 * The shim layer provided by these inline functions allows the compiler
64 * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
65 * is disabled, to a single global variable check if CONFIG_CLEANCACHE
66 * is enabled but no cleancache "backend" has dynamically enabled it,
67 * and, for the most frequent cleancache ops, to a single global variable
68 * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
69 * and a cleancache backend has dynamically enabled cleancache, but the
70 * filesystem referenced by that cleancache op has not enabled cleancache.
71 * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
72 * no measurable performance impact.
73 */
74
75static inline void cleancache_init_fs(struct super_block *sb)
76{
77 if (cleancache_enabled)
78 __cleancache_init_fs(sb);
79}
80
81static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
82{
83 if (cleancache_enabled)
84 __cleancache_init_shared_fs(uuid, sb);
85}
86
87static inline int cleancache_get_page(struct page *page)
88{
89 int ret = -1;
90
91 if (cleancache_enabled && cleancache_fs_enabled(page))
92 ret = __cleancache_get_page(page);
93 return ret;
94}
95
96static inline void cleancache_put_page(struct page *page)
97{
98 if (cleancache_enabled && cleancache_fs_enabled(page))
99 __cleancache_put_page(page);
100}
101
102static inline void cleancache_flush_page(struct address_space *mapping,
103 struct page *page)
104{
105 /* careful... page->mapping is NULL sometimes when this is called */
106 if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
107 __cleancache_flush_page(mapping, page);
108}
109
110static inline void cleancache_flush_inode(struct address_space *mapping)
111{
112 if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
113 __cleancache_flush_inode(mapping);
114}
115
116static inline void cleancache_flush_fs(struct super_block *sb)
117{
118 if (cleancache_enabled)
119 __cleancache_flush_fs(sb);
120}
121
122#endif /* _LINUX_CLEANCACHE_H */
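For reference, a hedged sketch of the backend side of this interface: a do-nothing driver that fills in struct cleancache_ops and registers it. Everything prefixed noop_ is an illustrative assumption; the Xen tmem driver added elsewhere in this series is the real in-tree backend.

static int noop_init_fs(size_t pagesize)
{
	return 0;		/* hand every filesystem pool id 0 */
}

static int noop_init_shared_fs(char *uuid, size_t pagesize)
{
	return 0;
}

static int noop_get_page(int pool, struct cleancache_filekey key,
			 pgoff_t index, struct page *page)
{
	return -1;		/* never holds data, so every get misses */
}

static void noop_put_page(int pool, struct cleancache_filekey key,
			  pgoff_t index, struct page *page)
{
}

static void noop_flush_page(int pool, struct cleancache_filekey key,
			    pgoff_t index)
{
}

static void noop_flush_inode(int pool, struct cleancache_filekey key)
{
}

static void noop_flush_fs(int pool)
{
}

static struct cleancache_ops noop_ops = {
	.init_fs	= noop_init_fs,
	.init_shared_fs	= noop_init_shared_fs,
	.get_page	= noop_get_page,
	.put_page	= noop_put_page,
	.flush_page	= noop_flush_page,
	.flush_inode	= noop_flush_inode,
	.flush_fs	= noop_flush_fs,
};

static int __init noop_backend_init(void)
{
	/* returns the previously registered ops, allowing chaining */
	(void)cleancache_register_ops(&noop_ops);
	return 0;
}
module_init(noop_backend_init);

Once a backend registers, cleancache_enabled flips to 1 and the inline shims above start forwarding into the ops table.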
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f9d3251790d..241609346dfb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1428,6 +1428,11 @@ struct super_block {
1428 */ 1428 */
1429 char __rcu *s_options; 1429 char __rcu *s_options;
1430 const struct dentry_operations *s_d_op; /* default d_op for dentries */ 1430 const struct dentry_operations *s_d_op; /* default d_op for dentries */
1431
1432 /*
1433 * Saved pool identifier for cleancache (-1 means none)
1434 */
1435 int cleancache_poolid;
1431}; 1436};
1432 1437
1433extern struct timespec current_fs_time(struct super_block *sb); 1438extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 943c76b3d4bb..59225ef27d15 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1,6 +1,7 @@
1#ifndef _LINUX_HUGETLB_H 1#ifndef _LINUX_HUGETLB_H
2#define _LINUX_HUGETLB_H 2#define _LINUX_HUGETLB_H
3 3
4#include <linux/mm_types.h>
4#include <linux/fs.h> 5#include <linux/fs.h>
5#include <linux/hugetlb_inline.h> 6#include <linux/hugetlb_inline.h>
6 7
@@ -41,7 +42,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
41 unsigned long address, unsigned int flags); 42 unsigned long address, unsigned int flags);
42int hugetlb_reserve_pages(struct inode *inode, long from, long to, 43int hugetlb_reserve_pages(struct inode *inode, long from, long to,
43 struct vm_area_struct *vma, 44 struct vm_area_struct *vma,
44 int acctflags); 45 vm_flags_t vm_flags);
45void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 46void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
46int dequeue_hwpoisoned_huge_page(struct page *page); 47int dequeue_hwpoisoned_huge_page(struct page *page);
47void copy_huge_page(struct page *dst, struct page *src); 48void copy_huge_page(struct page *dst, struct page *src);
@@ -168,7 +169,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
168 169
169extern const struct file_operations hugetlbfs_file_operations; 170extern const struct file_operations hugetlbfs_file_operations;
170extern const struct vm_operations_struct hugetlb_vm_ops; 171extern const struct vm_operations_struct hugetlb_vm_ops;
171struct file *hugetlb_file_setup(const char *name, size_t size, int acct, 172struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
172 struct user_struct **user, int creat_flags); 173 struct user_struct **user, int creat_flags);
173int hugetlb_get_quota(struct address_space *mapping, long delta); 174int hugetlb_get_quota(struct address_space *mapping, long delta);
174void hugetlb_put_quota(struct address_space *mapping, long delta); 175void hugetlb_put_quota(struct address_space *mapping, long delta);
@@ -192,7 +193,7 @@ static inline void set_file_hugepages(struct file *file)
192#define is_file_hugepages(file) 0 193#define is_file_hugepages(file) 0
193#define set_file_hugepages(file) BUG() 194#define set_file_hugepages(file) BUG()
194static inline struct file *hugetlb_file_setup(const char *name, size_t size, 195static inline struct file *hugetlb_file_setup(const char *name, size_t size,
195 int acctflag, struct user_struct **user, int creat_flags) 196 vm_flags_t acctflag, struct user_struct **user, int creat_flags)
196{ 197{
197 return ERR_PTR(-ENOSYS); 198 return ERR_PTR(-ENOSYS);
198} 199}
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 6931489a5c14..2bb681fbeb35 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
7 7
8static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) 8static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
9{ 9{
10 return vma->vm_flags & VM_HUGETLB; 10 return !!(vma->vm_flags & VM_HUGETLB);
11} 11}
12 12
13#else 13#else
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index f4a2e6b1b864..0ee969a5593d 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -136,6 +136,7 @@ enum {
136 IFLA_PORT_SELF, 136 IFLA_PORT_SELF,
137 IFLA_AF_SPEC, 137 IFLA_AF_SPEC,
138 IFLA_GROUP, /* Group the device belongs to */ 138 IFLA_GROUP, /* Group the device belongs to */
139 IFLA_NET_NS_FD,
139 __IFLA_MAX 140 __IFLA_MAX
140}; 141};
141 142
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a32dcaec04e1..4ecb7b16b278 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -529,9 +529,10 @@ struct transaction_s
529 enum { 529 enum {
530 T_RUNNING, 530 T_RUNNING,
531 T_LOCKED, 531 T_LOCKED,
532 T_RUNDOWN,
533 T_FLUSH, 532 T_FLUSH,
534 T_COMMIT, 533 T_COMMIT,
534 T_COMMIT_DFLUSH,
535 T_COMMIT_JFLUSH,
535 T_FINISHED 536 T_FINISHED
536 } t_state; 537 } t_state;
537 538
@@ -658,7 +659,9 @@ struct transaction_s
658 * waiting for it to finish. 659 * waiting for it to finish.
659 */ 660 */
660 unsigned int t_synchronous_commit:1; 661 unsigned int t_synchronous_commit:1;
661 unsigned int t_flushed_data_blocks:1; 662
663 /* Disk flush needs to be sent to fs partition [no locking] */
664 int t_need_data_flush;
662 665
663 /* 666 /*
664 * For use by the filesystem to store fs-specific data 667 * For use by the filesystem to store fs-specific data
@@ -1228,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
1228int jbd2_journal_force_commit_nested(journal_t *journal); 1231int jbd2_journal_force_commit_nested(journal_t *journal);
1229int jbd2_log_wait_commit(journal_t *journal, tid_t tid); 1232int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
1230int jbd2_log_do_checkpoint(journal_t *journal); 1233int jbd2_log_do_checkpoint(journal_t *journal);
1234int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
1231 1235
1232void __jbd2_log_wait_for_space(journal_t *journal); 1236void __jbd2_log_wait_for_space(journal_t *journal);
1233extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); 1237extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
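The new jbd2_trans_will_send_data_barrier() lets a filesystem's fsync path skip its own explicit cache flush when the journal commit it is about to wait on will already flush the data device. A sketch of the intended call pattern for a journalled filesystem's fsync; commit_tid, inode, and the surrounding error handling are assumed context, not part of this header change:

	bool needs_barrier = false;

	/*
	 * If the committing transaction will not flush our data device,
	 * we must issue the flush ourselves after the commit completes.
	 */
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;
	jbd2_log_start_commit(journal, commit_tid);
	ret = jbd2_log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);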
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8eb969ebf904..fb8e814f78dc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -165,12 +165,12 @@ extern pgprot_t protection_map[16];
165 */ 165 */
166static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) 166static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
167{ 167{
168 return (vma->vm_flags & VM_PFN_AT_MMAP); 168 return !!(vma->vm_flags & VM_PFN_AT_MMAP);
169} 169}
170 170
171static inline int is_pfn_mapping(struct vm_area_struct *vma) 171static inline int is_pfn_mapping(struct vm_area_struct *vma)
172{ 172{
173 return (vma->vm_flags & VM_PFNMAP); 173 return !!(vma->vm_flags & VM_PFNMAP);
174} 174}
175 175
176/* 176/*
@@ -1432,7 +1432,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1432 unsigned long flag, unsigned long pgoff); 1432 unsigned long flag, unsigned long pgoff);
1433extern unsigned long mmap_region(struct file *file, unsigned long addr, 1433extern unsigned long mmap_region(struct file *file, unsigned long addr,
1434 unsigned long len, unsigned long flags, 1434 unsigned long len, unsigned long flags,
1435 unsigned int vm_flags, unsigned long pgoff); 1435 vm_flags_t vm_flags, unsigned long pgoff);
1436 1436
1437static inline unsigned long do_mmap(struct file *file, unsigned long addr, 1437static inline unsigned long do_mmap(struct file *file, unsigned long addr,
1438 unsigned long len, unsigned long prot, 1438 unsigned long len, unsigned long prot,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 071d459e866b..6fe96c19f85e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -102,6 +102,8 @@ struct page {
102#endif 102#endif
103}; 103};
104 104
105typedef unsigned long __nocast vm_flags_t;
106
105/* 107/*
106 * A region containing a mapping of a non-memory backed file under NOMMU 108 * A region containing a mapping of a non-memory backed file under NOMMU
107 * conditions. These are held in a global tree and are pinned by the VMAs that 109 * conditions. These are held in a global tree and are pinned by the VMAs that
@@ -109,7 +111,7 @@ struct page {
109 */ 111 */
110struct vm_region { 112struct vm_region {
111 struct rb_node vm_rb; /* link in global region tree */ 113 struct rb_node vm_rb; /* link in global region tree */
112 unsigned long vm_flags; /* VMA vm_flags */ 114 vm_flags_t vm_flags; /* VMA vm_flags */
113 unsigned long vm_start; /* start address of region */ 115 unsigned long vm_start; /* start address of region */
114 unsigned long vm_end; /* region initialised to here */ 116 unsigned long vm_end; /* region initialised to here */
115 unsigned long vm_top; /* region allocated to here */ 117 unsigned long vm_top; /* region allocated to here */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 3686cd6c9aca..648c9c58add7 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -179,6 +179,8 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
179extern struct file *get_mm_exe_file(struct mm_struct *mm); 179extern struct file *get_mm_exe_file(struct mm_struct *mm);
180extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm); 180extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
181 181
182extern struct file *proc_ns_fget(int fd);
183
182#else 184#else
183 185
184#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) 186#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
@@ -241,6 +243,11 @@ static inline void dup_mm_exe_file(struct mm_struct *oldmm,
241 struct mm_struct *newmm) 243 struct mm_struct *newmm)
242{} 244{}
243 245
246static inline struct file *proc_ns_fget(int fd)
247{
248 return ERR_PTR(-EINVAL);
249}
250
244#endif /* CONFIG_PROC_FS */ 251#endif /* CONFIG_PROC_FS */
245 252
246#if !defined(CONFIG_PROC_KCORE) 253#if !defined(CONFIG_PROC_KCORE)
@@ -252,6 +259,18 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
252extern void kclist_add(struct kcore_list *, void *, size_t, int type); 259extern void kclist_add(struct kcore_list *, void *, size_t, int type);
253#endif 260#endif
254 261
262struct nsproxy;
263struct proc_ns_operations {
264 const char *name;
265 int type;
266 void *(*get)(struct task_struct *task);
267 void (*put)(void *ns);
268 int (*install)(struct nsproxy *nsproxy, void *ns);
269};
270extern const struct proc_ns_operations netns_operations;
271extern const struct proc_ns_operations utsns_operations;
272extern const struct proc_ns_operations ipcns_operations;
273
255union proc_op { 274union proc_op {
256 int (*proc_get_link)(struct inode *, struct path *); 275 int (*proc_get_link)(struct inode *, struct path *);
257 int (*proc_read)(struct task_struct *task, char *page); 276 int (*proc_read)(struct task_struct *task, char *page);
@@ -270,6 +289,8 @@ struct proc_inode {
270 struct proc_dir_entry *pde; 289 struct proc_dir_entry *pde;
271 struct ctl_table_header *sysctl; 290 struct ctl_table_header *sysctl;
272 struct ctl_table *sysctl_entry; 291 struct ctl_table *sysctl_entry;
292 void *ns;
293 const struct proc_ns_operations *ns_ops;
273 struct inode vfs_inode; 294 struct inode vfs_inode;
274}; 295};
275 296
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ab71447d0c5a..8c03b98df5f9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -846,4 +846,5 @@ asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
846asmlinkage long sys_open_by_handle_at(int mountdirfd, 846asmlinkage long sys_open_by_handle_at(int mountdirfd,
847 struct file_handle __user *handle, 847 struct file_handle __user *handle,
848 int flags); 848 int flags);
849asmlinkage long sys_setns(int fd, int nstype);
849#endif 850#endif
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 3ae491932bc8..dcc8f5749d3f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -119,6 +119,7 @@ static inline struct net *copy_net_ns(unsigned long flags, struct net *net_ns)
119extern struct list_head net_namespace_list; 119extern struct list_head net_namespace_list;
120 120
121extern struct net *get_net_ns_by_pid(pid_t pid); 121extern struct net *get_net_ns_by_pid(pid_t pid);
122extern struct net *get_net_ns_by_fd(int fd);
122 123
123#ifdef CONFIG_NET_NS 124#ifdef CONFIG_NET_NS
124extern void __put_net(struct net *net); 125extern void __put_net(struct net *net);
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index b33257bc7e83..70213b4515eb 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -58,6 +58,7 @@
58#define __HYPERVISOR_event_channel_op 32 58#define __HYPERVISOR_event_channel_op 32
59#define __HYPERVISOR_physdev_op 33 59#define __HYPERVISOR_physdev_op 33
60#define __HYPERVISOR_hvm_op 34 60#define __HYPERVISOR_hvm_op 34
61#define __HYPERVISOR_tmem_op 38
61 62
62/* Architecture-specific hypercall definitions. */ 63/* Architecture-specific hypercall definitions. */
63#define __HYPERVISOR_arch_0 48 64#define __HYPERVISOR_arch_0 48
@@ -461,6 +462,27 @@ typedef uint8_t xen_domain_handle_t[16];
461#define __mk_unsigned_long(x) x ## UL 462#define __mk_unsigned_long(x) x ## UL
462#define mk_unsigned_long(x) __mk_unsigned_long(x) 463#define mk_unsigned_long(x) __mk_unsigned_long(x)
463 464
465#define TMEM_SPEC_VERSION 1
466
467struct tmem_op {
468 uint32_t cmd;
469 int32_t pool_id;
470 union {
471 struct { /* for cmd == TMEM_NEW_POOL */
472 uint64_t uuid[2];
473 uint32_t flags;
474 } new;
475 struct {
476 uint64_t oid[3];
477 uint32_t index;
478 uint32_t tmem_offset;
479 uint32_t pfn_offset;
480 uint32_t len;
481 GUEST_HANDLE(void) gmfn; /* guest machine page frame */
482 } gen;
483 } u;
484};
485
464#else /* __ASSEMBLY__ */ 486#else /* __ASSEMBLY__ */
465 487
466/* In assembly code we cannot use C numeric constant suffixes. */ 488/* In assembly code we cannot use C numeric constant suffixes. */
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 8054c8e5faf1..ce0a647869b1 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -12,6 +12,7 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/mount.h> 13#include <linux/mount.h>
14#include <linux/user_namespace.h> 14#include <linux/user_namespace.h>
15#include <linux/proc_fs.h>
15 16
16#include "util.h" 17#include "util.h"
17 18
@@ -140,3 +141,39 @@ void put_ipc_ns(struct ipc_namespace *ns)
140 free_ipc_ns(ns); 141 free_ipc_ns(ns);
141 } 142 }
142} 143}
144
145static void *ipcns_get(struct task_struct *task)
146{
147 struct ipc_namespace *ns = NULL;
148 struct nsproxy *nsproxy;
149
150 rcu_read_lock();
151 nsproxy = task_nsproxy(task);
152 if (nsproxy)
153 ns = get_ipc_ns(nsproxy->ipc_ns);
154 rcu_read_unlock();
155
156 return ns;
157}
158
159static void ipcns_put(void *ns)
160{
161 put_ipc_ns(ns);
162}
163
164static int ipcns_install(struct nsproxy *nsproxy, void *ns)
165{
166 /* Ditch state from the old ipc namespace */
167 exit_sem(current);
168 put_ipc_ns(nsproxy->ipc_ns);
169 nsproxy->ipc_ns = get_ipc_ns(ns);
170 return 0;
171}
172
173const struct proc_ns_operations ipcns_operations = {
174 .name = "ipc",
175 .type = CLONE_NEWIPC,
176 .get = ipcns_get,
177 .put = ipcns_put,
178 .install = ipcns_install,
179};
diff --git a/ipc/shm.c b/ipc/shm.c
index 729acb7e3148..ab3385a21b27 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -347,7 +347,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
347 struct file * file; 347 struct file * file;
348 char name[13]; 348 char name[13];
349 int id; 349 int id;
350 int acctflag = 0; 350 vm_flags_t acctflag = 0;
351 351
352 if (size < SHMMIN || size > ns->shm_ctlmax) 352 if (size < SHMMIN || size > ns->shm_ctlmax)
353 return -EINVAL; 353 return -EINVAL;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..5424e37673ed 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -233,6 +236,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 236 switch_task_namespaces(p, NULL);
234} 237}
235 238
239SYSCALL_DEFINE2(setns, int, fd, int, nstype)
240{
241 const struct proc_ns_operations *ops;
242 struct task_struct *tsk = current;
243 struct nsproxy *new_nsproxy;
244 struct proc_inode *ei;
245 struct file *file;
246 int err;
247
248 if (!capable(CAP_SYS_ADMIN))
249 return -EPERM;
250
251 file = proc_ns_fget(fd);
252 if (IS_ERR(file))
253 return PTR_ERR(file);
254
255 err = -EINVAL;
256 ei = PROC_I(file->f_dentry->d_inode);
257 ops = ei->ns_ops;
258 if (nstype && (ops->type != nstype))
259 goto out;
260
261 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
262 if (IS_ERR(new_nsproxy)) {
263 err = PTR_ERR(new_nsproxy);
264 goto out;
265 }
266
267 err = ops->install(new_nsproxy, ei->ns);
268 if (err) {
269 free_nsproxy(new_nsproxy);
270 goto out;
271 }
272 switch_task_namespaces(tsk, new_nsproxy);
273out:
274 fput(file);
275 return err;
276}
277
236static int __init nsproxy_cache_init(void) 278static int __init nsproxy_cache_init(void)
237{ 279{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 280 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
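At this point in the series the syscall has no glibc wrapper, so userspace invokes it via syscall(2). A hedged sketch of joining another task's network namespace through one of the new /proc/<pid>/ns files (the availability of __NR_setns in the installed headers is an assumption):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>		/* CLONE_NEWNET */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(int argc, char *argv[])
{
	int fd;

	if (argc < 3) {
		fprintf(stderr, "usage: %s /proc/PID/ns/net cmd...\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * A second argument of 0 accepts any namespace type;
	 * CLONE_NEWNET insists the fd really is a net namespace.
	 */
	if (syscall(__NR_setns, fd, CLONE_NEWNET) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	execvp(argv[2], &argv[2]);	/* run a command in the joined ns */
	perror("execvp");
	return 1;
}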
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
18 19
19static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
20{ 21{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
79 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
80 kfree(ns); 81 kfree(ns);
81} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
diff --git a/mm/Kconfig b/mm/Kconfig
index e9c0c61f2ddd..8ca47a5ee9c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM
347 depends on !SMP 347 depends on !SMP
348 bool 348 bool
349 default y 349 default y
350
351config CLEANCACHE
352 bool "Enable cleancache driver to cache clean pages if tmem is present"
353 default n
354 help
355 Cleancache can be thought of as a page-granularity victim cache
356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled
363 filesystem wishes to access a page in a file on disk, it first
364 checks cleancache to see if it already contains it; if it does,
365 the page is copied into the kernel and a disk access is avoided.
366 When a transcendent memory driver is available (such as zcache or
367 Xen transcendent memory), a significant I/O reduction
368 may be achieved. When none is available, all cleancache calls
369 are reduced to a single pointer-compare-against-NULL resulting
370 in a negligible performance hit.
371
372 If unsure, say Y to enable cleancache
diff --git a/mm/Makefile b/mm/Makefile
index 42a8326c3e3d..836e4163c1bf 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000000..bcaae4c2a770
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
1/*
2 * Cleancache frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of cleancache. See
6 * Documentation/vm/cleancache.txt for more information.
7 *
8 * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/module.h>
15#include <linux/fs.h>
16#include <linux/exportfs.h>
17#include <linux/mm.h>
18#include <linux/cleancache.h>
19
20/*
21 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global.
26 */
27int cleancache_enabled;
28EXPORT_SYMBOL(cleancache_enabled);
29
30/*
31 * cleancache_ops is set by cleancache_register_ops to contain the pointers
32 * to the cleancache "backend" implementation functions.
33 */
34static struct cleancache_ops cleancache_ops;
35
36/* useful stats available in /sys/kernel/mm/cleancache */
37static unsigned long cleancache_succ_gets;
38static unsigned long cleancache_failed_gets;
39static unsigned long cleancache_puts;
40static unsigned long cleancache_flushes;
41
42/*
43 * register operations for cleancache, returning the previous ops to
44 * allow detection of multiple backends and possible nesting
45 */
46struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
47{
48 struct cleancache_ops old = cleancache_ops;
49
50 cleancache_ops = *ops;
51 cleancache_enabled = 1;
52 return old;
53}
54EXPORT_SYMBOL(cleancache_register_ops);
55
56/* Called by a cleancache-enabled filesystem at time of mount */
57void __cleancache_init_fs(struct super_block *sb)
58{
59 sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
60}
61EXPORT_SYMBOL(__cleancache_init_fs);
62
63/* Called by a cleancache-enabled clustered filesystem at time of mount */
64void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
65{
66 sb->cleancache_poolid =
67 (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
68}
69EXPORT_SYMBOL(__cleancache_init_shared_fs);
70
71/*
72 * If the filesystem uses exportable filehandles, use the filehandle as
73 * the key, else use the inode number.
74 */
75static int cleancache_get_key(struct inode *inode,
76 struct cleancache_filekey *key)
77{
78 int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
79 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
80 struct super_block *sb = inode->i_sb;
81
82 key->u.ino = inode->i_ino;
83 if (sb->s_export_op != NULL) {
84 fhfn = sb->s_export_op->encode_fh;
85 if (fhfn) {
86 struct dentry d;
87 d.d_inode = inode;
88 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
89 if (len <= 0 || len == 255)
90 return -1;
91 if (maxlen > CLEANCACHE_KEY_MAX)
92 return -1;
93 }
94 }
95 return 0;
96}
97
98/*
99 * "Get" data from cleancache associated with the poolid/inode/index
100 * that were specified when the data was put to cleancache and, if
101 * successful, use it to fill the specified page with data and return 0.
102 * If the get fails, the pageframe is left unchanged and -1 is returned.
103 * Page must be locked by caller.
104 */
105int __cleancache_get_page(struct page *page)
106{
107 int ret = -1;
108 int pool_id;
109 struct cleancache_filekey key = { .u.key = { 0 } };
110
111 VM_BUG_ON(!PageLocked(page));
112 pool_id = page->mapping->host->i_sb->cleancache_poolid;
113 if (pool_id < 0)
114 goto out;
115
116 if (cleancache_get_key(page->mapping->host, &key) < 0)
117 goto out;
118
119 ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
120 if (ret == 0)
121 cleancache_succ_gets++;
122 else
123 cleancache_failed_gets++;
124out:
125 return ret;
126}
127EXPORT_SYMBOL(__cleancache_get_page);
128
129/*
130 * "Put" data from a page to cleancache and associate it with the
131 * (previously-obtained per-filesystem) poolid and the page's
132 * inode and page index. Page must be locked. Note that a put_page
133 * always "succeeds", though a subsequent get_page may succeed or fail.
134 */
135void __cleancache_put_page(struct page *page)
136{
137 int pool_id;
138 struct cleancache_filekey key = { .u.key = { 0 } };
139
140 VM_BUG_ON(!PageLocked(page));
141 pool_id = page->mapping->host->i_sb->cleancache_poolid;
142 if (pool_id >= 0 &&
143 cleancache_get_key(page->mapping->host, &key) >= 0) {
144 (*cleancache_ops.put_page)(pool_id, key, page->index, page);
145 cleancache_puts++;
146 }
147}
148EXPORT_SYMBOL(__cleancache_put_page);
149
150/*
151 * Flush any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail.
153 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page)
155{
156 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid;
158 struct cleancache_filekey key = { .u.key = { 0 } };
159
160 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index);
164 cleancache_flushes++;
165 }
166 }
167}
168EXPORT_SYMBOL(__cleancache_flush_page);
169
170/*
171 * Flush all data from cleancache associated with the poolid and the
172 * mapping's inode so that all subsequent gets to this poolid/inode
173 * will fail.
174 */
175void __cleancache_flush_inode(struct address_space *mapping)
176{
177 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } };
179
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key);
182}
183EXPORT_SYMBOL(__cleancache_flush_inode);
184
185/*
186 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be returned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs
189 */
190void __cleancache_flush_fs(struct super_block *sb)
191{
192 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid);
196 }
197}
198EXPORT_SYMBOL(__cleancache_flush_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234
235static int __init init_cleancache(void)
236{
237#ifdef CONFIG_SYSFS
238 int err;
239
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
241#endif /* CONFIG_SYSFS */
242 return 0;
243}
244module_init(init_cleancache)
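A small userspace sketch for reading the counters the sysfs group above exposes (the /sys/kernel/mm/cleancache/ path follows from the attribute group name, but is an assumption about where sysfs is mounted):

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"succ_gets", "failed_gets", "puts", "flushes",
	};
	char path[64], line[32];
	unsigned int i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/kernel/mm/cleancache/%s", names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* CONFIG_CLEANCACHE off, or no sysfs */
		if (fgets(line, sizeof(line), f))
			printf("%-12s %s", names[i], line);
		fclose(f);
	}
	return 0;
}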
diff --git a/mm/filemap.c b/mm/filemap.c
index 68e782b3d3de..7455ccd8bda8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */ 36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* 40/*
@@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page)
118{ 119{
119 struct address_space *mapping = page->mapping; 120 struct address_space *mapping = page->mapping;
120 121
122 /*
123 * if we're uptodate, flush out into the cleancache, otherwise
124 * invalidate any existing cleancache entries. We can't leave
125 * stale data around in the cleancache once our page is gone
126 */
127 if (PageUptodate(page) && PageMappedToDisk(page))
128 cleancache_put_page(page);
129 else
130 cleancache_flush_page(mapping, page);
131
121 radix_tree_delete(&mapping->page_tree, page->index); 132 radix_tree_delete(&mapping->page_tree, page->index);
122 page->mapping = NULL; 133 page->mapping = NULL;
123 mapping->nrpages--; 134 mapping->nrpages--;
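The hunk above covers the eviction ("put") side; the read ("get") side is hooked into filesystem read paths elsewhere in this patch. Its shape, sketched with hypothetical do_myfs_readpage() and myfs_submit_read() names:

static int do_myfs_readpage(struct page *page)
{
	/* ask cleancache before paying for a disk read */
	if (cleancache_get_page(page) == 0) {
		/* hit: the backend copied the data into our page */
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}
	/* miss: fall back to a normal block-device read */
	return myfs_submit_read(page);
}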
diff --git a/mm/fremap.c b/mm/fremap.c
index 7f4123056e06..b8e0e2d468af 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -224,7 +224,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
224 /* 224 /*
225 * drop PG_Mlocked flag for over-mapped range 225 * drop PG_Mlocked flag for over-mapped range
226 */ 226 */
227 unsigned int saved_flags = vma->vm_flags; 227 vm_flags_t saved_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 228 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 229 vma->vm_flags = saved_flags;
230 } 230 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5fd68b95c671..f33bb319b73f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2833int hugetlb_reserve_pages(struct inode *inode, 2833int hugetlb_reserve_pages(struct inode *inode,
2834 long from, long to, 2834 long from, long to,
2835 struct vm_area_struct *vma, 2835 struct vm_area_struct *vma,
2836 int acctflag) 2836 vm_flags_t vm_flags)
2837{ 2837{
2838 long ret, chg; 2838 long ret, chg;
2839 struct hstate *h = hstate_inode(inode); 2839 struct hstate *h = hstate_inode(inode);
@@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2843 * attempt will be made for VM_NORESERVE to allocate a page 2843 * attempt will be made for VM_NORESERVE to allocate a page
2844 * and filesystem quota without using reserves 2844 * and filesystem quota without using reserves
2845 */ 2845 */
2846 if (acctflag & VM_NORESERVE) 2846 if (vm_flags & VM_NORESERVE)
2847 return 0; 2847 return 0;
2848 2848
2849 /* 2849 /*
diff --git a/mm/memory.c b/mm/memory.c
index b73f677f0bb1..fc24f7d788bd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -730,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
730 add_taint(TAINT_BAD_PAGE); 730 add_taint(TAINT_BAD_PAGE);
731} 731}
732 732
733static inline int is_cow_mapping(unsigned int flags) 733static inline int is_cow_mapping(vm_flags_t flags)
734{ 734{
735 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 735 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
736} 736}
diff --git a/mm/mlock.c b/mm/mlock.c
index 516b2c2ddd5a..048260c4e02e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
307 * For vmas that pass the filters, merge/split as appropriate. 307 * For vmas that pass the filters, merge/split as appropriate.
308 */ 308 */
309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
310 unsigned long start, unsigned long end, unsigned int newflags) 310 unsigned long start, unsigned long end, vm_flags_t newflags)
311{ 311{
312 struct mm_struct *mm = vma->vm_mm; 312 struct mm_struct *mm = vma->vm_mm;
313 pgoff_t pgoff; 313 pgoff_t pgoff;
314 int nr_pages; 314 int nr_pages;
315 int ret = 0; 315 int ret = 0;
316 int lock = newflags & VM_LOCKED; 316 int lock = !!(newflags & VM_LOCKED);
317 317
318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) 319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
385 prev = vma; 385 prev = vma;
386 386
387 for (nstart = start ; ; ) { 387 for (nstart = start ; ; ) {
388 unsigned int newflags; 388 vm_flags_t newflags;
389 389
390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
391 391
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
524 goto out; 524 goto out;
525 525
526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
527 unsigned int newflags; 527 vm_flags_t newflags;
528 528
529 newflags = vma->vm_flags | VM_LOCKED; 529 newflags = vma->vm_flags | VM_LOCKED;
530 if (!(flags & MCL_CURRENT)) 530 if (!(flags & MCL_CURRENT))
diff --git a/mm/mmap.c b/mm/mmap.c
index ac2631b7477f..bbdc9af5e117 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
960{ 960{
961 struct mm_struct * mm = current->mm; 961 struct mm_struct * mm = current->mm;
962 struct inode *inode; 962 struct inode *inode;
963 unsigned int vm_flags; 963 vm_flags_t vm_flags;
964 int error; 964 int error;
965 unsigned long reqprot = prot; 965 unsigned long reqprot = prot;
966 966
@@ -1165,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1165 */ 1165 */
1166int vma_wants_writenotify(struct vm_area_struct *vma) 1166int vma_wants_writenotify(struct vm_area_struct *vma)
1167{ 1167{
1168 unsigned int vm_flags = vma->vm_flags; 1168 vm_flags_t vm_flags = vma->vm_flags;
1169 1169
1170 /* If it was private or non-writable, the write bit is already clear */ 1170 /* If it was private or non-writable, the write bit is already clear */
1171 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1171 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1193,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1193 * We account for memory if it's a private writeable mapping, 1193 * We account for memory if it's a private writeable mapping,
1194 * not hugepages and VM_NORESERVE wasn't set. 1194 * not hugepages and VM_NORESERVE wasn't set.
1195 */ 1195 */
1196static inline int accountable_mapping(struct file *file, unsigned int vm_flags) 1196static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1197{ 1197{
1198 /* 1198 /*
1199 * hugetlb has its own accounting separate from the core VM 1199 * hugetlb has its own accounting separate from the core VM
@@ -1207,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1207 1207
1208unsigned long mmap_region(struct file *file, unsigned long addr, 1208unsigned long mmap_region(struct file *file, unsigned long addr,
1209 unsigned long len, unsigned long flags, 1209 unsigned long len, unsigned long flags,
1210 unsigned int vm_flags, unsigned long pgoff) 1210 vm_flags_t vm_flags, unsigned long pgoff)
1211{ 1211{
1212 struct mm_struct *mm = current->mm; 1212 struct mm_struct *mm = current->mm;
1213 struct vm_area_struct *vma, *prev; 1213 struct vm_area_struct *vma, *prev;
diff --git a/mm/slub.c b/mm/slub.c
index 4aad32d2e60d..7be0223531b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1831,7 +1831,6 @@ load_freelist:
1831 page->inuse = page->objects; 1831 page->inuse = page->objects;
1832 page->freelist = NULL; 1832 page->freelist = NULL;
1833 1833
1834unlock_out:
1835 slab_unlock(page); 1834 slab_unlock(page);
1836 c->tid = next_tid(c->tid); 1835 c->tid = next_tid(c->tid);
1837 local_irq_restore(flags); 1836 local_irq_restore(flags);
diff --git a/mm/truncate.c b/mm/truncate.c
index a95667529135..3a29a6180212 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
19#include <linux/task_io_accounting_ops.h> 19#include <linux/task_io_accounting_ops.h>
20#include <linux/buffer_head.h> /* grr. try_to_release_page, 20#include <linux/buffer_head.h> /* grr. try_to_release_page,
21 do_invalidatepage */ 21 do_invalidatepage */
22#include <linux/cleancache.h>
22#include "internal.h" 23#include "internal.h"
23 24
24 25
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
51static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
52{ 53{
53 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page);
54 if (page_has_private(page)) 56 if (page_has_private(page))
55 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
56} 58}
@@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
214 pgoff_t next; 216 pgoff_t next;
215 int i; 217 int i;
216 218
219 cleancache_flush_inode(mapping);
217 if (mapping->nrpages == 0) 220 if (mapping->nrpages == 0)
218 return; 221 return;
219 222
@@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
291 pagevec_release(&pvec); 294 pagevec_release(&pvec);
292 mem_cgroup_uncharge_end(); 295 mem_cgroup_uncharge_end();
293 } 296 }
297 cleancache_flush_inode(mapping);
294} 298}
295EXPORT_SYMBOL(truncate_inode_pages_range); 299EXPORT_SYMBOL(truncate_inode_pages_range);
296 300
@@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
440 int did_range_unmap = 0; 444 int did_range_unmap = 0;
441 int wrapped = 0; 445 int wrapped = 0;
442 446
447 cleancache_flush_inode(mapping);
443 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
444 next = start; 449 next = start;
445 while (next <= end && !wrapped && 450 while (next <= end && !wrapped &&
@@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
498 mem_cgroup_uncharge_end(); 503 mem_cgroup_uncharge_end();
499 cond_resched(); 504 cond_resched();
500 } 505 }
506 cleancache_flush_inode(mapping);
501 return ret; 507 return ret;
502} 508}
503EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 509EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2e2dce6583e1..6c6b86d0da15 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -8,6 +8,8 @@
8#include <linux/idr.h> 8#include <linux/idr.h>
9#include <linux/rculist.h> 9#include <linux/rculist.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/proc_fs.h>
12#include <linux/file.h>
11#include <net/net_namespace.h> 13#include <net/net_namespace.h>
12#include <net/netns/generic.h> 14#include <net/netns/generic.h>
13 15
@@ -302,6 +304,28 @@ void __put_net(struct net *net)
302} 304}
303EXPORT_SYMBOL_GPL(__put_net); 305EXPORT_SYMBOL_GPL(__put_net);
304 306
307struct net *get_net_ns_by_fd(int fd)
308{
309 struct proc_inode *ei;
310 struct file *file;
311 struct net *net;
312
313 net = ERR_PTR(-EINVAL);
314 file = proc_ns_fget(fd);
315 if (IS_ERR(file))
316 goto out;
317
318 ei = PROC_I(file->f_dentry->d_inode);
319 if (ei->ns_ops != &netns_operations)
320 goto out;
321
322 net = get_net(ei->ns);
323out:
324 if (!IS_ERR(file))
325 fput(file);
326 return net;
327}
328
305#else 329#else
306struct net *copy_net_ns(unsigned long flags, struct net *old_net) 330struct net *copy_net_ns(unsigned long flags, struct net *old_net)
307{ 331{
@@ -309,6 +333,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
309 return ERR_PTR(-EINVAL); 333 return ERR_PTR(-EINVAL);
310 return old_net; 334 return old_net;
311} 335}
336
337struct net *get_net_ns_by_fd(int fd)
338{
339 return ERR_PTR(-EINVAL);
340}
312#endif 341#endif
313 342
314struct net *get_net_ns_by_pid(pid_t pid) 343struct net *get_net_ns_by_pid(pid_t pid)
@@ -561,3 +590,39 @@ void unregister_pernet_device(struct pernet_operations *ops)
561 mutex_unlock(&net_mutex); 590 mutex_unlock(&net_mutex);
562} 591}
563EXPORT_SYMBOL_GPL(unregister_pernet_device); 592EXPORT_SYMBOL_GPL(unregister_pernet_device);
593
594#ifdef CONFIG_NET_NS
595static void *netns_get(struct task_struct *task)
596{
597 struct net *net = NULL;
598 struct nsproxy *nsproxy;
599
600 rcu_read_lock();
601 nsproxy = task_nsproxy(task);
602 if (nsproxy)
603 net = get_net(nsproxy->net_ns);
604 rcu_read_unlock();
605
606 return net;
607}
608
609static void netns_put(void *ns)
610{
611 put_net(ns);
612}
613
614static int netns_install(struct nsproxy *nsproxy, void *ns)
615{
616 put_net(nsproxy->net_ns);
617 nsproxy->net_ns = get_net(ns);
618 return 0;
619}
620
621const struct proc_ns_operations netns_operations = {
622 .name = "net",
623 .type = CLONE_NEWNET,
624 .get = netns_get,
625 .put = netns_put,
626 .install = netns_install,
627};
628#endif
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2d56cb9b0b94..abd936d8a716 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1046,6 +1046,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1046 [IFLA_LINKMODE] = { .type = NLA_U8 }, 1046 [IFLA_LINKMODE] = { .type = NLA_U8 },
1047 [IFLA_LINKINFO] = { .type = NLA_NESTED }, 1047 [IFLA_LINKINFO] = { .type = NLA_NESTED },
1048 [IFLA_NET_NS_PID] = { .type = NLA_U32 }, 1048 [IFLA_NET_NS_PID] = { .type = NLA_U32 },
1049 [IFLA_NET_NS_FD] = { .type = NLA_U32 },
1049 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, 1050 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
1050 [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, 1051 [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
1051 [IFLA_VF_PORTS] = { .type = NLA_NESTED }, 1052 [IFLA_VF_PORTS] = { .type = NLA_NESTED },
@@ -1094,6 +1095,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
1094 */ 1095 */
1095 if (tb[IFLA_NET_NS_PID]) 1096 if (tb[IFLA_NET_NS_PID])
1096 net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); 1097 net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
1098 else if (tb[IFLA_NET_NS_FD])
1099 net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
1097 else 1100 else
1098 net = get_net(src_net); 1101 net = get_net(src_net);
1099 return net; 1102 return net;
@@ -1224,7 +1227,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
1224 int send_addr_notify = 0; 1227 int send_addr_notify = 0;
1225 int err; 1228 int err;
1226 1229
1227 if (tb[IFLA_NET_NS_PID]) { 1230 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
1228 struct net *net = rtnl_link_get_net(dev_net(dev), tb); 1231 struct net *net = rtnl_link_get_net(dev_net(dev), tb);
1229 if (IS_ERR(net)) { 1232 if (IS_ERR(net)) {
1230 err = PTR_ERR(net); 1233 err = PTR_ERR(net);