157 files changed, 4790 insertions, 2910 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 51c94e26a346..e777961939f3 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
        BUG_ON(!vcookie->fscache);
-        if (PageFsCache(page)) {
+        return fscache_maybe_release_page(vcookie->fscache, page, gfp);
-                if (fscache_check_page_write(vcookie->fscache, page)) {
-                        if (!(gfp & __GFP_WAIT))
-                                return 0;
-                        fscache_wait_on_page_write(vcookie->fscache, page);
-                }
-                fscache_uncache_page(vcookie->fscache, page);
-                ClearPageFsCache(page);
-        }
-        return 1;
 }
 void __v9fs_fscache_invalidate_page(struct page *page)
@@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page)
                fscache_wait_on_page_write(vcookie->fscache, page);
                BUG_ON(!PageLocked(page));
                fscache_uncache_page(vcookie->fscache, page);
-                ClearPageFsCache(page);
        }
 }
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 681c2a7b013f..39b301662f22 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset)
                        struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
                        fscache_wait_on_page_write(vnode->cache, page);
                        fscache_uncache_page(vnode->cache, page);
-                        ClearPageFsCache(page);
                }
 #endif
@@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
        /* deny if page is being written to the cache and the caller hasn't
         * elected to wait */
 #ifdef CONFIG_AFS_FSCACHE
-        if (PageFsCache(page)) {
+        if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
-                if (fscache_check_page_write(vnode->cache, page)) {
+                _leave(" = F [cache busy]");
-                        if (!(gfp_flags & __GFP_WAIT)) {
+                return 0;
-                                _leave(" = F [cache busy]");
-                                return 0;
-                        }
-                        fscache_wait_on_page_write(vnode->cache, page);
-                }
-                fscache_uncache_page(vnode->cache, page);
-                ClearPageFsCache(page);
        }
 #endif
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..c30dfc006108 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 #include <linux/uio.h>
 #define DEBUG 0
@@ -32,6 +33,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
+#define AIO_BATCH_HASH_BITS     3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE     (1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+        struct hlist_node list;
+        struct address_space *mapping;
+};
+mempool_t *abe_pool;
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        aio_wq = create_workqueue("aio");
+        abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+        BUG_ON(!abe_pool);
        pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
@@ -1531,8 +1545,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
        return 1;
 }
+static void aio_batch_add(struct address_space *mapping,
+                          struct hlist_head *batch_hash)
+{       /* note @mapping in @batch_hash (at most once) for aio_batch_free() */
+        struct aio_batch_entry *abe;
+        struct hlist_node *pos;
+        unsigned bucket;
+        bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS); /* bucket is chosen from the pointer value */
+        hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+                if (abe->mapping == mapping)
+                        return; /* already recorded in this batch */
+        }
+        abe = mempool_alloc(abe_pool, GFP_KERNEL);
+        BUG_ON(!igrab(mapping->host)); /* matched by the iput() in aio_batch_free() */
+        abe->mapping = mapping;
+        hlist_add_head(&abe->list, &batch_hash[bucket]);
+        return;
+}
+static void aio_batch_free(struct hlist_head *batch_hash)
+{       /* call blk_run_address_space() on every batched mapping, then free its entry */
+        struct aio_batch_entry *abe;
+        struct hlist_node *pos, *n;
+        int i;
+        for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) { /* visit every bucket */
+                hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+                        blk_run_address_space(abe->mapping);
+                        iput(abe->mapping->host); /* matches the igrab() in aio_batch_add() */
+                        hlist_del(&abe->list); /* entry is removed mid-walk, hence the _safe iterator */
+                        mempool_free(abe, abe_pool);
+                }
+        }
+}
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-                         struct iocb *iocb)
+                         struct iocb *iocb, struct hlist_head *batch_hash)
 {
        struct kiocb *req;
        struct file *file;
@@ -1608,6 +1658,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                        ;
        }
        spin_unlock_irq(&ctx->ctx_lock);
+        if (req->ki_opcode == IOCB_CMD_PREAD ||
+            req->ki_opcode == IOCB_CMD_PREADV ||
+            req->ki_opcode == IOCB_CMD_PWRITE ||
+            req->ki_opcode == IOCB_CMD_PWRITEV)
+                aio_batch_add(file->f_mapping, batch_hash);
        aio_put_req(req);       /* drop extra ref to req */
        return 0;
@@ -1635,6 +1691,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
        struct kioctx *ctx;
        long ret = 0;
        int i;
+        struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
        if (unlikely(nr < 0))
                return -EINVAL;
@@ -1666,10 +1723,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
                        break;
                }
-                ret = io_submit_one(ctx, user_iocb, &tmp);
+                ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
                if (ret)
                        break;
        }
+        aio_batch_free(batch_hash);
        put_ioctx(ctx);
        return i ? i : ret;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e4..d15ea1790bfb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -767,7 +767,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        
        current->mm->start_stack = bprm->p;
-        /* Now we do a little grungy work by mmaping the ELF image into
+        /* Now we do a little grungy work by mmapping the ELF image into
           the correct location in memory. */
        for(i = 0, elf_ppnt = elf_phdata;
            i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..76e6713abf94 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(bio_init);
 *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
 *   fall back to just using @kmalloc to allocate the required memory.
 *
- *   Note that the caller must set ->bi_destructor on succesful return
+ *   Note that the caller must set ->bi_destructor on successful return
 *   of a bio, to do the appropriate freeing of the bio once the reference
 *   count drops to zero.
 **/
@@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio)
        }
 }
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void bio_flush_dcache_pages(struct bio *bi)
+{       /* call flush_dcache_page() on the page behind each segment of @bi */
+        int i;
+        struct bio_vec *bvec;
+        bio_for_each_segment(bvec, bi, i)
+                flush_dcache_page(bvec->bv_page);
+}
+EXPORT_SYMBOL(bio_flush_dcache_pages);
+#endif
 /**
 * bio_endio - end I/O on a bio
 * @bio:        bio
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88c..73d6a735b8f3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 
 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
-        return sync_blockdev(I_BDEV(filp->f_mapping->host));
+        struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+        int error;
+        error = sync_blockdev(bdev);
+        if (error)
+                return error; /* sync failed: the flush below is not attempted */
+        
+        error = blkdev_issue_flush(bdev, NULL);
+        if (error == -EOPNOTSUPP)
+                error = 0; /* an unsupported flush is not reported as a failure */
+        return error;
 }
 /*
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ccbdcb54ec5d..46bea0f4dc7b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -256,7 +256,7 @@ out:
 * Insert @em into @tree or perform a simple forward/backward merge with
 * existing mappings.  The extent_map struct passed in will be inserted
 * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was sucessfull.
+ * reference dropped if the merge attempt was successful.
 */
 int add_extent_mapping(struct extent_map_tree *tree,
                       struct extent_map *em)
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 431accd475a7..27089311fbea 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -114,8 +114,9 @@ nomem_lookup_data:
 /*
 * attempt to look up the nominated node in this cache
+ * - return -ETIMEDOUT to be scheduled again
 */
-static void cachefiles_lookup_object(struct fscache_object *_object)
+static int cachefiles_lookup_object(struct fscache_object *_object)
 {
        struct cachefiles_lookup_data *lookup_data;
        struct cachefiles_object *parent, *object;
@@ -145,13 +146,15 @@ static void cachefiles_lookup_object(struct fscache_object *_object)
            object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
                cachefiles_attr_changed(&object->fscache);
-        if (ret < 0) {
+        if (ret < 0 && ret != -ETIMEDOUT) {
-                printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
+                if (ret != -ENOBUFS)
-                       ret);
+                        printk(KERN_WARNING
+                               "CacheFiles: Lookup failed error %d\n", ret);
                fscache_object_lookup_error(&object->fscache);
        }
        _leave(" [%d]", ret);
+        return ret;
 }
 /*
@@ -331,6 +334,7 @@ static void cachefiles_put_object(struct fscache_object *_object)
                }
                cache = object->fscache.cache;
+                fscache_object_destroy(&object->fscache);
                kmem_cache_free(cachefiles_object_jar, object);
                fscache_object_destroyed(cache);
        }
@@ -403,12 +407,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
        if (oi_size == ni_size)
                return 0;
-        newattrs.ia_size = ni_size;
-        newattrs.ia_valid = ATTR_SIZE;
        cachefiles_begin_secure(cache, &saved_cred);
        mutex_lock(&object->backer->d_inode->i_mutex);
+        /* if there's an extension to a partial page at the end of the backing
+         * file, we need to discard the partial page so that we pick up new
+         * data after it */
+        if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
+                _debug("discard tail %llx", oi_size);
+                newattrs.ia_valid = ATTR_SIZE;
+                newattrs.ia_size = oi_size & PAGE_MASK;
+                ret = notify_change(object->backer, &newattrs);
+                if (ret < 0)
+                        goto truncate_failed;
+        }
+        newattrs.ia_valid = ATTR_SIZE;
+        newattrs.ia_size = ni_size;
        ret = notify_change(object->backer, &newattrs);
+truncate_failed:
        mutex_unlock(&object->backer->d_inode->i_mutex);
        cachefiles_end_secure(cache, saved_cred);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 4ce818ae39ea..14ac4806e291 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -21,17 +21,81 @@
 #include <linux/security.h>
 #include "internal.h"
-static int cachefiles_wait_bit(void *flags)
+#define CACHEFILES_KEYBUF_SIZE 512
+/*
+ * dump debugging info about an object
+ */
+static noinline
+void __cachefiles_printk_object(struct cachefiles_object *object,
+                                const char *prefix,
+                                u8 *keybuf)
 {
-        schedule();
+        struct fscache_cookie *cookie;
-        return 0;
+        unsigned keylen, loop;
+        printk(KERN_ERR "%sobject: OBJ%x\n",
+               prefix, object->fscache.debug_id);
+        printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
+               prefix, fscache_object_states[object->fscache.state],
+               object->fscache.flags, object->fscache.work.flags,
+               object->fscache.events,
+               object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
+        printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
+               prefix, object->fscache.n_ops, object->fscache.n_in_progress,
+               object->fscache.n_exclusive);
+        printk(KERN_ERR "%sparent=%p\n",
+               prefix, object->fscache.parent);
+        spin_lock(&object->fscache.lock); /* cookie is read and its key fetched under this lock */
+        cookie = object->fscache.cookie;
+        if (cookie) {
+                printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n",
+                       prefix,
+                       object->fscache.cookie,
+                       object->fscache.cookie->parent,
+                       object->fscache.cookie->netfs_data,
+                       object->fscache.cookie->flags);
+                if (keybuf) /* a NULL @keybuf simply suppresses the key dump */
+                        keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
+                                                      CACHEFILES_KEYBUF_SIZE);
+                else
+                        keylen = 0;
+        } else {
+                printk(KERN_ERR "%scookie=NULL\n", prefix);
+                keylen = 0;
+        }
+        spin_unlock(&object->fscache.lock);
+        if (keylen) { /* key bytes are printed as hex once the lock is dropped */
+                printk(KERN_ERR "%skey=[%u] '", prefix, keylen);
+                for (loop = 0; loop < keylen; loop++)
+                        printk("%02x", keybuf[loop]);
+                printk("'\n");
+        }
+}
+/*
+ * dump debugging info about a pair of objects
+ */
+static noinline void cachefiles_printk_object(struct cachefiles_object *object,
+                                              struct cachefiles_object *xobject)
+{
+        u8 *keybuf;
+        keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); /* result is not checked here; the helper tests it before use */
+        if (object)
+                __cachefiles_printk_object(object, "", keybuf);
+        if (xobject)
+                __cachefiles_printk_object(xobject, "x", keybuf); /* "x" is prepended to every line printed for @xobject */
+        kfree(keybuf);
 }
 /*
 * record the fact that an object is now active
 */
-static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
+static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
-                                          struct cachefiles_object *object)
+                                         struct cachefiles_object *object)
 {
        struct cachefiles_object *xobject;
        struct rb_node **_p, *_parent = NULL;
@@ -42,8 +106,11 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
 try_again:
        write_lock(&cache->active_lock);
-        if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+        if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+                printk(KERN_ERR "CacheFiles: Error: Object already active\n");
+                cachefiles_printk_object(object, NULL);
                BUG();
+        }
        dentry = object->dentry;
        _p = &cache->active_nodes.rb_node;
@@ -66,8 +133,8 @@ try_again:
        rb_insert_color(&object->active_node, &cache->active_nodes);
        write_unlock(&cache->active_lock);
-        _leave("");
+        _leave(" = 0");
-        return;
+        return 0;
        /* an old object from a previous incarnation is hogging the slot - we
         * need to wait for it to be destroyed */
@@ -76,44 +143,70 @@ wait_for_old_object:
                printk(KERN_ERR "\n");
                printk(KERN_ERR "CacheFiles: Error:"
                       " Unexpected object collision\n");
-                printk(KERN_ERR "xobject: OBJ%x\n",
+                cachefiles_printk_object(object, xobject);
-                       xobject->fscache.debug_id);
-                printk(KERN_ERR "xobjstate=%s\n",
-                       fscache_object_states[xobject->fscache.state]);
-                printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
-                printk(KERN_ERR "xobjevent=%lx [%lx]\n",
-                       xobject->fscache.events, xobject->fscache.event_mask);
-                printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
-                       xobject->fscache.n_ops, xobject->fscache.n_in_progress,
-                       xobject->fscache.n_exclusive);
-                printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
-                       xobject->fscache.cookie,
-                       xobject->fscache.cookie->parent,
-                       xobject->fscache.cookie->netfs_data,
-                       xobject->fscache.cookie->flags);
-                printk(KERN_ERR "xparent=%p\n",
-                       xobject->fscache.parent);
-                printk(KERN_ERR "object: OBJ%x\n",
-                       object->fscache.debug_id);
-                printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
-                       object->fscache.cookie,
-                       object->fscache.cookie->parent,
-                       object->fscache.cookie->netfs_data,
-                       object->fscache.cookie->flags);
-                printk(KERN_ERR "parent=%p\n",
-                       object->fscache.parent);
                BUG();
        }
        atomic_inc(&xobject->usage);
        write_unlock(&cache->active_lock);
-        _debug(">>> wait");
+        if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
-        wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
+                wait_queue_head_t *wq;
-                    cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
-        _debug("<<< waited");
+                signed long timeout = 60 * HZ;
+                wait_queue_t wait;
+                bool requeue;
+                /* if the object we're waiting for is queued for processing,
+                 * then just put ourselves on the queue behind it */
+                if (slow_work_is_queued(&xobject->fscache.work)) {
+                        _debug("queue OBJ%x behind OBJ%x immediately",
+                               object->fscache.debug_id,
+                               xobject->fscache.debug_id);
+                        goto requeue;
+                }
+                /* otherwise we sleep until either the object we're waiting for
+                 * is done, or the slow-work facility wants the thread back to
+                 * do other work */
+                wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
+                init_wait(&wait);
+                requeue = false;
+                do {
+                        prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+                        if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
+                                break;
+                        requeue = slow_work_sleep_till_thread_needed(
+                                &object->fscache.work, &timeout);
+                } while (timeout > 0 && !requeue);
+                finish_wait(wq, &wait);
+                if (requeue &&
+                    test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+                        _debug("queue OBJ%x behind OBJ%x after wait",
+                               object->fscache.debug_id,
+                               xobject->fscache.debug_id);
+                        goto requeue;
+                }
+                if (timeout <= 0) {
+                        printk(KERN_ERR "\n");
+                        printk(KERN_ERR "CacheFiles: Error: Overlong"
+                               " wait for old active object to go away\n");
+                        cachefiles_printk_object(object, xobject);
+                        goto requeue;
+                }
+        }
+        ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
        cache->cache.ops->put_object(&xobject->fscache);
        goto try_again;
+requeue:
+        clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+        cache->cache.ops->put_object(&xobject->fscache);
+        _leave(" = -ETIMEDOUT");
+        return -ETIMEDOUT;
 }
 /*
@@ -254,7 +347,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
        dir = dget_parent(object->dentry);
-        mutex_lock(&dir->d_inode->i_mutex);
+        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
        ret = cachefiles_bury_object(cache, dir, object->dentry);
        dput(dir);
@@ -307,7 +400,7 @@ lookup_again:
        /* search the current directory for the element name */
        _debug("lookup '%s'", name);
-        mutex_lock(&dir->d_inode->i_mutex);
+        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
        start = jiffies;
        next = lookup_one_len(name, dir, nlen);
@@ -418,12 +511,15 @@ lookup_again:
        }
        /* note that we're now using this object */
-        cachefiles_mark_object_active(cache, object);
+        ret = cachefiles_mark_object_active(cache, object);
        mutex_unlock(&dir->d_inode->i_mutex);
        dput(dir);
        dir = NULL;
+        if (ret == -ETIMEDOUT)
+                goto mark_active_timed_out;
        _debug("=== OBTAINED_OBJECT ===");
        if (object->new) {
@@ -467,6 +563,10 @@ create_error:
                cachefiles_io_error(cache, "Create/mkdir failed");
        goto error;
+mark_active_timed_out:
+        _debug("mark active timed out");
+        goto release_dentry;
 check_error:
        _debug("check error %d", ret);
        write_lock(&cache->active_lock);
@@ -474,7 +574,7 @@ check_error:
        clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
        wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
        write_unlock(&cache->active_lock);
+release_dentry:
        dput(object->dentry);
        object->dentry = NULL;
        goto error_out;
@@ -495,9 +595,6 @@ error:
 error_out2:
        dput(dir);
 error_out:
-        if (ret == -ENOSPC)
-                ret = -ENOBUFS;
        _leave(" = error %d", -ret);
        return ret;
 }
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a69787e7dd96..a6c8c6fe8df9 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -11,6 +11,7 @@
 #include <linux/mount.h>
 #include <linux/file.h>
+#include <linux/ima.h>
 #include "internal.h"
 /*
@@ -40,8 +41,10 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
        _debug("--- monitor %p %lx ---", page, page->flags);
-        if (!PageUptodate(page) && !PageError(page))
+        if (!PageUptodate(page) && !PageError(page)) {
-                dump_stack();
+                /* unlocked, not uptodate and not erroneous? */
+                _debug("page probably truncated");
+        }
        /* remove from the waitqueue */
        list_del(&wait->task_list);
@@ -61,6 +64,84 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
 }
 /*
+ * handle a probably truncated page
+ * - check to see if the page is still relevant and reissue the read if
+ *   possible
+ * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we
+ *   must wait again and 0 if successful
+ */
+static int cachefiles_read_reissue(struct cachefiles_object *object,
+                                   struct cachefiles_one_read *monitor)
+{
+        struct address_space *bmapping = object->backer->d_inode->i_mapping;
+        struct page *backpage = monitor->back_page, *backpage2;
+        int ret;
+        kenter("{ino=%lx},{%lx,%lx}",
+               object->backer->d_inode->i_ino,
+               backpage->index, backpage->flags);
+        /* skip if the page was truncated away completely */
+        if (backpage->mapping != bmapping) {
+                kleave(" = -ENODATA [mapping]");
+                return -ENODATA;
+        }
+        backpage2 = find_get_page(bmapping, backpage->index); /* look the index up again in the backing mapping */
+        if (!backpage2) {
+                kleave(" = -ENODATA [gone]");
+                return -ENODATA;
+        }
+        if (backpage != backpage2) { /* a different page now occupies that index */
+                put_page(backpage2);
+                kleave(" = -ENODATA [different]");
+                return -ENODATA;
+        }
+        /* the page is still there and we already have a ref on it, so we don't
+         * need a second */
+        put_page(backpage2);
+        INIT_LIST_HEAD(&monitor->op_link);
+        add_page_wait_queue(backpage, &monitor->monitor); /* re-arm the monitor before looking at the page state */
+        if (trylock_page(backpage)) {
+                ret = -EIO;
+                if (PageError(backpage))
+                        goto unlock_discard;
+                ret = 0;
+                if (PageUptodate(backpage))
+                        goto unlock_discard;
+                kdebug("reissue read");
+                ret = bmapping->a_ops->readpage(NULL, backpage); /* NULL is passed as the file argument */
+                if (ret < 0)
+                        goto unlock_discard;
+        }
+        /* but the page may have been read before the monitor was installed, so
+         * the monitor may miss the event - so we have to ensure that we do get
+         * one in such a case */
+        if (trylock_page(backpage)) {
+                _debug("jumpstart %p {%lx}", backpage, backpage->flags);
+                unlock_page(backpage);
+        }
+        /* it'll reappear on the todo list */
+        kleave(" = -EINPROGRESS");
+        return -EINPROGRESS;
+unlock_discard: /* reached with the page locked and ret already set */
+        unlock_page(backpage);
+        spin_lock_irq(&object->work_lock);
+        list_del(&monitor->op_link);
+        spin_unlock_irq(&object->work_lock);
+        kleave(" = %d", ret);
+        return ret;
+}
+/*
 * copy data from backing pages to netfs pages to complete a read operation
 * - driven by FS-Cache's thread pool
 */
@@ -92,20 +173,26 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
                _debug("- copy {%lu}", monitor->back_page->index);
-                error = -EIO;
+        recheck:
                if (PageUptodate(monitor->back_page)) {
                        copy_highpage(monitor->netfs_page, monitor->back_page);
                        pagevec_add(&pagevec, monitor->netfs_page);
                        fscache_mark_pages_cached(monitor->op, &pagevec);
                        error = 0;
-                }
+                } else if (!PageError(monitor->back_page)) {
+                        /* the page has probably been truncated */
-                if (error)
+                        error = cachefiles_read_reissue(object, monitor);
+                        if (error == -EINPROGRESS)
+                                goto next;
+                        goto recheck;
+                } else {
                        cachefiles_io_error_obj(
                                object,
                                "Readpage failed on backing file %lx",
                                (unsigned long) monitor->back_page->flags);
+                        error = -EIO;
+                }
                page_cache_release(monitor->back_page);
@@ -114,6 +201,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
                fscache_put_retrieval(op);
                kfree(monitor);
+        next:
                /* let the thread pool have some air occasionally */
                max--;
                if (max < 0 || need_resched()) {
@@ -333,7 +421,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
        shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
-        op->op.flags = FSCACHE_OP_FAST;
+        op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+        op->op.flags |= FSCACHE_OP_FAST;
        op->op.processor = cachefiles_read_copier;
        pagevec_init(&pagevec, 0);
@@ -639,7 +728,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
        pagevec_init(&pagevec, 0);
-        op->op.flags = FSCACHE_OP_FAST;
+        op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+        op->op.flags |= FSCACHE_OP_FAST;
        op->op.processor = cachefiles_read_copier;
        INIT_LIST_HEAD(&backpages);
@@ -801,7 +891,8 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
        struct cachefiles_cache *cache;
        mm_segment_t old_fs;
        struct file *file;
-        loff_t pos;
+        loff_t pos, eof;
+        size_t len;
        void *data;
        int ret;
@@ -832,18 +923,33 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
        } else {
+                ima_counts_get(file);
                ret = -EIO;
                if (file->f_op->write) {
                        pos = (loff_t) page->index << PAGE_SHIFT;
+                        /* we mustn't write more data than we have, so we have
+                         * to beware of a partial page at EOF */
+                        eof = object->fscache.store_limit_l;
+                        len = PAGE_SIZE;
+                        if (eof & ~PAGE_MASK) {
+                                ASSERTCMP(pos, <, eof);
+                                if (eof - pos < PAGE_SIZE) {
+                                        _debug("cut short %llx to %llx",
+                                               pos, eof);
+                                        len = eof - pos;
+                                        ASSERTCMP(pos + len, ==, eof);
+                                }
+                        }
                        data = kmap(page);
                        old_fs = get_fs();
                        set_fs(KERNEL_DS);
                        ret = file->f_op->write(
-                                file, (const void __user *) data, PAGE_SIZE,
+                                file, (const void __user *) data, len, &pos);
-                                &pos);
                        set_fs(old_fs);
                        kunmap(page);
-                        if (ret != PAGE_SIZE)
+                        if (ret != len)
                                ret = -EIO;
                }
                fput(file);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 145540a316ab..094ea65afc85 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,12 @@
+Version 1.61
+------------
+Fix append problem to Samba servers (files opened with O_APPEND could
+have duplicated data). Fix oops in cifs_lookup. Workaround problem
+mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
+Disable use of server inode numbers when server only
+partially supports them (e.g. for one server querying inode numbers on
+FindFirst fails but QPathInfo queries work).
 Version 1.60
 -------------
 Fix memory leak in reconnect.  Fix oops in DFS mount error path.
diff --git a/fs/cifs/README b/fs/cifs/README
index 79c1a93400be..a727b7cb075f 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -423,7 +423,7 @@ A partial list of the supported mount options follows:
                source name to use to represent the client netbios machine 
                name when doing the RFC1001 netbios session initialize.
  direct        Do not do inode data caching on files opened on this mount.
-                This precludes mmaping files on this mount. In some cases
+                This precludes mmapping files on this mount. In some cases
                with fast networks and little or no caching benefits on the
                client (e.g. when the application is doing large sequential
                reads bigger than page size without rereading the same data) 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9a5e4f5f3122..29f1da761bbf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1037,7 +1037,7 @@ init_cifs(void)
        if (rc)
                goto out_unregister_key_type;
 #endif
-        rc = slow_work_register_user();
+        rc = slow_work_register_user(THIS_MODULE);
        if (rc)
                goto out_unregister_resolver_key;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039c..4b35f7ec0583 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -39,7 +39,7 @@
 /*
 * MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurently. It also matches the most common
+ * on one socket concurrently. It also matches the most common
 * value of max multiplex returned by servers.  We may
 * eventually want to use the negotiated value (in case
 * future servers can handle more) when we are more confident that
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 2d07f890a842..3877737f96a6 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1227,7 +1227,7 @@ typedef struct smb_com_setattr_rsp {
 /* empty wct response to setattr */
 /*******************************************************/
-/* NT Transact structure defintions follow             */
+/* NT Transact structure definitions follow            */
 /* Currently only ioctl, acl (get security descriptor) */
 /* and notify are implemented                          */
 /*******************************************************/
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 627a60a6c1b1..1f42f772865a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -214,8 +214,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
                posix_flags |= SMB_O_EXCL;
        if (oflags & O_TRUNC)
                posix_flags |= SMB_O_TRUNC;
-        if (oflags & O_APPEND)
-                posix_flags |= SMB_O_APPEND;
        if (oflags & O_SYNC)
                posix_flags |= SMB_O_SYNC;
        if (oflags & O_DIRECTORY)
@@ -643,9 +641,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
         * O_EXCL: optimize away the lookup, but don't hash the dentry. Let
         * the VFS handle the create.
         */
-        if (nd->flags & LOOKUP_EXCL) {
+        if (nd && (nd->flags & LOOKUP_EXCL)) {
                d_instantiate(direntry, NULL);
-                return 0;
+                return NULL;
        }
        /* can not grab the rename sem here since it would
@@ -675,7 +673,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
         * reduction in network traffic in the other paths.
         */
        if (pTcon->unix_ext) {
-                if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
+                if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
                     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
                     (nd->intent.open.flags & O_CREAT)) {
                        rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cababd8a52df..cf18ee765590 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -914,8 +914,8 @@ undo_setattr:
 /*
 * If dentry->d_inode is null (usually meaning the cached dentry
 * is a negative dentry) then we would attempt a standard SMB delete, but
- * if that fails we can not attempt the fall back mechanisms on EACESS
+ * if that fails we can not attempt the fall back mechanisms on EACCES
- * but will return the EACESS to the caller.  Note that the VFS does not call
+ * but will return the EACCES to the caller. Note that the VFS does not call
 * unlink on negative dentries currently.
 */
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 224a1f478966..b6b6dcb500bf 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -371,7 +371,7 @@ E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
        smbhash(p24 + 16, c8, p21 + 14, 1);
 }
-#if 0 /* currently unsued */
+#if 0 /* currently unused */
 static void
 D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 43c96ce29614..c6405ce3c50e 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -17,28 +17,25 @@ static struct ctl_table_header *fs_table_header;
 static ctl_table coda_table[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "timeout",
                .data           = &coda_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "hard",
                .data           = &coda_hard,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "fake_statfs",
                .data           = &coda_fake_statfs,
                .maxlen         = sizeof(int),
                .mode           = 0600,
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {}
 };
@@ -46,7 +43,6 @@ static ctl_table coda_table[] = {
 #ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "coda",
                .mode           = 0555,
                .child          = coda_table
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d84e7058c298..2346895b3a77 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -246,428 +246,6 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned
        return err;
 }
-#ifdef CONFIG_NET
-static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct compat_timeval __user *up = compat_ptr(arg);
-        struct timeval ktv;
-        mm_segment_t old_fs = get_fs();
-        int err;
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long)&ktv);
-        set_fs(old_fs);
-        if(!err) {
-                err = put_user(ktv.tv_sec, &up->tv_sec);
-                err |= __put_user(ktv.tv_usec, &up->tv_usec);
-        }
-        return err;
-}
-static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct compat_timespec __user *up = compat_ptr(arg);
-        struct timespec kts;
-        mm_segment_t old_fs = get_fs();
-        int err;
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long)&kts);
-        set_fs(old_fs);
-        if (!err) {
-                err = put_user(kts.tv_sec, &up->tv_sec);
-                err |= __put_user(kts.tv_nsec, &up->tv_nsec);
-        }
-        return err;
-}
-struct ifmap32 {
-        compat_ulong_t mem_start;
-        compat_ulong_t mem_end;
-        unsigned short base_addr;
-        unsigned char irq;
-        unsigned char dma;
-        unsigned char port;
-};
-struct ifreq32 {
-#define IFHWADDRLEN     6
-#define IFNAMSIZ        16
-        union {
-                char    ifrn_name[IFNAMSIZ];            /* if name, e.g. "en0" */
-        } ifr_ifrn;
-        union {
-                struct  sockaddr ifru_addr;
-                struct  sockaddr ifru_dstaddr;
-                struct  sockaddr ifru_broadaddr;
-                struct  sockaddr ifru_netmask;
-                struct  sockaddr ifru_hwaddr;
-                short   ifru_flags;
-                compat_int_t     ifru_ivalue;
-                compat_int_t     ifru_mtu;
-                struct  ifmap32 ifru_map;
-                char    ifru_slave[IFNAMSIZ];   /* Just fits the size */
-                char    ifru_newname[IFNAMSIZ];
-                compat_caddr_t ifru_data;
-            /* XXXX? ifru_settings should be here */
-        } ifr_ifru;
-};
-struct ifconf32 {
-        compat_int_t    ifc_len;                        /* size of buffer       */
-        compat_caddr_t  ifcbuf;
-};
-static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq __user *uifr;
-        int err;
-        uifr = compat_alloc_user_space(sizeof(struct ifreq));
-        if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)))
-                return -EFAULT;
-        err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
-        if (err)
-                return err;
-        if (copy_in_user(compat_ptr(arg), uifr, sizeof(struct ifreq32)))
-                return -EFAULT;
-        return 0;
-}
-static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifconf32 ifc32;
-        struct ifconf ifc;
-        struct ifconf __user *uifc;
-        struct ifreq32 __user *ifr32;
-        struct ifreq __user *ifr;
-        unsigned int i, j;
-        int err;
-        if (copy_from_user(&ifc32, compat_ptr(arg), sizeof(struct ifconf32)))
-                return -EFAULT;
-        if (ifc32.ifcbuf == 0) {
-                ifc32.ifc_len = 0;
-                ifc.ifc_len = 0;
-                ifc.ifc_req = NULL;
-                uifc = compat_alloc_user_space(sizeof(struct ifconf));
-        } else {
-                size_t len =((ifc32.ifc_len / sizeof (struct ifreq32)) + 1) *
-                        sizeof (struct ifreq);
-                uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
-                ifc.ifc_len = len;
-                ifr = ifc.ifc_req = (void __user *)(uifc + 1);
-                ifr32 = compat_ptr(ifc32.ifcbuf);
-                for (i = 0; i < ifc32.ifc_len; i += sizeof (struct ifreq32)) {
-                        if (copy_in_user(ifr, ifr32, sizeof(struct ifreq32)))
-                                return -EFAULT;
-                        ifr++;
-                        ifr32++; 
-                }
-        }
-        if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
-                return -EFAULT;
-        err = sys_ioctl (fd, SIOCGIFCONF, (unsigned long)uifc); 
-        if (err)
-                return err;
-        if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) 
-                return -EFAULT;
-        ifr = ifc.ifc_req;
-        ifr32 = compat_ptr(ifc32.ifcbuf);
-        for (i = 0, j = 0;
-             i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
-             i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
-                if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
-                        return -EFAULT;
-                ifr32++;
-                ifr++;
-        }
-        if (ifc32.ifcbuf == 0) {
-                /* Translate from 64-bit structure multiple to
-                 * a 32-bit one.
-                 */
-                i = ifc.ifc_len;
-                i = ((i / sizeof(struct ifreq)) * sizeof(struct ifreq32));
-                ifc32.ifc_len = i;
-        } else {
-                ifc32.ifc_len = i;
-        }
-        if (copy_to_user(compat_ptr(arg), &ifc32, sizeof(struct ifconf32)))
-                return -EFAULT;
-        return 0;
-}
-static int ethtool_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq __user *ifr;
-        struct ifreq32 __user *ifr32;
-        u32 data;
-        void __user *datap;
-        
-        ifr = compat_alloc_user_space(sizeof(*ifr));
-        ifr32 = compat_ptr(arg);
-        if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
-                return -EFAULT;
-        if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-                return -EFAULT;
-        datap = compat_ptr(data);
-        if (put_user(datap, &ifr->ifr_ifru.ifru_data))
-                return -EFAULT;
-        return sys_ioctl(fd, cmd, (unsigned long) ifr);
-}
-static int bond_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq kifr;
-        struct ifreq __user *uifr;
-        struct ifreq32 __user *ifr32 = compat_ptr(arg);
-        mm_segment_t old_fs;
-        int err;
-        u32 data;
-        void __user *datap;
-        switch (cmd) {
-        case SIOCBONDENSLAVE:
-        case SIOCBONDRELEASE:
-        case SIOCBONDSETHWADDR:
-        case SIOCBONDCHANGEACTIVE:
-                if (copy_from_user(&kifr, ifr32, sizeof(struct ifreq32)))
-                        return -EFAULT;
-                old_fs = get_fs();
-                set_fs (KERNEL_DS);
-                err = sys_ioctl (fd, cmd, (unsigned long)&kifr);
-                set_fs (old_fs);
-                return err;
-        case SIOCBONDSLAVEINFOQUERY:
-        case SIOCBONDINFOQUERY:
-                uifr = compat_alloc_user_space(sizeof(*uifr));
-                if (copy_in_user(&uifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
-                        return -EFAULT;
-                if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-                        return -EFAULT;
-                datap = compat_ptr(data);
-                if (put_user(datap, &uifr->ifr_ifru.ifru_data))
-                        return -EFAULT;
-                return sys_ioctl (fd, cmd, (unsigned long)uifr);
-        default:
-                return -EINVAL;
-        };
-}
-static int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq __user *u_ifreq64;
-        struct ifreq32 __user *u_ifreq32 = compat_ptr(arg);
-        char tmp_buf[IFNAMSIZ];
-        void __user *data64;
-        u32 data32;
-        if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
-                           IFNAMSIZ))
-                return -EFAULT;
-        if (__get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
-                return -EFAULT;
-        data64 = compat_ptr(data32);
-        u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
-        /* Don't check these user accesses, just let that get trapped
-         * in the ioctl handler instead.
-         */
-        if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
-                         IFNAMSIZ))
-                return -EFAULT;
-        if (__put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
-                return -EFAULT;
-        return sys_ioctl(fd, cmd, (unsigned long) u_ifreq64);
-}
-static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq ifr;
-        struct ifreq32 __user *uifr32;
-        struct ifmap32 __user *uifmap32;
-        mm_segment_t old_fs;
-        int err;
-        
-        uifr32 = compat_ptr(arg);
-        uifmap32 = &uifr32->ifr_ifru.ifru_map;
-        switch (cmd) {
-        case SIOCSIFMAP:
-                err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
-                err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-                err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-                err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-                err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq);
-                err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma);
-                err |= __get_user(ifr.ifr_map.port, &uifmap32->port);
-                if (err)
-                        return -EFAULT;
-                break;
-        case SIOCSHWTSTAMP:
-                if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
-                        return -EFAULT;
-                ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
-                break;
-        default:
-                if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
-                        return -EFAULT;
-                break;
-        }
-        old_fs = get_fs();
-        set_fs (KERNEL_DS);
-        err = sys_ioctl (fd, cmd, (unsigned long)&ifr);
-        set_fs (old_fs);
-        if (!err) {
-                switch (cmd) {
-                /* TUNSETIFF is defined as _IOW, it should be _IORW
-                 * as the data is copied back to user space, but that
-                 * cannot be fixed without breaking all existing apps.
-                 */
-                case TUNSETIFF:
-                case TUNGETIFF:
-                case SIOCGIFFLAGS:
-                case SIOCGIFMETRIC:
-                case SIOCGIFMTU:
-                case SIOCGIFMEM:
-                case SIOCGIFHWADDR:
-                case SIOCGIFINDEX:
-                case SIOCGIFADDR:
-                case SIOCGIFBRDADDR:
-                case SIOCGIFDSTADDR:
-                case SIOCGIFNETMASK:
-                case SIOCGIFTXQLEN:
-                        if (copy_to_user(uifr32, &ifr, sizeof(*uifr32)))
-                                return -EFAULT;
-                        break;
-                case SIOCGIFMAP:
-                        err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
-                        err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-                        err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-                        err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-                        err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq);
-                        err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma);
-                        err |= __put_user(ifr.ifr_map.port, &uifmap32->port);
-                        if (err)
-                                err = -EFAULT;
-                        break;
-                }
-        }
-        return err;
-}
-struct rtentry32 {
-        u32             rt_pad1;
-        struct sockaddr rt_dst;         /* target address               */
-        struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
-        struct sockaddr rt_genmask;     /* target network mask (IP)     */
-        unsigned short  rt_flags;
-        short           rt_pad2;
-        u32             rt_pad3;
-        unsigned char   rt_tos;
-        unsigned char   rt_class;
-        short           rt_pad4;
-        short           rt_metric;      /* +1 for binary compatibility! */
-        /* char * */ u32 rt_dev;        /* forcing the device at add    */
-        u32             rt_mtu;         /* per route MTU/Window         */
-        u32             rt_window;      /* Window clamping              */
-        unsigned short  rt_irtt;        /* Initial RTT                  */
-};
-struct in6_rtmsg32 {
-        struct in6_addr         rtmsg_dst;
-        struct in6_addr         rtmsg_src;
-        struct in6_addr         rtmsg_gateway;
-        u32                     rtmsg_type;
-        u16                     rtmsg_dst_len;
-        u16                     rtmsg_src_len;
-        u32                     rtmsg_metric;
-        u32                     rtmsg_info;
-        u32                     rtmsg_flags;
-        s32                     rtmsg_ifindex;
-};
-static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        int ret;
-        void *r = NULL;
-        struct in6_rtmsg r6;
-        struct rtentry r4;
-        char devname[16];
-        u32 rtdev;
-        mm_segment_t old_fs = get_fs();
-        
-        struct socket *mysock = sockfd_lookup(fd, &ret);
-        if (mysock && mysock->sk && mysock->sk->sk_family == AF_INET6) { /* ipv6 */
-                struct in6_rtmsg32 __user *ur6 = compat_ptr(arg);
-                ret = copy_from_user (&r6.rtmsg_dst, &(ur6->rtmsg_dst),
-                        3 * sizeof(struct in6_addr));
-                ret |= __get_user (r6.rtmsg_type, &(ur6->rtmsg_type));
-                ret |= __get_user (r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
-                ret |= __get_user (r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
-                ret |= __get_user (r6.rtmsg_metric, &(ur6->rtmsg_metric));
-                ret |= __get_user (r6.rtmsg_info, &(ur6->rtmsg_info));
-                ret |= __get_user (r6.rtmsg_flags, &(ur6->rtmsg_flags));
-                ret |= __get_user (r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
-                
-                r = (void *) &r6;
-        } else { /* ipv4 */
-                struct rtentry32 __user *ur4 = compat_ptr(arg);
-                ret = copy_from_user (&r4.rt_dst, &(ur4->rt_dst),
-                                        3 * sizeof(struct sockaddr));
-                ret |= __get_user (r4.rt_flags, &(ur4->rt_flags));
-                ret |= __get_user (r4.rt_metric, &(ur4->rt_metric));
-                ret |= __get_user (r4.rt_mtu, &(ur4->rt_mtu));
-                ret |= __get_user (r4.rt_window, &(ur4->rt_window));
-                ret |= __get_user (r4.rt_irtt, &(ur4->rt_irtt));
-                ret |= __get_user (rtdev, &(ur4->rt_dev));
-                if (rtdev) {
-                        ret |= copy_from_user (devname, compat_ptr(rtdev), 15);
-                        r4.rt_dev = devname; devname[15] = 0;
-                } else
-                        r4.rt_dev = NULL;
-                r = (void *) &r4;
-        }
-        if (ret) {
-                ret = -EFAULT;
-                goto out;
-        }
-        set_fs (KERNEL_DS);
-        ret = sys_ioctl (fd, cmd, (unsigned long) r);
-        set_fs (old_fs);
-out:
-        if (mysock)
-                sockfd_put(mysock);
-        return ret;
-}
-#endif
 #ifdef CONFIG_BLOCK
 typedef struct sg_io_hdr32 {
        compat_int_t interface_id;      /* [i] 'S' for SCSI generic (required) */
@@ -1212,170 +790,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a
        return err;
 }
-struct atmif_sioc32 {
-        compat_int_t    number;
-        compat_int_t    length;
-        compat_caddr_t  arg;
-};
-struct atm_iobuf32 {
-        compat_int_t    length;
-        compat_caddr_t  buffer;
-};
-#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct atmif_sioc32)
-#define ATM_GETNAMES32    _IOW('a', ATMIOC_ITF+3, struct atm_iobuf32)
-#define ATM_GETTYPE32     _IOW('a', ATMIOC_ITF+4, struct atmif_sioc32)
-#define ATM_GETESI32      _IOW('a', ATMIOC_ITF+5, struct atmif_sioc32)
-#define ATM_GETADDR32     _IOW('a', ATMIOC_ITF+6, struct atmif_sioc32)
-#define ATM_RSTADDR32     _IOW('a', ATMIOC_ITF+7, struct atmif_sioc32)
-#define ATM_ADDADDR32     _IOW('a', ATMIOC_ITF+8, struct atmif_sioc32)
-#define ATM_DELADDR32     _IOW('a', ATMIOC_ITF+9, struct atmif_sioc32)
-#define ATM_GETCIRANGE32  _IOW('a', ATMIOC_ITF+10, struct atmif_sioc32)
-#define ATM_SETCIRANGE32  _IOW('a', ATMIOC_ITF+11, struct atmif_sioc32)
-#define ATM_SETESI32      _IOW('a', ATMIOC_ITF+12, struct atmif_sioc32)
-#define ATM_SETESIF32     _IOW('a', ATMIOC_ITF+13, struct atmif_sioc32)
-#define ATM_GETSTAT32     _IOW('a', ATMIOC_SARCOM+0, struct atmif_sioc32)
-#define ATM_GETSTATZ32    _IOW('a', ATMIOC_SARCOM+1, struct atmif_sioc32)
-#define ATM_GETLOOP32     _IOW('a', ATMIOC_SARCOM+2, struct atmif_sioc32)
-#define ATM_SETLOOP32     _IOW('a', ATMIOC_SARCOM+3, struct atmif_sioc32)
-#define ATM_QUERYLOOP32   _IOW('a', ATMIOC_SARCOM+4, struct atmif_sioc32)
-static struct {
-        unsigned int cmd32;
-        unsigned int cmd;
-} atm_ioctl_map[] = {
-        { ATM_GETLINKRATE32, ATM_GETLINKRATE },
-        { ATM_GETNAMES32,    ATM_GETNAMES },
-        { ATM_GETTYPE32,     ATM_GETTYPE },
-        { ATM_GETESI32,      ATM_GETESI },
-        { ATM_GETADDR32,     ATM_GETADDR },
-        { ATM_RSTADDR32,     ATM_RSTADDR },
-        { ATM_ADDADDR32,     ATM_ADDADDR },
-        { ATM_DELADDR32,     ATM_DELADDR },
-        { ATM_GETCIRANGE32,  ATM_GETCIRANGE },
-        { ATM_SETCIRANGE32,  ATM_SETCIRANGE },
-        { ATM_SETESI32,      ATM_SETESI },
-        { ATM_SETESIF32,     ATM_SETESIF },
-        { ATM_GETSTAT32,     ATM_GETSTAT },
-        { ATM_GETSTATZ32,    ATM_GETSTATZ },
-        { ATM_GETLOOP32,     ATM_GETLOOP },
-        { ATM_SETLOOP32,     ATM_SETLOOP },
-        { ATM_QUERYLOOP32,   ATM_QUERYLOOP }
-};
-#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
-static int do_atm_iobuf(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct atm_iobuf   __user *iobuf;
-        struct atm_iobuf32 __user *iobuf32;
-        u32 data;
-        void __user *datap;
-        int len, err;
-        iobuf = compat_alloc_user_space(sizeof(*iobuf));
-        iobuf32 = compat_ptr(arg);
-        if (get_user(len, &iobuf32->length) ||
-            get_user(data, &iobuf32->buffer))
-                return -EFAULT;
-        datap = compat_ptr(data);
-        if (put_user(len, &iobuf->length) ||
-            put_user(datap, &iobuf->buffer))
-                return -EFAULT;
-        err = sys_ioctl(fd, cmd, (unsigned long)iobuf);
-        if (!err) {
-                if (copy_in_user(&iobuf32->length, &iobuf->length,
-                                 sizeof(int)))
-                        err = -EFAULT;
-        }
-        return err;
-}
-static int do_atmif_sioc(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct atmif_sioc   __user *sioc;
-        struct atmif_sioc32 __user *sioc32;
-        u32 data;
-        void __user *datap;
-        int err;
-        
-        sioc = compat_alloc_user_space(sizeof(*sioc));
-        sioc32 = compat_ptr(arg);
-        if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
-            get_user(data, &sioc32->arg))
-                return -EFAULT;
-        datap = compat_ptr(data);
-        if (put_user(datap, &sioc->arg))
-                return -EFAULT;
-        err = sys_ioctl(fd, cmd, (unsigned long) sioc);
-        if (!err) {
-                if (copy_in_user(&sioc32->length, &sioc->length,
-                                 sizeof(int)))
-                        err = -EFAULT;
-        }
-        return err;
-}
-static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg)
-{
-        int i;
-        unsigned int cmd = 0;
-        
-        switch (cmd32) {
-        case SONET_GETSTAT:
-        case SONET_GETSTATZ:
-        case SONET_GETDIAG:
-        case SONET_SETDIAG:
-        case SONET_CLRDIAG:
-        case SONET_SETFRAMING:
-        case SONET_GETFRAMING:
-        case SONET_GETFRSENSE:
-                return do_atmif_sioc(fd, cmd32, arg);
-        }
-        for (i = 0; i < NR_ATM_IOCTL; i++) {
-                if (cmd32 == atm_ioctl_map[i].cmd32) {
-                        cmd = atm_ioctl_map[i].cmd;
-                        break;
-                }
-        }
-        if (i == NR_ATM_IOCTL)
-                return -EINVAL;
-        
-        switch (cmd) {
-        case ATM_GETNAMES:
-                return do_atm_iobuf(fd, cmd, arg);
-            
-        case ATM_GETLINKRATE:
-        case ATM_GETTYPE:
-        case ATM_GETESI:
-        case ATM_GETADDR:
-        case ATM_RSTADDR:
-        case ATM_ADDADDR:
-        case ATM_DELADDR:
-        case ATM_GETCIRANGE:
-        case ATM_SETCIRANGE:
-        case ATM_SETESI:
-        case ATM_SETESIF:
-        case ATM_GETSTAT:
-        case ATM_GETSTATZ:
-        case ATM_GETLOOP:
-        case ATM_SETLOOP:
-        case ATM_QUERYLOOP:
-                return do_atmif_sioc(fd, cmd, arg);
-        }
-        return -EINVAL;
-}
 static __used int
 ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
@@ -1718,21 +1132,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
        return sys_ioctl(fd, cmd, (unsigned long)tdata);
 }
-/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
- * for some operations; this forces use of the newer bridge-utils that
- * use compatible ioctls
- */
-static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        u32 tmp;
-        if (get_user(tmp, (u32 __user *) arg))
-                return -EFAULT;
-        if (tmp == BRCTL_GET_VERSION)
-                return BRCTL_VERSION + 1;
-        return -EINVAL;
-}
 #define RTC_IRQP_READ32         _IOR('p', 0x0b, compat_ulong_t)
 #define RTC_IRQP_SET32          _IOW('p', 0x0c, compat_ulong_t)
 #define RTC_EPOCH_READ32        _IOR('p', 0x0d, compat_ulong_t)
@@ -1979,18 +1378,6 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
 COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
 #endif
-/* Big T */
-COMPATIBLE_IOCTL(TUNSETNOCSUM)
-COMPATIBLE_IOCTL(TUNSETDEBUG)
-COMPATIBLE_IOCTL(TUNSETPERSIST)
-COMPATIBLE_IOCTL(TUNSETOWNER)
-COMPATIBLE_IOCTL(TUNSETLINK)
-COMPATIBLE_IOCTL(TUNSETGROUP)
-COMPATIBLE_IOCTL(TUNGETFEATURES)
-COMPATIBLE_IOCTL(TUNSETOFFLOAD)
-COMPATIBLE_IOCTL(TUNSETTXFILTER)
-COMPATIBLE_IOCTL(TUNGETSNDBUF)
-COMPATIBLE_IOCTL(TUNSETSNDBUF)
 /* Big V */
 COMPATIBLE_IOCTL(VT_SETMODE)
 COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2032,30 +1419,6 @@ COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
 COMPATIBLE_IOCTL(MTIOCTOP)
 /* Socket level stuff */
 COMPATIBLE_IOCTL(FIOQSIZE)
-COMPATIBLE_IOCTL(FIOSETOWN)
-COMPATIBLE_IOCTL(SIOCSPGRP)
-COMPATIBLE_IOCTL(FIOGETOWN)
-COMPATIBLE_IOCTL(SIOCGPGRP)
-COMPATIBLE_IOCTL(SIOCATMARK)
-COMPATIBLE_IOCTL(SIOCSIFLINK)
-COMPATIBLE_IOCTL(SIOCSIFENCAP)
-COMPATIBLE_IOCTL(SIOCGIFENCAP)
-COMPATIBLE_IOCTL(SIOCSIFNAME)
-COMPATIBLE_IOCTL(SIOCSARP)
-COMPATIBLE_IOCTL(SIOCGARP)
-COMPATIBLE_IOCTL(SIOCDARP)
-COMPATIBLE_IOCTL(SIOCSRARP)
-COMPATIBLE_IOCTL(SIOCGRARP)
-COMPATIBLE_IOCTL(SIOCDRARP)
-COMPATIBLE_IOCTL(SIOCADDDLCI)
-COMPATIBLE_IOCTL(SIOCDELDLCI)
-COMPATIBLE_IOCTL(SIOCGMIIPHY)
-COMPATIBLE_IOCTL(SIOCGMIIREG)
-COMPATIBLE_IOCTL(SIOCSMIIREG)
-COMPATIBLE_IOCTL(SIOCGIFVLAN)
-COMPATIBLE_IOCTL(SIOCSIFVLAN)
-COMPATIBLE_IOCTL(SIOCBRADDBR)
-COMPATIBLE_IOCTL(SIOCBRDELBR)
 #ifdef CONFIG_BLOCK
 /* SG stuff */
 COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
@@ -2311,22 +1674,6 @@ COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
 /* SMB ioctls which do not need any translations */
 COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
-/* Little a */
-COMPATIBLE_IOCTL(ATMSIGD_CTRL)
-COMPATIBLE_IOCTL(ATMARPD_CTRL)
-COMPATIBLE_IOCTL(ATMLEC_CTRL)
-COMPATIBLE_IOCTL(ATMLEC_MCAST)
-COMPATIBLE_IOCTL(ATMLEC_DATA)
-COMPATIBLE_IOCTL(ATM_SETSC)
-COMPATIBLE_IOCTL(SIOCSIFATMTCP)
-COMPATIBLE_IOCTL(SIOCMKCLIP)
-COMPATIBLE_IOCTL(ATMARP_MKIP)
-COMPATIBLE_IOCTL(ATMARP_SETENTRY)
-COMPATIBLE_IOCTL(ATMARP_ENCAP)
-COMPATIBLE_IOCTL(ATMTCP_CREATE)
-COMPATIBLE_IOCTL(ATMTCP_REMOVE)
-COMPATIBLE_IOCTL(ATMMPC_CTRL)
-COMPATIBLE_IOCTL(ATMMPC_DATA)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -2532,63 +1879,6 @@ COMPATIBLE_IOCTL(JSIOCGBUTTONS)
 COMPATIBLE_IOCTL(JSIOCGNAME(0))
 /* now things that need handlers */
-#ifdef CONFIG_NET
-HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
-HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
-HANDLE_IOCTL(SIOCGIFFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMETRIC, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMETRIC, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMTU, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMTU, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMEM, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMEM, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFHWADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFHWADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCADDMULTI, dev_ifsioc)
-HANDLE_IOCTL(SIOCDELMULTI, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFINDEX, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMAP, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
-HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
-/* ioctls used by appletalk ddp.c */
-HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCDIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSARP, dev_ifsioc)
-HANDLE_IOCTL(SIOCDARP, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFBRDADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFBRDADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFDSTADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFDSTADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFNETMASK, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFNETMASK, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFPFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
-HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
-HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
-HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
-HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDSETHWADDR, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDSLAVEINFOQUERY, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDINFOQUERY, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDCHANGEACTIVE, bond_ioctl)
-HANDLE_IOCTL(SIOCADDRT, routing_ioctl)
-HANDLE_IOCTL(SIOCDELRT, routing_ioctl)
-HANDLE_IOCTL(SIOCBRADDIF, dev_ifsioc)
-HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
-/* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
-HANDLE_IOCTL(SIOCRTMSG, ret_einval)
-HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
-HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
-#endif
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
 HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
@@ -2613,31 +1903,6 @@ HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl)
 /* One SMB ioctl needs translations. */
 #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
 HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
-HANDLE_IOCTL(ATM_GETLINKRATE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETNAMES32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETTYPE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETESI32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_RSTADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_ADDADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_DELADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETCIRANGE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETCIRANGE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETESI32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETESIF32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETSTAT32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETSTATZ32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_QUERYLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETSTAT, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETSTATZ, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_SETDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_CLRDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
 #ifdef CONFIG_BLOCK
 /* loop */
@@ -2655,7 +1920,7 @@ COMPATIBLE_IOCTL(TIOCSLTC)
 #endif
 #ifdef TIOCSTART
 /*
- * For these two we have defintions in ioctls.h and/or termios.h on
+ * For these two we have definitions in ioctls.h and/or termios.h on
 * some architectures but no actual implemention.  Some applications
 * like bash call them if they are defined in the headers, so we provide
 * entries here to avoid syslog message spew.
@@ -2672,11 +1937,7 @@ COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
 HANDLE_IOCTL(I2C_FUNCS, w_long)
 HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
 HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
-/* bridge */
-HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
-HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
 /* Not implemented in the native kernel */
-IGNORE_IOCTL(SIOCGIFCOUNT)
 HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
 HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
@@ -2831,12 +2092,6 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
                        goto found_handler;
        }
-#ifdef CONFIG_NET
-        if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) &&
-            cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
-                error = siocdevprivate_ioctl(fd, cmd, arg);
-        } else
-#endif
        {
                static int count;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d22438ef7674..0d23b52dd22c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -184,7 +184,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 /**
 * debugfs_create_file - create a file in the debugfs filesystem
 * @name: a pointer to a string containing the name of the file to create.
- * @mode: the permission that the file should have
+ * @mode: the permission that the file should have.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *          directory dentry if set.  If this paramater is NULL, then the
 *          file will be created in the root of the debugfs filesystem.
@@ -195,8 +195,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 *        this file.
 *
 * This is the basic "create a file" function for debugfs.  It allows for a
- * wide range of flexibility in createing a file, or a directory (if you
+ * wide range of flexibility in creating a file, or a directory (if you want
- * want to create a directory, the debugfs_create_dir() function is
+ * to create a directory, the debugfs_create_dir() function is
 * recommended to be used instead.)
 *
 * This function will return a pointer to a dentry if it succeeds.  This
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..b912270942fa 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
        if (dio->bio)
                dio_bio_submit(dio);
-        /* All IO is now issued, send it on its way */
-        blk_run_address_space(inode->i_mapping);
        /*
         * It is possible that, we return short IO due to end of file.
         * In that case, we need to release all the pages we got hold on.
@@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
            ((rw & READ) || (dio->result == dio->size)))
                ret = -EIOCBQUEUED;
-        if (ret != -EIOCBQUEUED)
+        if (ret != -EIOCBQUEUED) {
+                /* All IO is now issued, send it on its way */
+                blk_run_address_space(inode->i_mapping);
                dio_await_completion(dio);
+        }
        /*
         * Sync will always be dropping the final ref and completing the
@@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        int acquire_i_mutex = 0;
        if (rw & WRITE)
-                rw = WRITE_ODIRECT;
+                rw = WRITE_ODIRECT_PLUG;
        if (bdev)
                bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 2863deb178e2..b5f89aef3b29 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -143,7 +143,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(dlm_posix_lock);
-/* Returns failure iff a succesful lock operation should be canceled */
+/* Returns failure iff a successful lock operation should be canceled */
 static int dlm_plock_callback(struct plock_op *op)
 {
        struct file *file;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c063420..366c503f9657 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -251,10 +251,10 @@ ctl_table epoll_table[] = {
                .data           = &max_user_watches,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
        },
-        { .ctl_name = 0 }
+        { }
 };
 #endif /* CONFIG_SYSCTL */
diff --git a/fs/exec.c b/fs/exec.c
index ba112bd4a339..c0c636e34f60 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,6 @@
 #include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
@@ -1209,9 +1208,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
-        retval = ima_bprm_check(bprm);
-        if (retval)
-                return retval;
        /* kernel module loader fixup */
        /* so we don't try to load run modprobe in kernel space. */
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index cc2d22db119c..2d0f757fda3e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
-exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index c6718e4817fe..b1b178e61718 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -49,6 +49,7 @@
 #define EXOFS_MIN_PID   0x10000 /* Smallest partition ID */
 #define EXOFS_OBJ_OFF   0x10000 /* offset for objects */
 #define EXOFS_SUPER_ID  0x10000 /* object ID for on-disk superblock */
+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
 #define EXOFS_ROOT_ID   0x10002 /* object ID for root directory */
 /* exofs Application specific page/attribute */
@@ -78,17 +79,67 @@ enum {
 #define EXOFS_SUPER_MAGIC       0x5DF5
 /*
- * The file system control block - stored in an object's data (mainly, the one
+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
- * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
+ * This is where the in-memory superblock is stored on disk.
- * on disk.  Right now it just has a magic value, which is basically a sanity
- * check on our ability to communicate with the object store.
 */
+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
 struct exofs_fscb {
        __le64  s_nextid;       /* Highest object ID used */
-        __le32  s_numfiles;     /* Number of files on fs */
+        __le64  s_numfiles;     /* Number of files on fs */
+        __le32  s_version;      /* == EXOFS_FSCB_VER */
        __le16  s_magic;        /* Magic signature */
        __le16  s_newfs;        /* Non-zero if this is a new fs */
-};
+        /* From here on it's a static part, only written by mkexofs */
+        __le64  s_dev_table_oid;   /* Reserved, not used */
+        __le64  s_dev_table_count; /* == 0 means no dev_table */
+} __packed;
+/*
+ * Describes the raid used in the FS. It is part of the device table.
+ * This here is taken from the pNFS-objects definition. In exofs we
+ * use one raid policy throughout the filesystem. (NOTE: the funny
+ * alignment at the beginning; we take care of it in exofs_device_table.)
+ */
+struct exofs_dt_data_map {
+        __le32  cb_num_comps;
+        __le64  cb_stripe_unit;
+        __le32  cb_group_width;
+        __le32  cb_group_depth;
+        __le32  cb_mirror_cnt;
+        __le32  cb_raid_algorithm;
+} __packed;
+/*
+ * This is an osd device information descriptor. It is a single entry in
+ * the exofs device table. It describes an osd target lun which
+ * contains data belonging to this FS. (Same partition_id on all devices)
+ */
+struct exofs_dt_device_info {
+        __le32  systemid_len;
+        u8      systemid[OSD_SYSTEMID_LEN];
+        __le64  long_name_offset;       /* If !0 then offset-in-file */
+        __le32  osdname_len;            /* */
+        u8      osdname[44];            /* Embedded, usually an ASCII uuid */
+} __packed;
+/*
+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
+ * It contains the raid used for this multi-device FS and an array of
+ * participating devices.
+ */
+struct exofs_device_table {
+        __le32                          dt_version;     /* == EXOFS_DT_VER */
+        struct exofs_dt_data_map        dt_data_map;    /* Raid policy to use */
+        /* Reserved space for future use. Total including this:
+         * (8 * sizeof(le64))
+         */
+        __le64                          __Resurved[4];
+        __le64                          dt_num_devices; /* Array size */
+        struct exofs_dt_device_info     dt_dev_table[]; /* Array of devices */
+} __packed;
 /****************************************************************************
 * inode-related things
@@ -155,22 +206,4 @@ enum {
        (((name_len) + offsetof(struct exofs_dir_entry, name)  + \
          EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
-/*************************
- * function declarations *
- *************************/
-/* osd.c                 */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
-                           const struct osd_obj_id *obj);
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
-static inline int exofs_check_ok(struct osd_request *or)
-{
-        return exofs_check_ok_resid(or, NULL, NULL);
-}
-int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
-int exofs_async_op(struct osd_request *or,
-        osd_req_done_fn *async_done, void *caller_context, u8 *cred);
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 5ec72e020b22..c35fd4623986 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -30,13 +30,17 @@
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
 #include <linux/fs.h>
 #include <linux/time.h>
 #include "common.h"
-#ifndef __EXOFS_H__
+/* FIXME: Remove once pnfs hits mainline
-#define __EXOFS_H__
+ * #include <linux/exportfs/pnfs_osd_xdr.h>
+ */
+#include "pnfs.h"
 #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
@@ -55,7 +59,7 @@
 * our extension to the in-memory superblock
 */
 struct exofs_sb_info {
-        struct osd_dev  *s_dev;                 /* returned by get_osd_dev    */
+        struct exofs_fscb s_fscb;               /* Written often, pre-allocate*/
        osd_id          s_pid;                  /* partition ID of file system*/
        int             s_timeout;              /* timeout for OSD operations */
        uint64_t        s_nextid;               /* highest object ID used     */
@@ -63,7 +67,11 @@ struct exofs_sb_info {
        spinlock_t      s_next_gen_lock;        /* spinlock for gen # update  */
        u32             s_next_generation;      /* next gen # to use          */
        atomic_t        s_curr_pending;         /* number of pending commands */
-        uint8_t         s_cred[OSD_CAP_LEN];    /* all-powerful credential    */
+        uint8_t         s_cred[OSD_CAP_LEN];    /* credential for the fscb    */
+        struct pnfs_osd_data_map data_map;      /* Default raid to use        */
+        unsigned        s_numdevs;              /* Num of devices in array    */
+        struct osd_dev  *s_ods[1];              /* Variable length, minimum 1 */
 };
 /*
@@ -79,6 +87,50 @@ struct exofs_i_info {
        struct inode   vfs_inode;          /* normal in-memory inode          */
 };
+static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
+{
+        return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
+}
+struct exofs_io_state;
+typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
+struct exofs_io_state {
+        struct kref             kref;
+        void                    *private;
+        exofs_io_done_fn        done;
+        struct exofs_sb_info    *sbi;
+        struct osd_obj_id       obj;
+        u8                      *cred;
+        /* Global read/write IO*/
+        loff_t                  offset;
+        unsigned long           length;
+        void                    *kern_buff;
+        struct bio              *bio;
+        /* Attributes */
+        unsigned                in_attr_len;
+        struct osd_attr         *in_attr;
+        unsigned                out_attr_len;
+        struct osd_attr         *out_attr;
+        /* Variable array of size numdevs */
+        unsigned numdevs;
+        struct exofs_per_dev_state {
+                struct osd_request *or;
+                struct bio *bio;
+        } per_dev[];
+};
+static inline unsigned exofs_io_state_size(unsigned numdevs)
+{
+        return sizeof(struct exofs_io_state) +
+                sizeof(struct exofs_per_dev_state) * numdevs;
+}
 /*
 * our inode flags
 */
@@ -130,6 +182,42 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 /*************************
 * function declarations *
 *************************/
+/* ios.c */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+                           const struct osd_obj_id *obj);
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+                    u64 offset, void *p, unsigned length);
+int  exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
+void exofs_put_io_state(struct exofs_io_state *ios);
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
+int exofs_sbi_create(struct exofs_io_state *ios);
+int exofs_sbi_remove(struct exofs_io_state *ios);
+int exofs_sbi_write(struct exofs_io_state *ios);
+int exofs_sbi_read(struct exofs_io_state *ios);
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
+static inline int exofs_oi_write(struct exofs_i_info *oi,
+                                 struct exofs_io_state *ios)
+{
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->cred = oi->i_cred;
+        return exofs_sbi_write(ios);
+}
+static inline int exofs_oi_read(struct exofs_i_info *oi,
+                                struct exofs_io_state *ios)
+{
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->cred = oi->i_cred;
+        return exofs_sbi_read(ios);
+}
 /* inode.c               */
 void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
@@ -169,6 +257,7 @@ extern const struct file_operations exofs_file_operations;
 /* inode.c           */
 extern const struct address_space_operations exofs_aops;
+extern const struct osd_attr g_attr_logical_length;
 /* namei.c           */
 extern const struct inode_operations exofs_dir_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6c10f7476699..698a8636d39c 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,18 @@
 #include "exofs.h"
-#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG2(M...) do {} while (0)
-#  define EXOFS_DEBUG_OBJ_ISIZE 1
-#endif
+enum { BIO_MAX_PAGES_KMALLOC =
+                (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
 struct page_collect {
        struct exofs_sb_info *sbi;
        struct request_queue *req_q;
        struct inode *inode;
        unsigned expected_pages;
+        struct exofs_io_state *ios;
        struct bio *bio;
        unsigned nr_pages;
@@ -54,22 +57,23 @@ struct page_collect {
 };
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
-                struct inode *inode)
+                       struct inode *inode)
 {
        struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
        pcol->sbi = sbi;
-        pcol->req_q = osd_request_queue(sbi->s_dev);
+        /* Create master bios on first Q, later on cloning, each clone will be
+         * allocated on its destination Q
+         */
+        pcol->req_q = osd_request_queue(sbi->s_ods[0]);
        pcol->inode = inode;
        pcol->expected_pages = expected_pages;
+        pcol->ios = NULL;
        pcol->bio = NULL;
        pcol->nr_pages = 0;
        pcol->length = 0;
        pcol->pg_first = -1;
-        EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
-                     expected_pages);
 }
 static void _pcol_reset(struct page_collect *pcol)
@@ -80,35 +84,49 @@ static void _pcol_reset(struct page_collect *pcol)
        pcol->nr_pages = 0;
        pcol->length = 0;
        pcol->pg_first = -1;
-        EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
+        pcol->ios = NULL;
-                     pcol->inode->i_ino, pcol->expected_pages);
        /* this is probably the end of the loop but in writes
         * it might not end here. don't be left with nothing
         */
        if (!pcol->expected_pages)
-                pcol->expected_pages = 128;
+                pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
 }
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-        int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+        int pages = min_t(unsigned, pcol->expected_pages,
+                          BIO_MAX_PAGES_KMALLOC);
+        if (!pcol->ios) { /* First time allocate io_state */
+                int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
+                if (ret)
+                        return ret;
+        }
        for (; pages; pages >>= 1) {
-                pcol->bio = bio_alloc(GFP_KERNEL, pages);
+                pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
                if (likely(pcol->bio))
                        return 0;
        }
-        EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
+        EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
                  pcol->expected_pages);
        return -ENOMEM;
 }
 static void pcol_free(struct page_collect *pcol)
 {
-        bio_put(pcol->bio);
+        if (pcol->bio) {
-        pcol->bio = NULL;
+                bio_put(pcol->bio);
+                pcol->bio = NULL;
+        }
+        if (pcol->ios) {
+                exofs_put_io_state(pcol->ios);
+                pcol->ios = NULL;
+        }
 }
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
@@ -161,22 +179,17 @@ static void update_write_page(struct page *page, int ret)
 /* Called at the end of reads, to optionally unlock pages and update their
 * status.
 */
-static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
+static int __readpages_done(struct page_collect *pcol, bool do_unlock)
-                            bool do_unlock)
 {
        struct bio_vec *bvec;
        int i;
        u64 resid;
        u64 good_bytes;
        u64 length = 0;
-        int ret = exofs_check_ok_resid(or, &resid, NULL);
+        int ret = exofs_check_io(pcol->ios, &resid);
-        osd_end_request(or);
        if (likely(!ret))
                good_bytes = pcol->length;
-        else if (!resid)
-                good_bytes = 0;
        else
                good_bytes = pcol->length - resid;
@@ -198,7 +211,7 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
                else
                        page_stat = ret;
-                EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+                EXOFS_DBGMSG2("    readpages_done(0x%lx, 0x%lx) %s\n",
                          inode->i_ino, page->index,
                          page_stat ? "bad_bytes" : "good_bytes");
@@ -214,13 +227,13 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
 }
 /* callback of async reads */
-static void readpages_done(struct osd_request *or, void *p)
+static void readpages_done(struct exofs_io_state *ios, void *p)
 {
        struct page_collect *pcol = p;
-        __readpages_done(or, pcol, true);
+        __readpages_done(pcol, true);
        atomic_dec(&pcol->sbi->s_curr_pending);
-        kfree(p);
+        kfree(pcol);
 }
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
@@ -238,17 +251,13 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
                unlock_page(page);
        }
-        pcol_free(pcol);
 }
 static int read_exec(struct page_collect *pcol, bool is_sync)
 {
        struct exofs_i_info *oi = exofs_i(pcol->inode);
-        struct osd_obj_id obj = {pcol->sbi->s_pid,
+        struct exofs_io_state *ios = pcol->ios;
-                                        pcol->inode->i_ino + EXOFS_OBJ_OFF};
-        struct osd_request *or = NULL;
        struct page_collect *pcol_copy = NULL;
-        loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
        int ret;
        if (!pcol->bio)
@@ -257,17 +266,13 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        /* see comment in _readpage() about sync reads */
        WARN_ON(is_sync && (pcol->nr_pages != 1));
-        or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+        ios->bio = pcol->bio;
-        if (unlikely(!or)) {
+        ios->length = pcol->length;
-                ret = -ENOMEM;
+        ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
-                goto err;
-        }
-        osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
        if (is_sync) {
-                exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
+                exofs_oi_read(oi, pcol->ios);
-                return __readpages_done(or, pcol, false);
+                return __readpages_done(pcol, false);
        }
        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -277,14 +282,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        }
        *pcol_copy = *pcol;
-        ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+        ios->done = readpages_done;
+        ios->private = pcol_copy;
+        ret = exofs_oi_read(oi, ios);
        if (unlikely(ret))
                goto err;
        atomic_inc(&pcol->sbi->s_curr_pending);
        EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-                  obj.id, _LLU(i_start), pcol->length);
+                  ios->obj.id, _LLU(ios->offset), pcol->length);
        /* pages ownership was passed to pcol_copy */
        _pcol_reset(pcol);
@@ -293,12 +300,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 err:
        if (!is_sync)
                _unlock_pcol_pages(pcol, ret, READ);
-        else /* Pages unlocked by caller in sync mode only free bio */
-                pcol_free(pcol);
+        pcol_free(pcol);
        kfree(pcol_copy);
-        if (or)
-                osd_end_request(or);
        return ret;
 }
@@ -370,12 +375,12 @@ try_again:
        if (len != PAGE_CACHE_SIZE)
                zero_user(page, len, PAGE_CACHE_SIZE - len);
-        EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+        EXOFS_DBGMSG2("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
                     inode->i_ino, page->index, len);
        ret = pcol_add_page(pcol, page, len);
        if (ret) {
-                EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+                EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
                          "this_len=0x%zx nr_pages=%u length=0x%lx\n",
                          page, len, pcol->nr_pages, pcol->length);
@@ -419,9 +424,8 @@ static int _readpage(struct page *page, bool is_sync)
        _pcol_init(&pcol, 1, page->mapping->host);
-        /* readpage_strip might call read_exec(,async) inside at several places
+        /* readpage_strip might call read_exec(,is_sync==false) at several
-         * but this is safe for is_async=0 since read_exec will not do anything
+         * places but not if we have a single page.
-         * when we have a single page.
         */
        ret = readpage_strip(&pcol, page);
        if (ret) {
@@ -440,8 +444,8 @@ static int exofs_readpage(struct file *file, struct page *page)
        return _readpage(page, false);
 }
-/* Callback for osd_write. All writes are asynchronouse */
+/* Callback for osd_write. All writes are asynchronous */
-static void writepages_done(struct osd_request *or, void *p)
+static void writepages_done(struct exofs_io_state *ios, void *p)
 {
        struct page_collect *pcol = p;
        struct bio_vec *bvec;
@@ -449,16 +453,12 @@ static void writepages_done(struct osd_request *or, void *p)
        u64 resid;
        u64  good_bytes;
        u64  length = 0;
+        int ret = exofs_check_io(ios, &resid);
-        int ret = exofs_check_ok_resid(or, NULL, &resid);
-        osd_end_request(or);
        atomic_dec(&pcol->sbi->s_curr_pending);
        if (likely(!ret))
                good_bytes = pcol->length;
-        else if (!resid)
-                good_bytes = 0;
        else
                good_bytes = pcol->length - resid;
@@ -482,7 +482,7 @@ static void writepages_done(struct osd_request *or, void *p)
                update_write_page(page, page_stat);
                unlock_page(page);
-                EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+                EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
                             inode->i_ino, page->index, page_stat);
                length += bvec->bv_len;
@@ -496,23 +496,13 @@ static void writepages_done(struct osd_request *or, void *p)
 static int write_exec(struct page_collect *pcol)
 {
        struct exofs_i_info *oi = exofs_i(pcol->inode);
-        struct osd_obj_id obj = {pcol->sbi->s_pid,
+        struct exofs_io_state *ios = pcol->ios;
-                                        pcol->inode->i_ino + EXOFS_OBJ_OFF};
-        struct osd_request *or = NULL;
        struct page_collect *pcol_copy = NULL;
-        loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
        int ret;
        if (!pcol->bio)
                return 0;
-        or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
-        if (unlikely(!or)) {
-                EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
-                ret = -ENOMEM;
-                goto err;
-        }
        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
        if (!pcol_copy) {
                EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
@@ -523,16 +513,22 @@ static int write_exec(struct page_collect *pcol)
        *pcol_copy = *pcol;
        pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-        osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
-        ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+        ios->bio = pcol_copy->bio;
+        ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
+        ios->length = pcol_copy->length;
+        ios->done = writepages_done;
+        ios->private = pcol_copy;
+        ret = exofs_oi_write(oi, ios);
        if (unlikely(ret)) {
-                EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
+                EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
                goto err;
        }
        atomic_inc(&pcol->sbi->s_curr_pending);
        EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-                  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+                  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
                  pcol->length);
        /* pages ownership was passed to pcol_copy */
        _pcol_reset(pcol);
@@ -540,9 +536,9 @@ static int write_exec(struct page_collect *pcol)
 err:
        _unlock_pcol_pages(pcol, ret, WRITE);
+        pcol_free(pcol);
        kfree(pcol_copy);
-        if (or)
-                osd_end_request(or);
        return ret;
 }
@@ -586,6 +582,9 @@ static int writepage_strip(struct page *page,
                        if (PageError(page))
                                ClearPageError(page);
                        unlock_page(page);
+                        EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
+                                     "outside the limits\n",
+                                     inode->i_ino, page->index);
                        return 0;
                }
        }
@@ -600,6 +599,9 @@ try_again:
                ret = write_exec(pcol);
                if (unlikely(ret))
                        goto fail;
+                EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
+                             inode->i_ino, page->index);
                goto try_again;
        }
@@ -609,7 +611,7 @@ try_again:
                        goto fail;
        }
-        EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+        EXOFS_DBGMSG2("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
                     inode->i_ino, page->index, len);
        ret = pcol_add_page(pcol, page, len);
@@ -634,6 +636,8 @@ try_again:
        return 0;
 fail:
+        EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
+                     inode->i_ino, page->index, ret);
        set_bit(AS_EIO, &page->mapping->flags);
        unlock_page(page);
        return ret;
@@ -652,14 +656,17 @@ static int exofs_writepages(struct address_space *mapping,
                        wbc->range_end >> PAGE_CACHE_SHIFT;
        if (start || end)
-                expected_pages = min(end - start + 1, 32L);
+                expected_pages = end - start + 1;
        else
                expected_pages = mapping->nrpages;
-        EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
+        if (expected_pages < 32L)
-                     " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+                expected_pages = 32L;
+        EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+                     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
                     mapping->host->i_ino, wbc->range_start, wbc->range_end,
-                     mapping->nrpages, start, end);
+                     mapping->nrpages, start, end, expected_pages);
        _pcol_init(&pcol, expected_pages, mapping->host);
@@ -771,19 +778,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock,
 const struct osd_attr g_attr_logical_length = ATTR_DEF(
        OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+static int _do_truncate(struct inode *inode)
+{
+        struct exofs_i_info *oi = exofs_i(inode);
+        loff_t isize = i_size_read(inode);
+        int ret;
+        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+        ret = exofs_oi_truncate(oi, (u64)isize);
+        EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
+        return ret;
+}
 /*
 * Truncate a file to the specified size - all we have to do is set the size
 * attribute.  We make sure the object exists first.
 */
 void exofs_truncate(struct inode *inode)
 {
-        struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
        struct exofs_i_info *oi = exofs_i(inode);
-        struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-        struct osd_request *or;
-        struct osd_attr attr;
-        loff_t isize = i_size_read(inode);
-        __be64 newsize;
        int ret;
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
@@ -793,22 +809,6 @@ void exofs_truncate(struct inode *inode)
                return;
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
-        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-        nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-        if (unlikely(!or)) {
-                EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
-                goto fail;
-        }
-        osd_req_set_attributes(or, &obj);
-        newsize = cpu_to_be64((u64)isize);
-        attr = g_attr_logical_length;
-        attr.val_ptr = &newsize;
-        osd_req_add_set_attr_list(or, &attr, 1);
        /* if we are about to truncate an object, and it hasn't been
         * created yet, wait
@@ -816,8 +816,7 @@ void exofs_truncate(struct inode *inode)
        if (unlikely(wait_obj_created(oi)))
                goto fail;
-        ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+        ret = _do_truncate(inode);
-        osd_end_request(or);
        if (ret)
                goto fail;
@@ -847,65 +846,62 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 /*
 * Read an inode from the OSD, and return it as is.  We also return the size
- * attribute in the 'sanity' argument if we got compiled with debugging turned
+ * attribute in the 'obj_size' argument.
- * on.
 */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-                    struct exofs_fcb *inode, uint64_t *sanity)
+                    struct exofs_fcb *inode, uint64_t *obj_size)
 {
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_request *or;
+        struct osd_attr attrs[2];
-        struct osd_attr attr;
+        struct exofs_io_state *ios;
-        struct osd_obj_id obj = {sbi->s_pid,
-                                 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
        int ret;
-        exofs_make_credential(oi->i_cred, &obj);
+        *obj_size = ~0;
+        ret = exofs_get_io_state(sbi, &ios);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        if (unlikely(ret)) {
-        if (unlikely(!or)) {
+                EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
-                EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
+                return ret;
-                return -ENOMEM;
        }
-        osd_req_get_attributes(or, &obj);
-        /* we need the inode attribute */
+        ios->obj.id = exofs_oi_objno(oi);
-        osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+        exofs_make_credential(oi->i_cred, &ios->obj);
+        ios->cred = oi->i_cred;
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
+        attrs[0] = g_attr_inode_data;
-        /* we get the size attributes to do a sanity check */
+        attrs[1] = g_attr_logical_length;
-        osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+        ios->in_attr = attrs;
-#endif
+        ios->in_attr_len = ARRAY_SIZE(attrs);
-        ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+        ret = exofs_sbi_read(ios);
        if (ret)
                goto out;
-        attr = g_attr_inode_data;
+        ret = extract_attr_from_ios(ios, &attrs[0]);
-        ret = extract_attr_from_req(or, &attr);
        if (ret) {
-                EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+                EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
                goto out;
        }
+        WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
+        memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
-        WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
+        ret = extract_attr_from_ios(ios, &attrs[1]);
-        memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-        attr = g_attr_logical_length;
-        ret = extract_attr_from_req(or, &attr);
        if (ret) {
-                EXOFS_ERR("ERROR: extract attr from or failed\n");
+                EXOFS_ERR("%s: extract_attr of logical_length failed\n",
+                          __func__);
                goto out;
        }
-        *sanity = get_unaligned_be64(attr.val_ptr);
+        *obj_size = get_unaligned_be64(attrs[1].val_ptr);
-#endif
 out:
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        return ret;
 }
+static void __oi_init(struct exofs_i_info *oi)
+{
+        init_waitqueue_head(&oi->i_wq);
+        oi->i_flags = 0;
+}
 /*
 * Fill in an inode read from the OSD and set it up for use
 */
@@ -914,7 +910,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
        struct exofs_i_info *oi;
        struct exofs_fcb fcb;
        struct inode *inode;
-        uint64_t uninitialized_var(sanity);
+        uint64_t obj_size;
        int ret;
        inode = iget_locked(sb, ino);
@@ -923,13 +919,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
        if (!(inode->i_state & I_NEW))
                return inode;
        oi = exofs_i(inode);
+        __oi_init(oi);
        /* read the inode from the osd */
-        ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+        ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
        if (ret)
                goto bad_inode;
-        init_waitqueue_head(&oi->i_wq);
        set_obj_created(oi);
        /* copy stuff from on-disk struct to in-memory struct */
@@ -947,14 +943,12 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
        inode->i_blkbits = EXOFS_BLKSHIFT;
        inode->i_generation = le32_to_cpu(fcb.i_generation);
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
+        if ((inode->i_size != obj_size) &&
-        if ((inode->i_size != sanity) &&
                (!exofs_inode_is_fast_symlink(inode))) {
-                EXOFS_ERR("WARNING: Size of object from inode and "
+                EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
-                          "attributes differ (%lld != %llu)\n",
+                          inode->i_size, _LLU(obj_size));
-                          inode->i_size, _LLU(sanity));
+                /* FIXME: call exofs_inode_recovery() */
        }
-#endif
        oi->i_dir_start_lookup = 0;
@@ -1020,23 +1014,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
 * set the obj_created flag so that other methods know that the object exists on
 * the OSD.
 */
-static void create_done(struct osd_request *or, void *p)
+static void create_done(struct exofs_io_state *ios, void *p)
 {
        struct inode *inode = p;
        struct exofs_i_info *oi = exofs_i(inode);
        struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
        int ret;
-        ret = exofs_check_ok(or);
+        ret = exofs_check_io(ios, NULL);
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        atomic_dec(&sbi->s_curr_pending);
        if (unlikely(ret)) {
                EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-                          _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
+                          _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
-                make_bad_inode(inode);
+                /*TODO: When FS is corrupted creation can fail, object already
-        } else
+                 * exist. Get rid of this asynchronous creation, if exist
-                set_obj_created(oi);
+                 * increment the obj counter and try the next object. Until we
+                 * succeed. All these dangling objects will be made into lost
+                 * files by chkfs.exofs
+                 */
+        }
+        set_obj_created(oi);
        atomic_dec(&inode->i_count);
        wake_up(&oi->i_wq);
@@ -1051,8 +1052,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        struct inode *inode;
        struct exofs_i_info *oi;
        struct exofs_sb_info *sbi;
-        struct osd_request *or;
+        struct exofs_io_state *ios;
-        struct osd_obj_id obj;
        int ret;
        sb = dir->i_sb;
@@ -1061,8 +1061,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
                return ERR_PTR(-ENOMEM);
        oi = exofs_i(inode);
+        __oi_init(oi);
-        init_waitqueue_head(&oi->i_wq);
        set_obj_2bcreated(oi);
        sbi = sb->s_fs_info;
@@ -1089,28 +1089,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        mark_inode_dirty(inode);
-        obj.partition = sbi->s_pid;
+        ret = exofs_get_io_state(sbi, &ios);
-        obj.id = inode->i_ino + EXOFS_OBJ_OFF;
+        if (unlikely(ret)) {
-        exofs_make_credential(oi->i_cred, &obj);
+                EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
+                return ERR_PTR(ret);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-        if (unlikely(!or)) {
-                EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
-                return ERR_PTR(-ENOMEM);
        }
-        osd_req_create_object(or, &obj);
+        ios->obj.id = exofs_oi_objno(oi);
+        exofs_make_credential(oi->i_cred, &ios->obj);
        /* increment the refcount so that the inode will still be around when we
         * reach the callback
         */
        atomic_inc(&inode->i_count);
-        ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+        ios->done = create_done;
+        ios->private = inode;
+        ios->cred = oi->i_cred;
+        ret = exofs_sbi_create(ios);
        if (ret) {
                atomic_dec(&inode->i_count);
-                osd_end_request(or);
+                exofs_put_io_state(ios);
-                return ERR_PTR(-EIO);
+                return ERR_PTR(ret);
        }
        atomic_inc(&sbi->s_curr_pending);
@@ -1128,11 +1128,11 @@ struct updatei_args {
 /*
 * Callback function from exofs_update_inode().
 */
-static void updatei_done(struct osd_request *or, void *p)
+static void updatei_done(struct exofs_io_state *ios, void *p)
 {
        struct updatei_args *args = p;
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        atomic_dec(&args->sbi->s_curr_pending);
@@ -1148,8 +1148,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
        struct exofs_i_info *oi = exofs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+        struct exofs_io_state *ios;
-        struct osd_request *or;
        struct osd_attr attr;
        struct exofs_fcb *fcb;
        struct updatei_args *args;
@@ -1186,18 +1185,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
        } else
                memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ret = exofs_get_io_state(sbi, &ios);
-        if (unlikely(!or)) {
+        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
+                EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
-                ret = -ENOMEM;
                goto free_args;
        }
-        osd_req_set_attributes(or, &obj);
        attr = g_attr_inode_data;
        attr.val_ptr = fcb;
-        osd_req_add_set_attr_list(or, &attr, 1);
+        ios->out_attr_len = 1;
+        ios->out_attr = &attr;
        if (!obj_created(oi)) {
                EXOFS_DBGMSG("!obj_created\n");
@@ -1206,22 +1203,19 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
                EXOFS_DBGMSG("wait_event done\n");
        }
-        if (do_sync) {
+        if (!do_sync) {
-                ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-                osd_end_request(or);
-                goto free_args;
-        } else {
                args->sbi = sbi;
+                ios->done = updatei_done;
+                ios->private = args;
+        }
-                ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
+        ret = exofs_oi_write(oi, ios);
-                if (ret) {
+        if (!do_sync && !ret) {
-                        osd_end_request(or);
-                        goto free_args;
-                }
                atomic_inc(&sbi->s_curr_pending);
                goto out; /* deallocation in updatei_done */
        }
+        exofs_put_io_state(ios);
 free_args:
        kfree(args);
 out:
@@ -1238,11 +1232,12 @@ int exofs_write_inode(struct inode *inode, int wait)
 * Callback function from exofs_delete_inode() - don't have much cleaning up to
 * do.
 */
-static void delete_done(struct osd_request *or, void *p)
+static void delete_done(struct exofs_io_state *ios, void *p)
 {
-        struct exofs_sb_info *sbi;
+        struct exofs_sb_info *sbi = p;
-        osd_end_request(or);
-        sbi = p;
+        exofs_put_io_state(ios);
        atomic_dec(&sbi->s_curr_pending);
 }
@@ -1256,8 +1251,7 @@ void exofs_delete_inode(struct inode *inode)
        struct exofs_i_info *oi = exofs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+        struct exofs_io_state *ios;
-        struct osd_request *or;
        int ret;
        truncate_inode_pages(&inode->i_data, 0);
@@ -1274,25 +1268,26 @@ void exofs_delete_inode(struct inode *inode)
        clear_inode(inode);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ret = exofs_get_io_state(sbi, &ios);
-        if (unlikely(!or)) {
+        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+                EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
                return;
        }
-        osd_req_remove_object(or, &obj);
        /* if we are deleting an obj that hasn't been created yet, wait */
        if (!obj_created(oi)) {
                BUG_ON(!obj_2bcreated(oi));
                wait_event(oi->i_wq, obj_created(oi));
        }
-        ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->done = delete_done;
+        ios->private = sbi;
+        ios->cred = oi->i_cred;
+        ret = exofs_sbi_remove(ios);
        if (ret) {
-                EXOFS_ERR(
+                EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
-                       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
+                exofs_put_io_state(ios);
-                osd_end_request(or);
                return;
        }
        atomic_inc(&sbi->s_curr_pending);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
new file mode 100644
index 000000000000..5bad01fa1f9f
--- /dev/null
+++ b/fs/exofs/ios.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <scsi/scsi_device.h>
+#include "exofs.h"
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+        osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+                    u64 offset, void *p, unsigned length)
+{
+        struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+/*      struct osd_sense_info osi = {.key = 0};*/
+        int ret;
+        if (unlikely(!or)) {
+                EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
+                return -ENOMEM;
+        }
+        ret = osd_req_read_kern(or, obj, offset, p, length);
+        if (unlikely(ret)) {
+                EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
+                goto out;
+        }
+        ret = osd_finalize_request(or, 0, cred, NULL);
+        if (unlikely(ret)) {
+                EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+                goto out;
+        }
+        ret = osd_execute_request(or);
+        if (unlikely(ret))
+                EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+        /* osd_req_decode_sense(or, ret); */
+out:
+        osd_end_request(or);
+        return ret;
+}
+int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+{
+        struct exofs_io_state *ios;
+        /*TODO: Maybe use kmem_cach per sbi of size
+         * exofs_io_state_size(sbi->s_numdevs)
+         */
+        ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+        if (unlikely(!ios)) {
+                *pios = NULL;
+                return -ENOMEM;
+        }
+        ios->sbi = sbi;
+        ios->obj.partition = sbi->s_pid;
+        *pios = ios;
+        return 0;
+}
+void exofs_put_io_state(struct exofs_io_state *ios)
+{
+        if (ios) {
+                unsigned i;
+                for (i = 0; i < ios->numdevs; i++) {
+                        struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
+                        if (per_dev->or)
+                                osd_end_request(per_dev->or);
+                        if (per_dev->bio)
+                                bio_put(per_dev->bio);
+                }
+                kfree(ios);
+        }
+}
+static void _sync_done(struct exofs_io_state *ios, void *p)
+{
+        struct completion *waiting = p;
+        complete(waiting);
+}
+static void _last_io(struct kref *kref)
+{
+        struct exofs_io_state *ios = container_of(
+                                        kref, struct exofs_io_state, kref);
+        ios->done(ios, ios->private);
+}
+static void _done_io(struct osd_request *or, void *p)
+{
+        struct exofs_io_state *ios = p;
+        kref_put(&ios->kref, _last_io);
+}
+static int exofs_io_execute(struct exofs_io_state *ios)
+{
+        DECLARE_COMPLETION_ONSTACK(wait);
+        bool sync = (ios->done == NULL);
+        int i, ret;
+        if (sync) {
+                ios->done = _sync_done;
+                ios->private = &wait;
+        }
+        for (i = 0; i < ios->numdevs; i++) {
+                struct osd_request *or = ios->per_dev[i].or;
+                if (unlikely(!or))
+                        continue;
+                ret = osd_finalize_request(or, 0, ios->cred, NULL);
+                if (unlikely(ret)) {
+                        EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+                                     ret);
+                        return ret;
+                }
+        }
+        kref_init(&ios->kref);
+        for (i = 0; i < ios->numdevs; i++) {
+                struct osd_request *or = ios->per_dev[i].or;
+                if (unlikely(!or))
+                        continue;
+                kref_get(&ios->kref);
+                osd_execute_request_async(or, _done_io, ios);
+        }
+        kref_put(&ios->kref, _last_io);
+        ret = 0;
+        if (sync) {
+                wait_for_completion(&wait);
+                ret = exofs_check_io(ios, NULL);
+        }
+        return ret;
+}
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
+{
+        enum osd_err_priority acumulated_osd_err = 0;
+        int acumulated_lin_err = 0;
+        int i;
+        for (i = 0; i < ios->numdevs; i++) {
+                struct osd_sense_info osi;
+                int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
+                if (likely(!ret))
+                        continue;
+                if (unlikely(ret == -EFAULT)) {
+                        EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
+                        /*FIXME: All the pages in this device range should:
+                         *      clear_highpage(page);
+                         */
+                }
+                if (osi.osd_err_pri >= acumulated_osd_err) {
+                        acumulated_osd_err = osi.osd_err_pri;
+                        acumulated_lin_err = ret;
+                }
+        }
+        /* TODO: raid specific residual calculations */
+        if (resid) {
+                if (likely(!acumulated_lin_err))
+                        *resid = 0;
+                else
+                        *resid = ios->length;
+        }
+        return acumulated_lin_err;
+}
+int exofs_sbi_create(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < ios->sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                osd_req_create_object(or, &ios->obj);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int exofs_sbi_remove(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < ios->sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                osd_req_remove_object(or, &ios->obj);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int exofs_sbi_write(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < ios->sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                if (ios->bio) {
+                        struct bio *bio;
+                        if (i != 0) {
+                                bio = bio_kmalloc(GFP_KERNEL,
+                                                  ios->bio->bi_max_vecs);
+                                if (unlikely(!bio)) {
+                                        ret = -ENOMEM;
+                                        goto out;
+                                }
+                                __bio_clone(bio, ios->bio);
+                                bio->bi_bdev = NULL;
+                                bio->bi_next = NULL;
+                                ios->per_dev[i].bio =  bio;
+                        } else {
+                                bio = ios->bio;
+                        }
+                        osd_req_write(or, &ios->obj, ios->offset, bio,
+                                      ios->length);
+/*                      EXOFS_DBGMSG("write sync=%d\n", sync);*/
+                } else if (ios->kern_buff) {
+                        osd_req_write_kern(or, &ios->obj, ios->offset,
+                                           ios->kern_buff, ios->length);
+/*                      EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
+                } else {
+                        osd_req_set_attributes(or, &ios->obj);
+/*                      EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
+                }
+                if (ios->out_attr)
+                        osd_req_add_set_attr_list(or, ios->out_attr,
+                                                  ios->out_attr_len);
+                if (ios->in_attr)
+                        osd_req_add_get_attr_list(or, ios->in_attr,
+                                                  ios->in_attr_len);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < 1; i++) {
+                struct osd_request *or;
+                unsigned first_dev = (unsigned)ios->obj.id;
+                first_dev %= ios->sbi->s_numdevs;
+                or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                if (ios->bio) {
+                        osd_req_read(or, &ios->obj, ios->offset, ios->bio,
+                                     ios->length);
+/*                      EXOFS_DBGMSG("read sync=%d\n", sync);*/
+                } else if (ios->kern_buff) {
+                        osd_req_read_kern(or, &ios->obj, ios->offset,
+                                           ios->kern_buff, ios->length);
+/*                      EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
+                } else {
+                        osd_req_get_attributes(or, &ios->obj);
+/*                      EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
+                }
+                if (ios->out_attr)
+                        osd_req_add_set_attr_list(or, ios->out_attr,
+                                                  ios->out_attr_len);
+                if (ios->in_attr)
+                        osd_req_add_get_attr_list(or, ios->in_attr,
+                                                  ios->in_attr_len);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
+{
+        struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+        void *iter = NULL;
+        int nelem;
+        do {
+                nelem = 1;
+                osd_req_decode_get_attr_list(ios->per_dev[0].or,
+                                             &cur_attr, &nelem, &iter);
+                if ((cur_attr.attr_page == attr->attr_page) &&
+                    (cur_attr.attr_id == attr->attr_id)) {
+                        attr->len = cur_attr.len;
+                        attr->val_ptr = cur_attr.val_ptr;
+                        return 0;
+                }
+        } while (iter);
+        return -EIO;
+}
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
+{
+        struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
+        struct exofs_io_state *ios;
+        struct osd_attr attr;
+        __be64 newsize;
+        int i, ret;
+        if (exofs_get_io_state(sbi, &ios))
+                return -ENOMEM;
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->cred = oi->i_cred;
+        newsize = cpu_to_be64(size);
+        attr = g_attr_logical_length;
+        attr.val_ptr = &newsize;
+        for (i = 0; i < sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                osd_req_set_attributes(or, &ios->obj);
+                osd_req_add_set_attr_list(or, &attr, 1);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        exofs_put_io_state(ios);
+        return ret;
+}
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
deleted file mode 100644
index 4372542df284..000000000000
--- a/fs/exofs/osd.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <scsi/scsi_device.h>
-#include <scsi/osd_sense.h>
-#include "exofs.h"
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
-{
-        struct osd_sense_info osi;
-        int ret = osd_req_decode_sense(or, &osi);
-        if (ret) { /* translate to Linux codes */
-                if (osi.additional_code == scsi_invalid_field_in_cdb) {
-                        if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
-                                ret = -EFAULT;
-                        if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
-                                ret = -ENOENT;
-                        else
-                                ret = -EINVAL;
-                } else if (osi.additional_code == osd_quota_error)
-                        ret = -ENOSPC;
-                else
-                        ret = -EIO;
-        }
-        /* FIXME: should be include in osd_sense_info */
-        if (in_resid)
-                *in_resid = or->in.req ? or->in.req->resid_len : 0;
-        if (out_resid)
-                *out_resid = or->out.req ? or->out.req->resid_len : 0;
-        return ret;
-}
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
-        osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-/*
- * Perform a synchronous OSD operation.
- */
-int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
-{
-        int ret;
-        or->timeout = timeout;
-        ret = osd_finalize_request(or, 0, credential, NULL);
-        if (ret) {
-                EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-                return ret;
-        }
-        ret = osd_execute_request(or);
-        if (ret)
-                EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
-        /* osd_req_decode_sense(or, ret); */
-        return ret;
-}
-/*
- * Perform an asynchronous OSD operation.
- */
-int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
-                   void *caller_context, u8 *cred)
-{
-        int ret;
-        ret = osd_finalize_request(or, 0, cred, NULL);
-        if (ret) {
-                EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-                return ret;
-        }
-        ret = osd_execute_request_async(or, async_done, caller_context);
-        if (ret)
-                EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
-        return ret;
-}
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
-{
-        struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
-        void *iter = NULL;
-        int nelem;
-        do {
-                nelem = 1;
-                osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
-                if ((cur_attr.attr_page == attr->attr_page) &&
-                    (cur_attr.attr_id == attr->attr_id)) {
-                        attr->len = cur_attr.len;
-                        attr->val_ptr = cur_attr.val_ptr;
-                        return 0;
-                }
-        } while (iter);
-        return -EIO;
-}
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
new file mode 100644
index 000000000000..423033addd1f
--- /dev/null
+++ b/fs/exofs/pnfs.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License  version 2 as published by the Free
+ * Software Foundation.
+ *
+ */
+/* FIXME: Remove this file once pnfs hits mainline */
+#ifndef __EXOFS_PNFS_H__
+#define __EXOFS_PNFS_H__
+#if defined(CONFIG_PNFS)
+/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
+#include "../nfs/objlayout/pnfs_osd_xdr.h"
+#else /* defined(CONFIG_PNFS) */
+enum pnfs_iomode { /* local definition, compiled only when CONFIG_PNFS is not set */
+        IOMODE_READ = 1,
+        IOMODE_RW = 2,
+        IOMODE_ANY = 3,
+};
+/* Layout Structure
+ *
+ * Values compared against the odm_raid_algorithm field of
+ * struct pnfs_osd_data_map. The mount-time data-map check in super.c
+ * accepts only PNFS_OSD_RAID_0.
+ */
+enum pnfs_osd_raid_algorithm4 {
+        PNFS_OSD_RAID_0         = 1,
+        PNFS_OSD_RAID_4         = 2,
+        PNFS_OSD_RAID_5         = 3,
+        PNFS_OSD_RAID_PQ        = 4     /* Reed-Solomon P+Q */
+};
+struct pnfs_osd_data_map { /* CPU-order copy of the on-disk dt_data_map */
+        u32     odm_num_comps;          /* mount requires: == number of devices */
+        u64     odm_stripe_unit;        /* mount requires: == EXOFS_BLKSIZE */
+        u32     odm_group_width;
+        u32     odm_group_depth;
+        u32     odm_mirror_cnt;         /* mount requires: == number of devices - 1 */
+        u32     odm_raid_algorithm;     /* mount requires: == PNFS_OSD_RAID_0 */
+};
+#endif /* else defined(CONFIG_PNFS) */
+#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f500dec3b59..a1d1e77b12eb 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -203,49 +203,45 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 {
        struct exofs_sb_info *sbi;
        struct exofs_fscb *fscb;
-        struct osd_request *or;
+        struct exofs_io_state *ios;
-        struct osd_obj_id obj;
        int ret = -ENOMEM;
-        fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
-        if (!fscb) {
-                EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-                return -ENOMEM;
-        }
        lock_super(sb);
        sbi = sb->s_fs_info;
+        fscb = &sbi->s_fscb;
+        ret = exofs_get_io_state(sbi, &ios);
+        if (ret)
+                goto out;
+        /* Note: We only write the changing part of the fscb, i.e. up to the
+         *       fscb->s_dev_table_oid member. There is no read-modify-write
+         *       here.
+         */
+        ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
+        memset(fscb, 0, ios->length);
        fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
        fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
        fscb->s_magic = cpu_to_le16(sb->s_magic);
        fscb->s_newfs = 0;
+        fscb->s_version = EXOFS_FSCB_VER;
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ios->obj.id = EXOFS_SUPER_ID;
-        if (unlikely(!or)) {
+        ios->offset = 0;
-                EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
+        ios->kern_buff = fscb;
-                goto out;
+        ios->cred = sbi->s_cred;
-        }
-        obj.partition = sbi->s_pid;
+        ret = exofs_sbi_write(ios);
-        obj.id = EXOFS_SUPER_ID;
-        ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
+                EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
-                goto out;
-        }
-        ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
                goto out;
        }
        sb->s_dirt = 0;
 out:
-        if (or)
+        EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
-                osd_end_request(or);
+        exofs_put_io_state(ios);
        unlock_super(sb);
-        kfree(fscb);
        return ret;
 }
@@ -257,6 +253,29 @@ static void exofs_write_super(struct super_block *sb)
                sb->s_dirt = 0;
 }
+/*
+ * Log a one-line notice about device @od: the action in @msg, the optional
+ * @dev_path (NULL prints as an empty string), the osd name and @pid.
+ */
+static void _exofs_print_device(const char *msg, const char *dev_path,
+                                struct osd_dev *od, u64 pid)
+{
+        const struct osd_dev_info *info = osduld_device_info(od);
+        const char *path = dev_path ? dev_path : "";
+        printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
+                msg, path, info->osdname, _LLU(pid));
+}
+/*
+ * Release every device still recorded in @sbi, last slot first, and then
+ * free @sbi itself. Empty (NULL) slots are skipped.
+ */
+void exofs_free_sbi(struct exofs_sb_info *sbi)
+{
+        while (sbi->s_numdevs != 0) {
+                struct osd_dev *od;
+                int slot;
+                sbi->s_numdevs--;
+                slot = sbi->s_numdevs;
+                od = sbi->s_ods[slot];
+                if (!od)
+                        continue;
+                /* Clear the slot before dropping the reference */
+                sbi->s_ods[slot] = NULL;
+                osduld_put_device(od);
+        }
+        kfree(sbi);
+}
 /*
 * This function is called when the vfs is freeing the superblock.  We just
 * need to free our own part.
@@ -279,11 +298,182 @@ static void exofs_put_super(struct super_block *sb)
                                  msecs_to_jiffies(100));
        }
-        osduld_put_device(sbi->s_dev);
+        _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
-        kfree(sb->s_fs_info);
+        exofs_free_sbi(sbi);
        sb->s_fs_info = NULL;
 }
+/*
+ * Copy the data map of the on-disk device table @dt into sbi->data_map,
+ * converting from little-endian, and check it against the only layout that
+ * is accepted for now. Returns 0 on a match and -EINVAL otherwise.
+ */
+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
+                                    struct exofs_device_table *dt)
+{
+        sbi->data_map.odm_num_comps =
+                le32_to_cpu(dt->dt_data_map.cb_num_comps);
+        sbi->data_map.odm_stripe_unit =
+                le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
+        sbi->data_map.odm_group_width =
+                le32_to_cpu(dt->dt_data_map.cb_group_width);
+        sbi->data_map.odm_group_depth =
+                le32_to_cpu(dt->dt_data_map.cb_group_depth);
+        sbi->data_map.odm_mirror_cnt =
+                le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
+        sbi->data_map.odm_raid_algorithm =
+                le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
+        /* FIXME: Hard coded mirror only for now. If not so, do not mount */
+        if (sbi->data_map.odm_num_comps != numdevs)
+                return -EINVAL;
+        if (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE)
+                return -EINVAL;
+        if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0)
+                return -EINVAL;
+        if (sbi->data_map.odm_mirror_cnt != (numdevs - 1))
+                return -EINVAL;
+        return 0;
+}
+/*
+ * Fill @odi from the on-disk device-table entry @dt_dev.
+ *
+ * @odi->osdname points into @dt_dev, so @odi is valid only as long as
+ * @dt_dev is valid. @dt_dev->osdname is NUL-terminated in place.
+ *
+ * Returns 0 on success, -EINVAL if the entry cannot be used (system id
+ * length larger than the buffers, or a long name), and 1 if both lengths
+ * are zero (an all-zeros entry).
+ */
+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
+                             struct osd_dev_info *odi)
+{
+        odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
+        /* The length comes from disk: never copy more than either buffer
+         * can hold.
+         */
+        if (unlikely(odi->systemid_len > sizeof(odi->systemid) ||
+                     odi->systemid_len > sizeof(dt_dev->systemid)))
+                return -EINVAL;
+        memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+        odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
+        odi->osdname = dt_dev->osdname;
+        /* FIXME support long names. Will need a _put function */
+        if (dt_dev->long_name_offset)
+                return -EINVAL;
+        /* Make sure osdname is printable!
+         * mkexofs should give us space for a null-terminator else the
+         * device-table is invalid.
+         */
+        if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
+                odi->osdname_len = sizeof(dt_dev->osdname) - 1;
+        dt_dev->osdname[odi->osdname_len] = 0;
+        /* If it's all zeros something is bad we read past end-of-obj */
+        return !(odi->systemid_len || odi->osdname_len);
+}
+/*
+ * Read the device table through the device given at mount time, look up
+ * every device listed in it and record them in sbi->s_ods[] in device-table
+ * order.
+ *
+ * @psbi:        in/out. *psbi is replaced when the sbi has to be reallocated
+ *               to hold more than one device.
+ * @table_count: number of device entries to allocate room for and read.
+ *
+ * Returns 0 on success or a negative errno. On failure the devices already
+ * recorded in sbi->s_ods[] stay there for the caller to release with
+ * exofs_free_sbi(); the mount-time device is released here if it was not
+ * placed back into the table.
+ */
+static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
+                                       unsigned table_count)
+{
+        struct exofs_sb_info *sbi = *psbi;
+        struct osd_dev *fscb_od;
+        struct osd_obj_id obj = {.partition = sbi->s_pid,
+                                 .id = EXOFS_DEVTABLE_ID};
+        struct exofs_device_table *dt;
+        unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
+                                             sizeof(*dt);
+        unsigned numdevs, i;
+        int ret;
+        dt = kmalloc(table_bytes, GFP_KERNEL);
+        if (unlikely(!dt)) {
+                EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
+                          table_bytes);
+                return -ENOMEM;
+        }
+        /* Detach the mount-time device from the sbi. From here on it is
+         * owned by this function until the loop below stores it back at its
+         * device-table position.
+         */
+        fscb_od = sbi->s_ods[0];
+        sbi->s_ods[0] = NULL;
+        sbi->s_numdevs = 0;
+        ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
+        if (unlikely(ret)) {
+                EXOFS_ERR("ERROR: reading device table\n");
+                goto out;
+        }
+        numdevs = le64_to_cpu(dt->dt_num_devices);
+        if (unlikely(!numdevs)) {
+                ret = -EINVAL;
+                goto out;
+        }
+        WARN_ON(table_count != numdevs);
+        /* Only table_count entries were allocated and read. Walking more
+         * than that would run past the end of @dt.
+         */
+        if (unlikely(numdevs > table_count)) {
+                ret = -EINVAL;
+                goto out;
+        }
+        ret = _read_and_match_data_map(sbi, numdevs, dt);
+        if (unlikely(ret))
+                goto out;
+        if (likely(numdevs > 1)) {
+                unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+                struct exofs_sb_info *new_sbi;
+                /* Keep the old pointer: krealloc leaves it valid on failure
+                 * and the caller still has to free it.
+                 */
+                new_sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
+                if (unlikely(!new_sbi)) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                sbi = new_sbi;
+                memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+                *psbi = sbi;
+        }
+        for (i = 0; i < numdevs; i++) {
+                struct exofs_fscb fscb;
+                struct osd_dev_info odi;
+                struct osd_dev *od;
+                if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
+                        EXOFS_ERR("ERROR: Read all-zeros device entry\n");
+                        ret = -EINVAL;
+                        goto out;
+                }
+                printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
+                       i, odi.osdname);
+                /* On all devices the device table is identical. The user can
+                 * specify any one of the participating devices on the command
+                 * line. We always keep them in device-table order.
+                 */
+                if (fscb_od && osduld_device_same(fscb_od, &odi)) {
+                        sbi->s_ods[i] = fscb_od;
+                        ++sbi->s_numdevs;
+                        fscb_od = NULL; /* now owned by the sbi */
+                        continue;
+                }
+                od = osduld_info_lookup(&odi);
+                if (unlikely(IS_ERR(od))) {
+                        ret = PTR_ERR(od);
+                        EXOFS_ERR("ERROR: device requested is not found "
+                                  "osd_name-%s =>%d\n", odi.osdname, ret);
+                        goto out;
+                }
+                sbi->s_ods[i] = od;
+                ++sbi->s_numdevs;
+                /* Read the fscb of the other devices to make sure the FS
+                 * partition is there.
+                 */
+                ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
+                                      sizeof(fscb));
+                if (unlikely(ret)) {
+                        EXOFS_ERR("ERROR: Malformed participating device "
+                                  "error reading fscb osd_name-%s\n",
+                                  odi.osdname);
+                        goto out;
+                }
+                /* TODO: verify other information is correct and FS-uuid
+                 *       matches. Benny what did you say about device table
+                 *       generation and old devices?
+                 */
+        }
+out:
+        kfree(dt);
+        /* A still-detached mount-time device is referenced by nobody else.
+         * Drop it on every exit, and turn an otherwise successful run into
+         * an error because the table did not list that device.
+         */
+        if (unlikely(fscb_od)) {
+                if (!ret) {
+                        EXOFS_ERR(
+                      "ERROR: Bad device-table container device not present\n");
+                        ret = -EINVAL;
+                }
+                osduld_put_device(fscb_od);
+        }
+        return ret;
+}
 /*
 * Read the superblock from the OSD and fill in the fields
 */
@@ -292,24 +482,25 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
        struct inode *root;
        struct exofs_mountopt *opts = data;
        struct exofs_sb_info *sbi;      /*extended info                  */
+        struct osd_dev *od;             /* Master device                 */
        struct exofs_fscb fscb;         /*on-disk superblock info        */
-        struct osd_request *or = NULL;
        struct osd_obj_id obj;
+        unsigned table_count;
        int ret;
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
-        sb->s_fs_info = sbi;
        /* use mount options to fill superblock */
-        sbi->s_dev = osduld_path_lookup(opts->dev_name);
+        od = osduld_path_lookup(opts->dev_name);
-        if (IS_ERR(sbi->s_dev)) {
+        if (IS_ERR(od)) {
-                ret = PTR_ERR(sbi->s_dev);
+                ret = PTR_ERR(od);
-                sbi->s_dev = NULL;
                goto free_sbi;
        }
+        sbi->s_ods[0] = od;
+        sbi->s_numdevs = 1;
        sbi->s_pid = opts->pid;
        sbi->s_timeout = opts->timeout;
@@ -323,35 +514,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_bdev = NULL;
        sb->s_dev = 0;
-        /* read data from on-disk superblock object */
        obj.partition = sbi->s_pid;
        obj.id = EXOFS_SUPER_ID;
        exofs_make_credential(sbi->s_cred, &obj);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
-        if (unlikely(!or)) {
+        if (unlikely(ret))
-                if (!silent)
-                        EXOFS_ERR(
-                               "exofs_fill_super: osd_start_request failed.\n");
-                ret = -ENOMEM;
-                goto free_sbi;
-        }
-        ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
-        if (unlikely(ret)) {
-                if (!silent)
-                        EXOFS_ERR(
-                               "exofs_fill_super: osd_req_read_kern failed.\n");
-                ret = -ENOMEM;
-                goto free_sbi;
-        }
-        ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-        if (unlikely(ret)) {
-                if (!silent)
-                        EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
-                ret = -EIO;
                goto free_sbi;
-        }
        sb->s_magic = le16_to_cpu(fscb.s_magic);
        sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
@@ -364,12 +533,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
                ret = -EINVAL;
                goto free_sbi;
        }
+        if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+                EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
+                          EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
+                ret = -EINVAL;
+                goto free_sbi;
+        }
        /* start generation numbers from a random point */
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
+        table_count = le64_to_cpu(fscb.s_dev_table_count);
+        if (table_count) {
+                ret = exofs_read_lookup_dev_table(&sbi, table_count);
+                if (unlikely(ret))
+                        goto free_sbi;
+        }
        /* set up operation vectors */
+        sb->s_fs_info = sbi;
        sb->s_op = &exofs_sops;
        sb->s_export_op = &exofs_export_ops;
        root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
@@ -395,16 +578,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
                goto free_sbi;
        }
-        ret = 0;
+        _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
-out:
+                            sbi->s_pid);
-        if (or)
+        return 0;
-                osd_end_request(or);
-        return ret;
 free_sbi:
-        osduld_put_device(sbi->s_dev); /* NULL safe */
+        EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
-        kfree(sbi);
+                  opts->dev_name, sbi->s_pid, ret);
-        goto out;
+        exofs_free_sbi(sbi);
+        return ret;
 }
 /*
@@ -433,7 +615,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_obj_id obj = {sbi->s_pid, 0};
+        struct exofs_io_state *ios;
        struct osd_attr attrs[] = {
                ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
                        OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -442,32 +624,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
        };
        uint64_t capacity = ULLONG_MAX;
        uint64_t used = ULLONG_MAX;
-        struct osd_request *or;
        uint8_t cred_a[OSD_CAP_LEN];
        int ret;
-        /* get used/capacity attributes */
+        ret = exofs_get_io_state(sbi, &ios);
-        exofs_make_credential(cred_a, &obj);
+        if (ret) {
+                EXOFS_DBGMSG("exofs_get_io_state failed.\n");
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+                return ret;
-        if (unlikely(!or)) {
-                EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
-                return -ENOMEM;
        }
-        osd_req_get_attributes(or, &obj);
+        exofs_make_credential(cred_a, &ios->obj);
-        osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
+        ios->cred = sbi->s_cred;
-        ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+        ios->in_attr = attrs;
+        ios->in_attr_len = ARRAY_SIZE(attrs);
+        ret = exofs_sbi_read(ios);
        if (unlikely(ret))
                goto out;
-        ret = extract_attr_from_req(or, &attrs[0]);
+        ret = extract_attr_from_ios(ios, &attrs[0]);
-        if (likely(!ret))
+        if (likely(!ret)) {
                capacity = get_unaligned_be64(attrs[0].val_ptr);
-        else
+                if (unlikely(!capacity))
+                        capacity = ULLONG_MAX;
+        } else
                EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
-        ret = extract_attr_from_req(or, &attrs[1]);
+        ret = extract_attr_from_ios(ios, &attrs[1]);
        if (likely(!ret))
                used = get_unaligned_be64(attrs[1].val_ptr);
        else
@@ -476,15 +659,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
        /* fill in the stats buffer */
        buf->f_type = EXOFS_SUPER_MAGIC;
        buf->f_bsize = EXOFS_BLKSIZE;
-        buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
+        buf->f_blocks = capacity >> 9;
-        buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+        buf->f_bfree = (capacity - used) >> 9;
        buf->f_bavail = buf->f_bfree;
        buf->f_files = sbi->s_numfiles;
        buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
        buf->f_namelen = EXOFS_NAME_LEN;
 out:
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        return ret;
 }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b47b30..2db957778903 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2033,7 +2033,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth,
        int k, err;
        *top = 0;
-        /* Make k index the deepest non-null offest + 1 */
+        /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext3_get_branch(inode, k, offsets, chain, &err);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9f2d45d75b1a..9acf7e808139 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,6 +26,16 @@ config EXT4_FS
          If unsure, say N.
+config EXT4_USE_FOR_EXT23
+        bool "Use ext4 for ext2/ext3 file systems"
+        depends on EXT4_FS
+        depends on EXT3_FS=n || EXT2_FS=n
+        default y
+        help
+          Allow the ext4 file system driver code to be used for ext2 or
+          ext3 file system mounts.  This allows users to reduce their
+          compiled kernel size by using one file system driver for
+          ext2, ext3, and ext4 file systems.
 config EXT4_FS_XATTR
        bool "Ext4 extended attributes"
        depends on EXT4_FS
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8d..22bc7435d913 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -499,44 +499,6 @@ error_return:
 }
 /**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle:             handle for this transaction
- * @inode:              inode
- * @block:              start physical block to free
- * @count:              number of blocks to count
- * @metadata:           Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t block, unsigned long count,
-                        int metadata)
-{
-        struct super_block *sb;
-        unsigned long dquot_freed_blocks;
-        /* this isn't the right place to decide whether block is metadata
-         * inode.c/extents.c knows better, but for safety ... */
-        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                metadata = 1;
-        /* We need to make sure we don't reuse
-         * block released untill the transaction commit.
-         * writeback mode have weak data consistency so
-         * don't force data as metadata when freeing block
-         * for writeback mode.
-         */
-        if (metadata == 0 && !ext4_should_writeback_data(inode))
-                metadata = 1;
-        sb = inode->i_sb;
-        ext4_mb_free_blocks(handle, inode, block, count,
-                            metadata, &dquot_freed_blocks);
-        if (dquot_freed_blocks)
-                vfs_dq_free_block(inode, dquot_freed_blocks);
-        return;
-}
-/**
 * ext4_has_free_blocks()
 * @sbi:        in-core super block structure.
 * @nblocks:    number of needed blocks
@@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
                                        ext4_group_t group)
 {
-        return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+        if (!ext4_bg_has_super(sb, group))
+                return 0;
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+                return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+        else
+                return EXT4_SB(sb)->s_gdb_count;
 }
 /**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..4df8621ec31c 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
                if (ext4_bg_has_super(sb, i) &&
                    ((i < 5) || ((i % flex_size) == 0)))
                        add_system_zone(sbi, ext4_group_first_block_no(sb, i),
-                                        sbi->s_gdb_count + 1);
+                                        ext4_bg_num_gdb(sb, i) + 1);
                gdp = ext4_get_group_desc(sb, i, NULL);
                ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
                if (ret)
@@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
        struct rb_node *n = sbi->system_blks.rb_node;
        if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+            (start_blk + count < start_blk) ||
            (start_blk + count > ext4_blocks_count(sbi->s_es)))
                return 0;
        while (n) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8825515eeddd..ab31e65d46d0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -376,6 +376,12 @@ struct ext4_new_group_data {
                                         EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 /*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA       0x0001
+#define EXT4_FREE_BLOCKS_FORGET         0x0002
+/*
 * ioctl commands
 */
 #define EXT4_IOC_GETFLAGS               FS_IOC_GETFLAGS
@@ -703,6 +709,13 @@ struct ext4_inode_info {
        struct list_head i_aio_dio_complete_list;
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
+        /*
+         * Transactions that contain inode's metadata needed to complete
+         * fsync and fdatasync, respectively.
+         */
+        tid_t i_sync_tid;
+        tid_t i_datasync_tid;
 };
 /*
@@ -750,6 +763,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DELALLOC             0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD              0x40000000 /* Issue DISCARD requests */
 #define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)                 o |= EXT4_MOUNT_##opt
@@ -1324,8 +1338,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1384,16 +1396,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
-extern void ext4_mb_free_blocks(handle_t *, struct inode *,
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                ext4_fsblk_t, unsigned long, int, unsigned long *);
+                             struct buffer_head *bh, ext4_fsblk_t block,
+                             unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
                                                ext4_group_t, int);
 /* inode.c */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920dee..b57e5c711b6d 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
 #include "ext4_jbd2.h"
+#include <trace/events/ext4.h>
 int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
        return err;
 }
-int __ext4_journal_forget(const char *where, handle_t *handle,
+/*
-                                struct buffer_head *bh)
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled.  Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling, so we simply bforget()
+ * the buffer head and return.
+ */
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+                  struct inode *inode, struct buffer_head *bh,
+                  ext4_fsblk_t blocknr)
 {
-        int err = 0;
+        int err;
-        if (ext4_handle_valid(handle)) {
+        might_sleep();
-                err = jbd2_journal_forget(handle, bh);
-                if (err)
+        trace_ext4_forget(inode, is_metadata, blocknr);
-                        ext4_journal_abort_handle(where, __func__, bh,
+        BUFFER_TRACE(bh, "enter");
-                                                  handle, err);
-        }
+        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-        else
+                  "data mode %x\n",
+                  bh, is_metadata, inode->i_mode,
+                  test_opt(inode->i_sb, DATA_FLAGS));
+        /* In the no journal case, we can just do a bforget and return */
+        if (!ext4_handle_valid(handle)) {
                bforget(bh);
-        return err;
+                return 0;
-}
+        }
-int __ext4_journal_revoke(const char *where, handle_t *handle,
+        /* Never use the revoke function if we are doing full data
-                                ext4_fsblk_t blocknr, struct buffer_head *bh)
+         * journaling: there is no need to, and a V1 superblock won't
-{
+         * support it.  Otherwise, only skip the revoke on un-journaled
-        int err = 0;
+         * data blocks. */
-        if (ext4_handle_valid(handle)) {
+        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-                err = jbd2_journal_revoke(handle, blocknr, bh);
+            (!is_metadata && !ext4_should_journal_data(inode))) {
-                if (err)
+                if (bh) {
-                        ext4_journal_abort_handle(where, __func__, bh,
+                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                                                  handle, err);
+                        err = jbd2_journal_forget(handle, bh);
+                        if (err)
+                                ext4_journal_abort_handle(where, __func__, bh,
+                                                          handle, err);
+                        return err;
+                }
+                return 0;
        }
-        else
-                bforget(bh);
+        /*
+         * data!=journal && (is_metadata || should_journal_data(inode))
+         */
+        BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+        err = jbd2_journal_revoke(handle, blocknr, bh);
+        if (err) {
+                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+                ext4_abort(inode->i_sb, __func__,
+                           "error %d when attempting revoke", err);
+        }
+        BUFFER_TRACE(bh, "exit");
        return err;
 }
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a2865980342f..05eca817d704 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
 #define EXT4_DATA_TRANS_BLOCKS(sb)      (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
-                                         2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+                                         EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
 /*
 * Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
 * This include super block, inode block, quota blocks and xattr blocks
 */
 #define EXT4_META_TRANS_BLOCKS(sb)      (EXT4_XATTR_TRANS_BLOCKS + \
-                                        2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+                                        EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
 /* Delete operations potentially hit one directory's namespace plus an
 * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
@@ -92,6 +92,7 @@
 * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
                (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
                (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
 #else
@@ -99,6 +100,9 @@
 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
 #endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 int
 ext4_mark_iloc_dirty(handle_t *handle,
@@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
 /*
- * Wrapper functions with which ext4 calls into JBD.  The intent here is
+ * Wrapper functions with which ext4 calls into JBD.
- * to allow these to be turned into appropriate stubs so ext4 can control
- * ext2 filesystems, so ext2+ext4 systems only nee one fs.  This work hasn't
- * been done yet.
 */
 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
                struct buffer_head *bh, handle_t *handle, int err);
@@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
                                struct buffer_head *bh);
-/* When called with an invalid handle, this will still do a put on the BH */
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
-int __ext4_journal_forget(const char *where, handle_t *handle,
+                  struct inode *inode, struct buffer_head *bh,
-                                struct buffer_head *bh);
+                  ext4_fsblk_t blocknr);
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_revoke(const char *where, handle_t *handle,
-                                ext4_fsblk_t blocknr, struct buffer_head *bh);
 int __ext4_journal_get_create_access(const char *where,
                                handle_t *handle, struct buffer_head *bh);
@@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
        __ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
        __ext4_journal_get_write_access(__func__, (handle), (bh))
-#define ext4_journal_revoke(handle, blocknr, bh) \
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
-        __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
+        __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
+                      (block_nr))
 #define ext4_journal_get_create_access(handle, bh) \
        __ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_forget(handle, bh) \
-        __ext4_journal_forget(__func__, (handle), (bh))
 #define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
@@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
        return 0;
 }
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+                                                 struct inode *inode,
+                                                 int datasync)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        if (ext4_handle_valid(handle)) {
+                ei->i_sync_tid = handle->h_transaction->t_tid;
+                if (datasync)
+                        ei->i_datasync_tid = handle->h_transaction->t_tid;
+        }
+}
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b4bae4..3a7928f825e4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1007,7 +1007,8 @@ cleanup:
                for (i = 0; i < depth; i++) {
                        if (!ablocks[i])
                                continue;
-                        ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
+                        ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+                                         EXT4_FREE_BLOCKS_METADATA);
                }
        }
        kfree(ablocks);
@@ -1761,7 +1762,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
        while (block < last && block != EXT_MAX_BLOCK) {
                num = last - block;
                /* find extent for this block */
+                down_read(&EXT4_I(inode)->i_data_sem);
                path = ext4_ext_find_extent(inode, block, path);
+                up_read(&EXT4_I(inode)->i_data_sem);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        path = NULL;
@@ -1957,7 +1960,6 @@ errout:
 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path)
 {
-        struct buffer_head *bh;
        int err;
        ext4_fsblk_t leaf;
@@ -1973,9 +1975,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        if (err)
                return err;
        ext_debug("index is empty, remove it, free block %llu\n", leaf);
-        bh = sb_find_get_block(inode->i_sb, leaf);
+        ext4_free_blocks(handle, inode, 0, leaf, 1,
-        ext4_forget(handle, 1, inode, bh, leaf);
+                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
-        ext4_free_blocks(handle, inode, leaf, 1, 1);
        return err;
 }
@@ -2042,12 +2043,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_extent *ex,
                                ext4_lblk_t from, ext4_lblk_t to)
 {
-        struct buffer_head *bh;
        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
-        int i, metadata = 0;
+        int flags = EXT4_FREE_BLOCKS_FORGET;
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                metadata = 1;
+                flags |= EXT4_FREE_BLOCKS_METADATA;
 #ifdef EXTENTS_STATS
        {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2072,11 +2072,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
                start = ext_pblock(ex) + ee_len - num;
                ext_debug("free last %u blocks starting %llu\n", num, start);
-                for (i = 0; i < num; i++) {
+                ext4_free_blocks(handle, inode, 0, start, num, flags);
-                        bh = sb_find_get_block(inode->i_sb, start + i);
-                        ext4_forget(handle, 0, inode, bh, start + i);
-                }
-                ext4_free_blocks(handle, inode, start, num, metadata);
        } else if (from == le32_to_cpu(ex->ee_block)
                   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
                printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2167,7 +2163,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        correct_index = 1;
                        credits += (ext_depth(inode)) + 1;
                }
-                credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
                err = ext4_ext_truncate_extend_restart(handle, inode, credits);
                if (err)
@@ -3064,6 +3060,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
                ret = ext4_convert_unwritten_extents_dio(handle, inode,
                                                        path);
+                if (ret >= 0)
+                        ext4_update_inode_fsync_trans(handle, inode, 1);
                goto out2;
        }
        /* buffered IO case */
@@ -3091,6 +3089,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        ret = ext4_ext_convert_to_initialized(handle, inode,
                                                path, iblock,
                                                max_blocks);
+        if (ret >= 0)
+                ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
        if (ret <= 0) {
                err = ret;
@@ -3319,8 +3319,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                /* not a good idea to call discard here directly,
                 * but otherwise we'd need to call it every free() */
                ext4_discard_preallocations(inode);
-                ext4_free_blocks(handle, inode, ext_pblock(&newex),
+                ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
-                                        ext4_ext_get_actual_len(&newex), 0);
+                                 ext4_ext_get_actual_len(&newex), 0);
                goto out2;
        }
@@ -3329,10 +3329,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        allocated = ext4_ext_get_actual_len(&newex);
        set_buffer_new(bh_result);
-        /* Cache only when it is _not_ an uninitialized extent */
+        /*
-        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+         * Cache the extent and update transaction to commit on fdatasync only
+         * when it is _not_ an uninitialized extent.
+         */
+        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
                ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
                                                EXT4_EXT_CACHE_EXTENT);
+                ext4_update_inode_fsync_trans(handle, inode, 1);
+        } else
+                ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
        if (allocated > max_blocks)
                allocated = max_blocks;
@@ -3720,10 +3726,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 * Walk the extent tree gathering extent information.
                 * ext4_ext_fiemap_cb will push extents back to user.
                 */
-                down_read(&EXT4_I(inode)->i_data_sem);
                error = ext4_ext_walk_space(inode, start_blk, len_blks,
                                          ext4_ext_fiemap_cb, fieinfo);
-                up_read(&EXT4_I(inode)->i_data_sem);
        }
        return error;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b1531266ee2..0b22497d92e1 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
        struct inode *inode = dentry->d_inode;
+        struct ext4_inode_info *ei = EXT4_I(inode);
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-        int err, ret = 0;
+        int ret;
+        tid_t commit_tid;
        J_ASSERT(ext4_journal_current_handle() == NULL);
        trace_ext4_sync_file(file, dentry, datasync);
+        if (inode->i_sb->s_flags & MS_RDONLY)
+                return 0;
        ret = flush_aio_dio_completed_IO(inode);
        if (ret < 0)
-                goto out;
+                return ret;
+        
+        if (!journal)
+                return simple_fsync(file, dentry, datasync);
        /*
-         * data=writeback:
+         * data=writeback,ordered:
         *  The caller's filemap_fdatawrite()/wait will sync the data.
-         *  sync_inode() will sync the metadata
+         *  Metadata is in the journal, we wait for proper transaction to
-         *
+         *  commit here.
-         * data=ordered:
-         *  The caller's filemap_fdatawrite() will write the data and
-         *  sync_inode() will write the inode if it is dirty.  Then the caller's
-         *  filemap_fdatawait() will wait on the pages.
         *
         * data=journal:
         *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -79,32 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
         *  (they were dirtied by commit).  But that's OK - the blocks are
         *  safe in-journal, which is all fsync() needs to ensure.
         */
-        if (ext4_should_journal_data(inode)) {
+        if (ext4_should_journal_data(inode))
-                ret = ext4_force_commit(inode->i_sb);
+                return ext4_force_commit(inode->i_sb);
-                goto out;
-        }
-        if (!journal)
+        commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-                ret = sync_mapping_buffers(inode->i_mapping);
+        if (jbd2_log_start_commit(journal, commit_tid))
+                jbd2_log_wait_commit(journal, commit_tid);
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+        else if (journal->j_flags & JBD2_BARRIER)
-                goto out;
-        /*
-         * The VFS has written the file data.  If the inode is unaltered
-         * then we need not start a commit.
-         */
-        if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-                struct writeback_control wbc = {
-                        .sync_mode = WB_SYNC_ALL,
-                        .nr_to_write = 0, /* sys_fsync did this */
-                };
-                err = sync_inode(inode, &wbc);
-                if (ret == 0)
-                        ret = err;
-        }
-out:
-        if (journal && (journal->j_flags & JBD2_BARRIER))
                blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
        return ret;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51addb..5352db1a3086 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -71,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 }
 /*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled.  Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
-        int err;
-        might_sleep();
-        BUFFER_TRACE(bh, "enter");
-        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-                  "data mode %x\n",
-                  bh, is_metadata, inode->i_mode,
-                  test_opt(inode->i_sb, DATA_FLAGS));
-        /* Never use the revoke function if we are doing full data
-         * journaling: there is no need to, and a V1 superblock won't
-         * support it.  Otherwise, only skip the revoke on un-journaled
-         * data blocks. */
-        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-            (!is_metadata && !ext4_should_journal_data(inode))) {
-                if (bh) {
-                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                        return ext4_journal_forget(handle, bh);
-                }
-                return 0;
-        }
-        /*
-         * data!=journal && (is_metadata || should_journal_data(inode))
-         */
-        BUFFER_TRACE(bh, "call ext4_journal_revoke");
-        err = ext4_journal_revoke(handle, blocknr, bh);
-        if (err)
-                ext4_abort(inode->i_sb, __func__,
-                           "error %d when attempting revoke", err);
-        BUFFER_TRACE(bh, "exit");
-        return err;
-}
-/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
@@ -721,7 +669,7 @@ allocated:
        return ret;
 failed_out:
        for (i = 0; i < index; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
        return ret;
 }
@@ -817,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
        return err;
 failed:
        /* Allocation failed, free what we already allocated */
+        ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
        for (i = 1; i <= n ; i++) {
-                BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
+                /* 
-                ext4_journal_forget(handle, branch[i].bh);
+                 * branch[i].bh is newly allocated, so there is no
+                 * need to revoke the block, which is why we don't
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
-        for (i = 0; i < indirect_blks; i++)
+        for (i = n+1; i < indirect_blks; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
-        ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+        ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
        return err;
 }
@@ -903,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 err_out:
        for (i = 1; i <= num; i++) {
-                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
+                /* 
-                ext4_journal_forget(handle, where[i].bh);
+                 * where[i].bh is newly allocated, so there is no
-                ext4_free_blocks(handle, inode,
+                 * need to revoke the block, which is why we don't
-                                        le32_to_cpu(where[i-1].key), 1, 0);
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
-        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+                         blks, 0);
        return err;
 }
@@ -1021,10 +979,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                         partial, indirect_blks, count);
-        else
+        if (err)
                goto cleanup;
        set_buffer_new(bh_result);
+        ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
@@ -1052,7 +1012,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
                EXT4_I(inode)->i_reserved_meta_blocks;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        return total;
+        return (total << inode->i_blkbits);
 }
 /*
 * Calculate the number of metadata blocks need to reserve
@@ -1534,6 +1494,16 @@ static int do_journal_get_write_access(handle_t *handle,
        return ext4_journal_get_write_access(handle, bh);
 }
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+        truncate_inode_pages(inode->i_mapping, inode->i_size);
+        ext4_truncate(inode);
+}
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
@@ -1599,7 +1569,7 @@ retry:
                ext4_journal_stop(handle);
                if (pos + len > inode->i_size) {
-                        ext4_truncate(inode);
+                        ext4_truncate_failed_write(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
@@ -1709,7 +1679,7 @@ static int ext4_ordered_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@ -1751,7 +1721,7 @@ static int ext4_writeback_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@ -1814,7 +1784,7 @@ static int ext4_journalled_write_end(struct file *file,
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@ -2600,7 +2570,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 }
 static int __ext4_journalled_writepage(struct page *page,
-                                       struct writeback_control *wbc,
                                       unsigned int len)
 {
        struct address_space *mapping = page->mapping;
@@ -2758,7 +2727,7 @@ static int ext4_writepage(struct page *page,
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-                return __ext4_journalled_writepage(page, wbc, len);
+                return __ext4_journalled_writepage(page, len);
        }
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2788,7 +2757,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
         * number of contiguous block. So we will limit
         * number of contiguous block to a sane value
         */
-        if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
            (max_blocks > EXT4_MAX_TRANS_DATA))
                max_blocks = EXT4_MAX_TRANS_DATA;
@@ -2933,7 +2902,7 @@ retry:
                ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
                                        &mpd);
                /*
-                 * If we have a contigous extent of pages and we
+                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
@@ -3091,7 +3060,7 @@ retry:
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
-                        ext4_truncate(inode);
+                        ext4_truncate_failed_write(inode);
        }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -4064,7 +4033,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
        int k, err;
        *top = 0;
-        /* Make k index the deepest non-null offest + 1 */
+        /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4120,6 +4089,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
                              __le32 *last)
 {
        __le32 *p;
+        int     flags = EXT4_FREE_BLOCKS_FORGET;
+        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+                flags |= EXT4_FREE_BLOCKS_METADATA;
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4134,27 +4108,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
                }
        }
-        /*
+        for (p = first; p < last; p++)
-         * Any buffers which are on the journal will be in memory. We
+                *p = 0;
-         * find them on the hash table so jbd2_journal_revoke() will
-         * run jbd2_journal_forget() on them.  We've already detached
-         * each block from the file, so bforget() in
-         * jbd2_journal_forget() should be safe.
-         *
-         * AKPM: turn on bforget in jbd2_journal_forget()!!!
-         */
-        for (p = first; p < last; p++) {
-                u32 nr = le32_to_cpu(*p);
-                if (nr) {
-                        struct buffer_head *tbh;
-                        *p = 0;
-                        tbh = sb_find_get_block(inode->i_sb, nr);
-                        ext4_forget(handle, 0, inode, tbh, nr);
-                }
-        }
-        ext4_free_blocks(handle, inode, block_to_free, count, 0);
+        ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
 }
 /**
@@ -4342,7 +4299,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                            blocks_for_truncate(inode));
                        }
-                        ext4_free_blocks(handle, inode, nr, 1, 1);
+                        ext4_free_blocks(handle, inode, 0, nr, 1,
+                                         EXT4_FREE_BLOCKS_METADATA);
                        if (parent_bh) {
                                /*
@@ -4781,8 +4739,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
-        struct buffer_head *bh;
        struct inode *inode;
+        journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        int block;
@@ -4793,11 +4751,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                return inode;
        ei = EXT4_I(inode);
+        iloc.bh = 0;
        ret = __ext4_get_inode_loc(inode, &iloc, 0);
        if (ret < 0)
                goto bad_inode;
-        bh = iloc.bh;
        raw_inode = ext4_raw_inode(&iloc);
        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4820,7 +4778,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                if (inode->i_mode == 0 ||
                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
                        /* this inode is deleted */
-                        brelse(bh);
                        ret = -ESTALE;
                        goto bad_inode;
                }
@@ -4848,11 +4805,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
+        /*
+         * Set transaction id's of transactions that have to be committed
+         * to finish f[data]sync. We set them to currently running transaction
+         * as we cannot be sure that the inode or some of its metadata isn't
+         * part of the transaction - the inode could have been reclaimed and
+         * now it is reread from disk.
+         */
+        if (journal) {
+                transaction_t *transaction;
+                tid_t tid;
+                spin_lock(&journal->j_state_lock);
+                if (journal->j_running_transaction)
+                        transaction = journal->j_running_transaction;
+                else
+                        transaction = journal->j_committing_transaction;
+                if (transaction)
+                        tid = transaction->t_tid;
+                else
+                        tid = journal->j_commit_sequence;
+                spin_unlock(&journal->j_state_lock);
+                ei->i_sync_tid = tid;
+                ei->i_datasync_tid = tid;
+        }
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                    EXT4_INODE_SIZE(inode->i_sb)) {
-                        brelse(bh);
                        ret = -EIO;
                        goto bad_inode;
                }
@@ -4884,10 +4865,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ret = 0;
        if (ei->i_file_acl &&
-            ((ei->i_file_acl <
+            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-              (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-               EXT4_SB(sb)->s_gdb_count)) ||
-             (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
                ext4_error(sb, __func__,
                           "bad extended attribute block %llu in inode #%lu",
                           ei->i_file_acl, inode->i_ino);
@@ -4905,10 +4883,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                /* Validate block references which are part of inode */
                ret = ext4_check_inode_blockref(inode);
        }
-        if (ret) {
+        if (ret)
-                brelse(bh);
                goto bad_inode;
-        }
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
@@ -4936,7 +4912,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else {
-                brelse(bh);
                ret = -EIO;
                ext4_error(inode->i_sb, __func__,
                           "bogus i_mode (%o) for inode=%lu",
@@ -4949,6 +4924,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        return inode;
 bad_inode:
+        brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
 }
@@ -5108,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
                err = rc;
        ei->i_state &= ~EXT4_STATE_NEW;
+        ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
        brelse(bh);
        ext4_std_error(inode->i_sb, err);
@@ -5227,8 +5204,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
-                handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
+                handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-                                        EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+                                        EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
@@ -5376,7 +5353,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 * worse case, the indexs blocks spread over different block groups
 *
 * If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
 * they could still across block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks
@@ -5452,7 +5429,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * journal buffers for data blocks are not included here, as DIO
 * and fallocate do no need to journal data buffers.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e725..b63d193126db 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -221,31 +221,38 @@ setversion_out:
                struct file *donor_filp;
                int err;
+                if (!(filp->f_mode & FMODE_READ) ||
+                    !(filp->f_mode & FMODE_WRITE))
+                        return -EBADF;
                if (copy_from_user(&me,
                        (struct move_extent __user *)arg, sizeof(me)))
                        return -EFAULT;
+                me.moved_len = 0;
                donor_filp = fget(me.donor_fd);
                if (!donor_filp)
                        return -EBADF;
-                if (!capable(CAP_DAC_OVERRIDE)) {
+                if (!(donor_filp->f_mode & FMODE_WRITE)) {
-                        if ((current->real_cred->fsuid != inode->i_uid) ||
+                        err = -EBADF;
-                                !(inode->i_mode & S_IRUSR) ||
+                        goto mext_out;
-                                !(donor_filp->f_dentry->d_inode->i_mode &
-                                S_IRUSR)) {
-                                fput(donor_filp);
-                                return -EACCES;
-                        }
                }
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
+                        goto mext_out;
                err = ext4_move_extents(filp, donor_filp, me.orig_start,
                                        me.donor_start, me.len, &me.moved_len);
-                fput(donor_filp);
+                mnt_drop_write(filp->f_path.mnt);
+                if (me.moved_len > 0)
+                        file_remove_suid(donor_filp);
                if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
-                        return -EFAULT;
+                        err = -EFAULT;
+mext_out:
+                fput(donor_filp);
                return err;
        }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824defa..c1e19d5b5985 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -142,7 +142,7 @@
 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
- * stripe size (sbi->s_stripe), we try to search for contigous block in
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
 * stripe size. This should result in better allocation on RAID setups. If
 * not, we search in the specific group using bitmap for best extents. The
 * tunable min_to_scan and max_to_scan control the behaviour here.
@@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        struct ext4_group_info *db;
        int err, count = 0, count2 = 0;
        struct ext4_free_data *entry;
-        ext4_fsblk_t discard_block;
        struct list_head *l, *ltmp;
        list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                        page_cache_release(e4b.bd_bitmap_page);
                }
                ext4_unlock_group(sb, entry->group);
-                discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+                if (test_opt(sb, DISCARD)) {
-                        + entry->start_blk
+                        ext4_fsblk_t discard_block;
-                        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+                        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-                trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
-                                          entry->count);
+                        discard_block = (ext4_fsblk_t)entry->group *
-                sb_issue_discard(sb, discard_block, entry->count);
+                                                EXT4_BLOCKS_PER_GROUP(sb)
+                                        + entry->start_blk
+                                        + le32_to_cpu(es->s_first_data_block);
+                        trace_ext4_discard_blocks(sb,
+                                        (unsigned long long)discard_block,
+                                        entry->count);
+                        sb_issue_discard(sb, discard_block, entry->count);
+                }
                kmem_cache_free(ext4_free_ext_cachep, entry);
                ext4_mb_release_desc(&e4b);
        }
@@ -3006,6 +3011,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 }
 /*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context.  We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+        struct ext4_prealloc_space *pa = ac->ac_pa;
+        int len;
+        if (pa && pa->pa_type == MB_INODE_PA) {
+                len = ac->ac_b_ex.fe_len;
+                pa->pa_free += len;
+        }
+}
+/*
 * use blocks preallocated to inode
 */
 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@ -4290,6 +4313,7 @@ repeat:
                        ac->ac_status = AC_STATUS_CONTINUE;
                        goto repeat;
                } else if (*errp) {
+                        ext4_discard_allocated_blocks(ac);
                        ac->ac_b_ex.fe_len = 0;
                        ar->len = 0;
                        ext4_mb_show_ac(ac);
@@ -4422,18 +4446,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        return 0;
 }
-/*
+/**
- * Main entry point into mballoc to free blocks
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle:             handle for this transaction
+ * @inode:              inode
+ * @block:              start physical block to free
+ * @count:              number of blocks to free
+ * @flags:              EXT4_FREE_BLOCKS_* flags (e.g. METADATA, FORGET)
 */
-void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t block, unsigned long count,
+                      struct buffer_head *bh, ext4_fsblk_t block,
-                        int metadata, unsigned long *freed)
+                      unsigned long count, int flags)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
+        unsigned long freed = 0;
        unsigned int overflow;
        ext4_grpblk_t bit;
        struct buffer_head *gd_bh;
@@ -4443,13 +4473,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        int err = 0;
        int ret;
-        *freed = 0;
+        if (bh) {
+                if (block)
+                        BUG_ON(block != bh->b_blocknr);
+                else
+                        block = bh->b_blocknr;
+        }
        sbi = EXT4_SB(sb);
        es = EXT4_SB(sb)->s_es;
-        if (block < le32_to_cpu(es->s_first_data_block) ||
+        if (!ext4_data_block_valid(sbi, block, count)) {
-            block + count < block ||
-            block + count > ext4_blocks_count(es)) {
                ext4_error(sb, __func__,
                            "Freeing blocks not in datazone - "
                            "block = %llu, count = %lu", block, count);
@@ -4457,7 +4490,32 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        }
        ext4_debug("freeing block %llu\n", block);
-        trace_ext4_free_blocks(inode, block, count, metadata);
+        trace_ext4_free_blocks(inode, block, count, flags);
+        if (flags & EXT4_FREE_BLOCKS_FORGET) {
+                struct buffer_head *tbh = bh;
+                int i;
+                BUG_ON(bh && (count > 1));
+                for (i = 0; i < count; i++) {
+                        if (!bh)
+                                tbh = sb_find_get_block(inode->i_sb,
+                                                        block + i);
+                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+                                    inode, tbh, block + i);
+                }
+        }
+        /* 
+         * We need to make sure we don't reuse the freed block until
+         * after the transaction is committed, which we can do by
+         * treating the block as metadata, below.  We make an
+         * exception if the inode is to be written in writeback mode
+         * since writeback mode has weak data consistency guarantees.
+         */
+        if (!ext4_should_writeback_data(inode))
+                flags |= EXT4_FREE_BLOCKS_METADATA;
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (ac) {
@@ -4533,7 +4591,8 @@ do_more:
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
                goto error_return;
-        if (metadata && ext4_handle_valid(handle)) {
+        if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
                struct ext4_free_data *new_entry;
                /*
                 * blocks being freed are metadata. these blocks shouldn't
@@ -4572,7 +4631,7 @@ do_more:
        ext4_mb_release_desc(&e4b);
-        *freed += count;
+        freed += count;
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4592,6 +4651,8 @@ do_more:
        }
        sb->s_dirt = 1;
 error_return:
+        if (freed)
+                vfs_dq_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
        if (ac)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b80f3e2..81415814b00b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
         * So allocate a credit of 3. We may update
         * quota (user and group).
         */
-        needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+        needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
        if (ext4_journal_extend(handle, needed) != 0)
                retval = ext4_journal_restart(handle, needed);
@@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle,
        for (i = 0; i < max_entries; i++) {
                if (tmp_idata[i]) {
                        extend_credit_for_blkdel(handle, inode);
-                        ext4_free_blocks(handle, inode,
+                        ext4_free_blocks(handle, inode, 0,
-                                        le32_to_cpu(tmp_idata[i]), 1, 1);
+                                         le32_to_cpu(tmp_idata[i]), 1,
+                                         EXT4_FREE_BLOCKS_METADATA |
+                                         EXT4_FREE_BLOCKS_FORGET);
                }
        }
        put_bh(bh);
        extend_credit_for_blkdel(handle, inode);
-        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+                         EXT4_FREE_BLOCKS_METADATA |
+                         EXT4_FREE_BLOCKS_FORGET);
        return 0;
 }
@@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle,
        }
        put_bh(bh);
        extend_credit_for_blkdel(handle, inode);
-        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+                         EXT4_FREE_BLOCKS_METADATA |
+                         EXT4_FREE_BLOCKS_FORGET);
        return 0;
 }
@@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
        /* ei->i_data[EXT4_IND_BLOCK] */
        if (i_data[0]) {
                extend_credit_for_blkdel(handle, inode);
-                ext4_free_blocks(handle, inode,
+                ext4_free_blocks(handle, inode, 0,
-                                le32_to_cpu(i_data[0]), 1, 1);
+                                le32_to_cpu(i_data[0]), 1,
+                                 EXT4_FREE_BLOCKS_METADATA |
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
        /* ei->i_data[EXT4_DIND_BLOCK] */
@@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
        }
        put_bh(bh);
        extend_credit_for_blkdel(handle, inode);
-        ext4_free_blocks(handle, inode, block, 1, 1);
+        ext4_free_blocks(handle, inode, 0, block, 1,
+                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
        return retval;
 }
@@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode)
        handle = ext4_journal_start(inode,
                                        EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
                                        + 1);
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 25b6b1457360..82c415be87a4 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                      struct ext4_extent **extent)
 {
+        struct ext4_extent_header *eh;
        int ppos, leaf_ppos = path->p_depth;
        ppos = leaf_ppos;
        if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
                /* leaf block */
                *extent = ++path[ppos].p_ext;
+                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
                return 0;
        }
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                                        ext_block_hdr(path[cur_ppos+1].p_bh);
                        }
+                        path[leaf_ppos].p_ext = *extent = NULL;
+                        eh = path[leaf_ppos].p_hdr;
+                        if (le16_to_cpu(eh->eh_entries) == 0)
+                                /* empty leaf is found */
+                                return -ENODATA;
                        /* leaf block */
                        path[leaf_ppos].p_ext = *extent =
                                EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+                        path[leaf_ppos].p_block =
+                                        ext_pblock(path[leaf_ppos].p_ext);
                        return 0;
                }
        }
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
 }
 /**
- * mext_double_down_read - Acquire two inodes' read semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
- *
- * @orig_inode:         original inode structure
- * @donor_inode:        donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-        struct inode *first = orig_inode, *second = donor_inode;
-        /*
-         * Use the inode number to provide the stable locking order instead
-         * of its address, because the C language doesn't guarantee you can
-         * compare pointers that don't come from the same array.
-         */
-        if (donor_inode->i_ino < orig_inode->i_ino) {
-                first = donor_inode;
-                second = orig_inode;
-        }
-        down_read(&EXT4_I(first)->i_data_sem);
-        down_read(&EXT4_I(second)->i_data_sem);
-}
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
 *
 * @orig_inode:         original inode structure
 * @donor_inode:        donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
 */
 static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
        struct inode *first = orig_inode, *second = donor_inode;
@@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
        }
        down_write(&EXT4_I(first)->i_data_sem);
-        down_write(&EXT4_I(second)->i_data_sem);
+        down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
 }
 /**
- * mext_double_up_read - Release two inodes' read semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
 *
 * @orig_inode:         original inode structure to be released its lock first
 * @donor_inode:        donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
 */
 static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
-{
-        up_read(&EXT4_I(orig_inode)->i_data_sem);
-        up_read(&EXT4_I(donor_inode)->i_data_sem);
-}
-/**
- * mext_double_up_write - Release two inodes' write semaphore
- *
- * @orig_inode:         original inode structure to be released its lock first
- * @donor_inode:        donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
 {
        up_write(&EXT4_I(orig_inode)->i_data_sem);
        up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -596,7 +568,7 @@ out:
 * @tmp_oext:           the extent that will belong to the donor inode
 * @orig_off:           block offset of original inode
 * @donor_off:          block offset of donor inode
- * @max_count:          the maximun length of extents
+ * @max_count:          the maximum length of extents
 *
 * Return 0 on success, or a negative error value on failure.
 */
@@ -661,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 * @donor_inode:        donor inode
 * @from:               block offset of orig_inode
 * @count:              block count to be replaced
+ * @err:                pointer to save return value
 *
 * Replace original inode extents and donor inode extents page by page.
 * We implement this replacement in the following three steps:
@@ -671,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 * 3. Change the block information of donor inode to point at the saved
 *    original inode blocks in the dummy extents.
 *
- * Return 0 on success, or a negative error value on failure.
+ * Return replaced block count.
 */
 static int
 mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                           struct inode *donor_inode, ext4_lblk_t from,
-                           ext4_lblk_t count)
+                           ext4_lblk_t count, int *err)
 {
        struct ext4_ext_path *orig_path = NULL;
        struct ext4_ext_path *donor_path = NULL;
        struct ext4_extent *oext, *dext;
        struct ext4_extent tmp_dext, tmp_oext;
        ext4_lblk_t orig_off = from, donor_off = from;
-        int err = 0;
        int depth;
        int replaced_count = 0;
        int dext_alen;
-        mext_double_down_write(orig_inode, donor_inode);
+        /* Protect extent trees against block allocations via delalloc */
+        double_down_write_data_sem(orig_inode, donor_inode);
        /* Get the original extent for the block "orig_off" */
-        err = get_ext_path(orig_inode, orig_off, &orig_path);
+        *err = get_ext_path(orig_inode, orig_off, &orig_path);
-        if (err)
+        if (*err)
                goto out;
        /* Get the donor extent for the head */
-        err = get_ext_path(donor_inode, donor_off, &donor_path);
+        *err = get_ext_path(donor_inode, donor_off, &donor_path);
-        if (err)
+        if (*err)
                goto out;
        depth = ext_depth(orig_inode);
        oext = orig_path[depth].p_ext;
@@ -707,9 +680,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
        dext = donor_path[depth].p_ext;
        tmp_dext = *dext;
-        err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+        *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
                                      donor_off, count);
-        if (err)
+        if (*err)
                goto out;
        /* Loop for the donor extents */
@@ -718,7 +691,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                if (!dext) {
                        ext4_error(donor_inode->i_sb, __func__,
                                   "The extent for donor must be found");
-                        err = -EIO;
+                        *err = -EIO;
                        goto out;
                } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
                        ext4_error(donor_inode->i_sb, __func__,
@@ -726,20 +699,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                                "extent(%u) should be equal",
                                donor_off,
                                le32_to_cpu(tmp_dext.ee_block));
-                        err = -EIO;
+                        *err = -EIO;
                        goto out;
                }
                /* Set donor extent to orig extent */
-                err = mext_leaf_block(handle, orig_inode,
+                *err = mext_leaf_block(handle, orig_inode,
                                           orig_path, &tmp_dext, &orig_off);
-                if (err < 0)
+                if (*err)
                        goto out;
                /* Set orig extent to donor extent */
-                err = mext_leaf_block(handle, donor_inode,
+                *err = mext_leaf_block(handle, donor_inode,
                                           donor_path, &tmp_oext, &donor_off);
-                if (err < 0)
+                if (*err)
                        goto out;
                dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +726,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                if (orig_path)
                        ext4_ext_drop_refs(orig_path);
-                err = get_ext_path(orig_inode, orig_off, &orig_path);
+                *err = get_ext_path(orig_inode, orig_off, &orig_path);
-                if (err)
+                if (*err)
                        goto out;
                depth = ext_depth(orig_inode);
                oext = orig_path[depth].p_ext;
-                if (le32_to_cpu(oext->ee_block) +
-                                ext4_ext_get_actual_len(oext) <= orig_off) {
-                        err = 0;
-                        goto out;
-                }
                tmp_oext = *oext;
                if (donor_path)
                        ext4_ext_drop_refs(donor_path);
-                err = get_ext_path(donor_inode, donor_off, &donor_path);
+                *err = get_ext_path(donor_inode, donor_off, &donor_path);
-                if (err)
+                if (*err)
                        goto out;
                depth = ext_depth(donor_inode);
                dext = donor_path[depth].p_ext;
-                if (le32_to_cpu(dext->ee_block) +
-                                ext4_ext_get_actual_len(dext) <= donor_off) {
-                        err = 0;
-                        goto out;
-                }
                tmp_dext = *dext;
-                err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+                *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
                                           donor_off, count - replaced_count);
-                if (err)
+                if (*err)
                        goto out;
        }
@@ -795,8 +758,12 @@ out:
                kfree(donor_path);
        }
-        mext_double_up_write(orig_inode, donor_inode);
+        ext4_ext_invalidate_cache(orig_inode);
-        return err;
+        ext4_ext_invalidate_cache(donor_inode);
+        double_up_write_data_sem(orig_inode, donor_inode);
+        return replaced_count;
 }
 /**
@@ -808,16 +775,17 @@ out:
 * @data_offset_in_page:        block index where data swapping starts
 * @block_len_in_page:          the number of blocks to be swapped
 * @uninit:                     orig extent is uninitialized or not
+ * @err:                        pointer to save return value
 *
 * Save the data in original inode blocks and replace original inode extents
 * with donor inode extents by calling mext_replace_branches().
- * Finally, write out the saved data in new original inode blocks. Return 0
+ * Finally, write out the saved data in new original inode blocks. Return
- * on success, or a negative error value on failure.
+ * replaced block count.
 */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
                  pgoff_t orig_page_offset, int data_offset_in_page,
-                  int block_len_in_page, int uninit)
+                  int block_len_in_page, int uninit, int *err)
 {
        struct inode *orig_inode = o_filp->f_dentry->d_inode;
        struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +797,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
        unsigned long blocksize = orig_inode->i_sb->s_blocksize;
        unsigned int w_flags = 0;
-        unsigned int tmp_data_len, data_len;
+        unsigned int tmp_data_size, data_size, replaced_size;
        void *fsdata;
-        int ret, i, jblocks;
+        int i, jblocks;
+        int err2 = 0;
+        int replaced_count = 0;
        int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
        /*
@@ -841,8 +811,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
        handle = ext4_journal_start(orig_inode, jblocks);
        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
+                *err = PTR_ERR(handle);
-                return ret;
+                return 0;
        }
        if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +828,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
         * Just swap data blocks between orig and donor.
         */
        if (uninit) {
-                ret = mext_replace_branches(handle, orig_inode,
+                replaced_count = mext_replace_branches(handle, orig_inode,
-                                                 donor_inode, orig_blk_offset,
+                                                donor_inode, orig_blk_offset,
-                                                 block_len_in_page);
+                                                block_len_in_page, err);
-                /* Clear the inode cache not to refer to the old data */
-                ext4_ext_invalidate_cache(orig_inode);
-                ext4_ext_invalidate_cache(donor_inode);
                goto out2;
        }
        offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-        /* Calculate data_len */
+        /* Calculate data_size */
        if ((orig_blk_offset + block_len_in_page - 1) ==
            ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
                /* Replace the last block */
-                tmp_data_len = orig_inode->i_size & (blocksize - 1);
+                tmp_data_size = orig_inode->i_size & (blocksize - 1);
                /*
-                 * If data_len equal zero, it shows data_len is multiples of
+                 * If data_size equal zero, it shows data_size is multiples of
                 * blocksize. So we set appropriate value.
                 */
-                if (tmp_data_len == 0)
+                if (tmp_data_size == 0)
-                        tmp_data_len = blocksize;
+                        tmp_data_size = blocksize;
-                data_len = tmp_data_len +
+                data_size = tmp_data_size +
                        ((block_len_in_page - 1) << orig_inode->i_blkbits);
-        } else {
+        } else
-                data_len = block_len_in_page << orig_inode->i_blkbits;
+                data_size = block_len_in_page << orig_inode->i_blkbits;
-        }
+        replaced_size = data_size;
-        ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+        *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
                                 &page, &fsdata);
-        if (unlikely(ret < 0))
+        if (unlikely(*err < 0))
                goto out;
        if (!PageUptodate(page)) {
@@ -911,14 +878,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        /* Release old bh and drop refs */
        try_to_release_page(page, 0);
-        ret = mext_replace_branches(handle, orig_inode, donor_inode,
+        replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-                                         orig_blk_offset, block_len_in_page);
+                                        orig_blk_offset, block_len_in_page,
-        if (ret < 0)
+                                        &err2);
-                goto out;
+        if (err2) {
+                if (replaced_count) {
-        /* Clear the inode cache not to refer to the old data */
+                        block_len_in_page = replaced_count;
-        ext4_ext_invalidate_cache(orig_inode);
+                        replaced_size =
-        ext4_ext_invalidate_cache(donor_inode);
+                                block_len_in_page << orig_inode->i_blkbits;
+                } else
+                        goto out;
+        }
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +898,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
                bh = bh->b_this_page;
        for (i = 0; i < block_len_in_page; i++) {
-                ret = ext4_get_block(orig_inode,
+                *err = ext4_get_block(orig_inode,
                                (sector_t)(orig_blk_offset + i), bh, 0);
-                if (ret < 0)
+                if (*err < 0)
                        goto out;
                if (bh->b_this_page != NULL)
                        bh = bh->b_this_page;
        }
-        ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+        *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
                               page, fsdata);
        page = NULL;
@@ -951,7 +921,10 @@ out:
 out2:
        ext4_journal_stop(handle);
-        return ret < 0 ? ret : 0;
+        if (err2)
+                *err = err2;
+        return replaced_count;
 }
 /**
@@ -962,7 +935,6 @@ out2:
 * @orig_start:         logical start offset in block for orig
 * @donor_start:        logical start offset in block for donor
 * @len:                the number of blocks to be moved
- * @moved_len:          moved block length
 *
 * Check the arguments of ext4_move_extents() whether the files can be
 * exchanged with each other.
@@ -970,8 +942,8 @@ out2:
 */
 static int
 mext_check_arguments(struct inode *orig_inode,
-                          struct inode *donor_inode, __u64 orig_start,
+                     struct inode *donor_inode, __u64 orig_start,
-                          __u64 donor_start, __u64 *len, __u64 moved_len)
+                     __u64 donor_start, __u64 *len)
 {
        ext4_lblk_t orig_blocks, donor_blocks;
        unsigned int blkbits = orig_inode->i_blkbits;
@@ -985,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
                return -EINVAL;
        }
+        if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+                ext4_debug("ext4 move extent: suid or sgid is set"
+                           " to donor file [ino:orig %lu, donor %lu]\n",
+                           orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
        /* Ext4 move extent does not support swapfile */
        if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
                ext4_debug("ext4 move extent: The argument files should "
@@ -1025,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
                return -EINVAL;
        }
-        if (moved_len) {
-                ext4_debug("ext4 move extent: moved_len should be 0 "
-                        "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
-                        donor_inode->i_ino);
-                return -EINVAL;
-        }
        if ((orig_start > EXT_MAX_BLOCK) ||
            (donor_start > EXT_MAX_BLOCK) ||
            (*len > EXT_MAX_BLOCK) ||
@@ -1088,7 +1060,7 @@ mext_check_arguments(struct inode *orig_inode,
        }
        if (!*len) {
-                ext4_debug("ext4 move extent: len shoudld not be 0 "
+                ext4_debug("ext4 move extent: len should not be 0 "
                        "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
                        donor_inode->i_ino);
                return -EINVAL;
@@ -1232,16 +1204,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                return -EINVAL;
        }
-        /* protect orig and donor against a truncate */
+        /* Protect orig and donor inodes against a truncate */
        ret1 = mext_inode_double_lock(orig_inode, donor_inode);
        if (ret1 < 0)
                return ret1;
-        mext_double_down_read(orig_inode, donor_inode);
+        /* Protect extent tree against block allocations via delalloc */
+        double_down_write_data_sem(orig_inode, donor_inode);
        /* Check the filesystem environment whether move_extent can be done */
        ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
-                                        donor_start, &len, *moved_len);
+                                    donor_start, &len);
-        mext_double_up_read(orig_inode, donor_inode);
        if (ret1)
                goto out;
@@ -1355,36 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                seq_start = le32_to_cpu(ext_cur->ee_block);
                rest_blocks = seq_blocks;
-                /* Discard preallocations of two inodes */
+                /*
-                down_write(&EXT4_I(orig_inode)->i_data_sem);
+                 * Up semaphore to avoid following problems:
-                ext4_discard_preallocations(orig_inode);
+                 * a. transaction deadlock among ext4_journal_start,
-                up_write(&EXT4_I(orig_inode)->i_data_sem);
+                 *    ->write_begin via pagefault, and jbd2_journal_commit
+                 * b. racing with ->readpage, ->write_begin, and ext4_get_block
-                down_write(&EXT4_I(donor_inode)->i_data_sem);
+                 *    in move_extent_per_page
-                ext4_discard_preallocations(donor_inode);
+                 */
-                up_write(&EXT4_I(donor_inode)->i_data_sem);
+                double_up_write_data_sem(orig_inode, donor_inode);
                while (orig_page_offset <= seq_end_page) {
                        /* Swap original branches with new branches */
-                        ret1 = move_extent_per_page(o_filp, donor_inode,
+                        block_len_in_page = move_extent_per_page(
+                                                o_filp, donor_inode,
                                                orig_page_offset,
                                                data_offset_in_page,
-                                                block_len_in_page, uninit);
+                                                block_len_in_page, uninit,
-                        if (ret1 < 0)
+                                                &ret1);
-                                goto out;
-                        orig_page_offset++;
                        /* Count how many blocks we have exchanged */
                        *moved_len += block_len_in_page;
+                        if (ret1 < 0)
+                                break;
                        if (*moved_len > len) {
                                ext4_error(orig_inode->i_sb, __func__,
                                        "We replaced blocks too much! "
                                        "sum of replaced: %llu requested: %llu",
                                        *moved_len, len);
                                ret1 = -EIO;
-                                goto out;
+                                break;
                        }
+                        orig_page_offset++;
                        data_offset_in_page = 0;
                        rest_blocks -= block_len_in_page;
                        if (rest_blocks > blocks_per_page)
@@ -1393,6 +1368,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                                block_len_in_page = rest_blocks;
                }
+                double_down_write_data_sem(orig_inode, donor_inode);
+                if (ret1 < 0)
+                        break;
                /* Decrease buffer counter */
                if (holecheck_path)
                        ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
        }
 out:
+        if (*moved_len) {
+                ext4_discard_preallocations(orig_inode);
+                ext4_discard_preallocations(donor_inode);
+        }
        if (orig_path) {
                ext4_ext_drop_refs(orig_path);
                kfree(orig_path);
@@ -1422,7 +1406,7 @@ out:
                ext4_ext_drop_refs(holecheck_path);
                kfree(holecheck_path);
        }
+        double_up_write_data_sem(orig_inode, donor_inode);
        ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
        if (ret1)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d2c1b897fc7..17a17e10dd60 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1292,9 +1292,6 @@ errout:
 * add_dirent_to_buf will attempt search the directory block for
 * space.  It will return -ENOSPC if no space is available, and -EIO
 * and -EEXIST if directory entry already exists.
- *
- * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
- * all other cases bh is released.
 */
 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                             struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
                        if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
-                                                  bh, offset)) {
+                                                  bh, offset))
-                                brelse(bh);
                                return -EIO;
-                        }
+                        if (ext4_match(namelen, name, de))
-                        if (ext4_match(namelen, name, de)) {
-                                brelse(bh);
                                return -EEXIST;
-                        }
                        nlen = EXT4_DIR_REC_LEN(de->name_len);
                        rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
                        if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        err = ext4_journal_get_write_access(handle, bh);
        if (err) {
                ext4_std_error(dir->i_sb, err);
-                brelse(bh);
                return err;
        }
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        err = ext4_handle_dirty_metadata(handle, dir, bh);
        if (err)
                ext4_std_error(dir->i_sb, err);
-        brelse(bh);
        return 0;
 }
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        if (!(de))
                return retval;
-        return add_dirent_to_buf(handle, dentry, inode, de, bh);
+        retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+        brelse(bh);
+        return retval;
 }
 /*
@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                if(!bh)
                        return retval;
                retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-                if (retval != -ENOSPC)
+                if (retval != -ENOSPC) {
+                        brelse(bh);
                        return retval;
+                }
                if (blocks == 1 && !dx_fallback &&
                    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
        de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
-        return add_dirent_to_buf(handle, dentry, inode, de, bh);
+        retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+        brelse(bh);
+        return retval;
 }
 /*
@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                goto journal_error;
        err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-        if (err != -ENOSPC) {
+        if (err != -ENOSPC)
-                bh = NULL;
                goto cleanup;
-        }
        /* Block full, should compress but for now just split */
        dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
        if (!de)
                goto cleanup;
        err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-        bh = NULL;
        goto cleanup;
 journal_error:
@@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -2259,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b5..3b2c5541d8a6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                        goto exit_bh;
                if (IS_ERR(gdb = bclean(handle, sb, block))) {
-                        err = PTR_ERR(bh);
+                        err = PTR_ERR(gdb);
                        goto exit_bh;
                }
                ext4_handle_dirty_metadata(handle, NULL, gdb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4ca92aab514..8b58a144c31b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb)
        if (sb->s_dirt)
                ext4_commit_super(sb, 1);
-        ext4_release_system_zone(sb);
-        ext4_mb_release(sb);
-        ext4_ext_release(sb);
-        ext4_xattr_put_super(sb);
        if (sbi->s_journal) {
                err = jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
@@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb)
                        ext4_abort(sb, __func__,
                                   "Couldn't clean up the journal");
        }
+        ext4_release_system_zone(sb);
+        ext4_mb_release(sb);
+        ext4_ext_release(sb);
+        ext4_xattr_put_super(sb);
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -704,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        spin_lock_init(&(ei->i_block_reservation_lock));
        INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
        ei->cur_aio_dio = NULL;
+        ei->i_sync_tid = 0;
+        ei->i_datasync_tid = 0;
        return &ei->vfs_inode;
 }
@@ -899,6 +903,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, NO_AUTO_DA_ALLOC))
                seq_puts(seq, ",noauto_da_alloc");
+        if (test_opt(sb, DISCARD))
+                seq_puts(seq, ",discard");
+        if (test_opt(sb, NOLOAD))
+                seq_puts(seq, ",norecovery");
        ext4_show_quota_options(seq, sb);
        return 0;
@@ -1079,7 +1089,8 @@ enum {
        Opt_usrquota, Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_block_validity, Opt_noblock_validity,
-        Opt_inode_readahead_blks, Opt_journal_ioprio
+        Opt_inode_readahead_blks, Opt_journal_ioprio,
+        Opt_discard, Opt_nodiscard,
 };
 static const match_table_t tokens = {
@@ -1104,6 +1115,7 @@ static const match_table_t tokens = {
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
        {Opt_noload, "noload"},
+        {Opt_noload, "norecovery"},
        {Opt_nobh, "nobh"},
        {Opt_bh, "bh"},
        {Opt_commit, "commit=%u"},
@@ -1144,6 +1156,8 @@ static const match_table_t tokens = {
        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc"},
        {Opt_noauto_da_alloc, "noauto_da_alloc"},
+        {Opt_discard, "discard"},
+        {Opt_nodiscard, "nodiscard"},
        {Opt_err, NULL},
 };
@@ -1565,6 +1579,12 @@ set_qf_format:
                        else
                                set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
                        break;
+                case Opt_discard:
+                        set_opt(sbi->s_mount_opt, DISCARD);
+                        break;
+                case Opt_nodiscard:
+                        clear_opt(sbi->s_mount_opt, DISCARD);
+                        break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@ -1673,14 +1693,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
        size_t size;
        int i;
-        if (!sbi->s_es->s_log_groups_per_flex) {
+        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+        groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+        if (groups_per_flex < 2) {
                sbi->s_log_groups_per_flex = 0;
                return 1;
        }
-        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
-        groups_per_flex = 1 << sbi->s_log_groups_per_flex;
        /* We allocate both existing and potentially added groups */
        flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
                        ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -2721,26 +2741,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
            EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
                if (ext4_load_journal(sb, es, journal_devnum))
                        goto failed_mount3;
-                if (!(sb->s_flags & MS_RDONLY) &&
-                    EXT4_SB(sb)->s_journal->j_failed_commit) {
-                        ext4_msg(sb, KERN_CRIT, "error: "
-                               "ext4_fill_super: Journal transaction "
-                               "%u is corrupt",
-                               EXT4_SB(sb)->s_journal->j_failed_commit);
-                        if (test_opt(sb, ERRORS_RO)) {
-                                ext4_msg(sb, KERN_CRIT,
-                                       "Mounting filesystem read-only");
-                                sb->s_flags |= MS_RDONLY;
-                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                        }
-                        if (test_opt(sb, ERRORS_PANIC)) {
-                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                                ext4_commit_super(sb, 1);
-                                goto failed_mount4;
-                        }
-                }
        } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
              EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -3668,13 +3668,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
        buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
                       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
-        ext4_free_blocks_count_set(es, buf->f_bfree);
        buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
        if (buf->f_bfree < ext4_r_blocks_count(es))
                buf->f_bavail = 0;
        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
-        es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
        buf->f_namelen = EXT4_NAME_LEN;
        fsid = le64_to_cpup((void *)es->s_uuid) ^
               le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3966,6 +3964,58 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
+/*
+ * Let ext4 serve "ext2" mounts when no native ext2 driver (built-in or
+ * module) is configured and CONFIG_EXT4_USE_FOR_EXT23 is enabled.
+ * Otherwise the helpers below compile to empty stubs.
+ */
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "ext2",
+        .get_sb         = ext4_get_sb,
+        .kill_sb        = kill_block_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+/* Register the "ext2" alias; a failure is only logged, not propagated. */
+static inline void register_as_ext2(void)
+{
+        int err = register_filesystem(&ext2_fs_type);
+        if (err)
+                printk(KERN_WARNING
+                       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
+}
+/* Drop the "ext2" alias registered by register_as_ext2(). */
+static inline void unregister_as_ext2(void)
+{
+        unregister_filesystem(&ext2_fs_type);
+}
+#else
+static inline void register_as_ext2(void) { }
+static inline void unregister_as_ext2(void) { }
+#endif
+/*
+ * Let ext4 serve "ext3" mounts when no native ext3 driver (built-in or
+ * module) is configured and CONFIG_EXT4_USE_FOR_EXT23 is enabled.
+ * Otherwise the helpers below compile to empty stubs.
+ */
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "ext3",
+        .get_sb         = ext4_get_sb,
+        .kill_sb        = kill_block_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+/* Register the "ext3" alias; a failure is only logged, not propagated. */
+static inline void register_as_ext3(void)
+{
+        int err = register_filesystem(&ext3_fs_type);
+        if (err)
+                printk(KERN_WARNING
+                       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
+}
+/* Drop the "ext3" alias registered by register_as_ext3(). */
+static inline void unregister_as_ext3(void)
+{
+        unregister_filesystem(&ext3_fs_type);
+}
+#else
+static inline void register_as_ext3(void) { }
+static inline void unregister_as_ext3(void) { }
+#endif
 static struct file_system_type ext4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext4",
@@ -3995,11 +4045,15 @@ static int __init init_ext4_fs(void)
        err = init_inodecache();
        if (err)
                goto out1;
+        register_as_ext2();
+        register_as_ext3();
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;
        return 0;
 out:
+        unregister_as_ext2();
+        unregister_as_ext3();
        destroy_inodecache();
 out1:
        exit_ext4_xattr();
@@ -4015,6 +4069,8 @@ out4:
 static void __exit exit_ext4_fs(void)
 {
+        unregister_as_ext2();
+        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
        exit_ext4_xattr();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8d..910bf9a59cb3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                ea_bdebug(bh, "refcount now=0; freeing");
                if (ce)
                        mb_cache_entry_free(ce);
-                ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
                get_bh(bh);
-                ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+                ext4_free_blocks(handle, inode, bh, 0, 1,
+                                 EXT4_FREE_BLOCKS_METADATA |
+                                 EXT4_FREE_BLOCKS_FORGET);
        } else {
                le32_add_cpu(&BHDR(bh)->h_refcount, -1);
                error = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -832,7 +833,8 @@ inserted:
                        new_bh = sb_getblk(sb, block);
                        if (!new_bh) {
 getblk_failed:
-                                ext4_free_blocks(handle, inode, block, 1, 1);
+                                ext4_free_blocks(handle, inode, 0, block, 1,
+                                                 EXT4_FREE_BLOCKS_METADATA);
                                error = -EIO;
                                goto cleanup;
                        }
@@ -988,6 +990,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
        if (error)
                goto cleanup;
+        error = ext4_journal_get_write_access(handle, is.iloc.bh);
+        if (error)
+                goto cleanup;
        if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
                struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -1013,9 +1019,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                if (flags & XATTR_CREATE)
                        goto cleanup;
        }
-        error = ext4_journal_get_write_access(handle, is.iloc.bh);
-        if (error)
-                goto cleanup;
        if (!value) {
                if (!is.s.not_found)
                        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/file_table.c b/fs/file_table.c
index 8eb44042e009..4bef4c01ec6f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/eventpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
@@ -280,7 +279,6 @@ void __fput(struct file *file)
        if (file->f_op && file->f_op->release)
                file->f_op->release(inode, file);
        security_file_free(file);
-        ima_file_free(file);
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
                cdev_put(inode->i_cdev);
        fops_put(file->f_op);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2af..49bc1b8e8f19 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                                struct writeback_control *wbc)
 {
        struct super_block *sb = wbc->sb, *pin_sb = NULL;
-        const int is_blkdev_sb = sb_is_blkdev_sb(sb);
        const unsigned long start = jiffies;    /* livelock avoidance */
        spin_lock(&inode_lock);
@@ -635,36 +634,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                        continue;
                }
-                if (!bdi_cap_writeback_dirty(wb->bdi)) {
-                        redirty_tail(inode);
-                        if (is_blkdev_sb) {
-                                /*
-                                 * Dirty memory-backed blockdev: the ramdisk
-                                 * driver does this.  Skip just this inode
-                                 */
-                                continue;
-                        }
-                        /*
-                         * Dirty memory-backed inode against a filesystem other
-                         * than the kernel-internal bdev filesystem.  Skip the
-                         * entire superblock.
-                         */
-                        break;
-                }
                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
-                if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
-                        wbc->encountered_congestion = 1;
-                        if (!is_blkdev_sb)
-                                break;          /* Skip a congested fs */
-                        requeue_io(inode);
-                        continue;               /* Skip a congested blockdev */
-                }
                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
@@ -756,6 +730,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                .sync_mode              = args->sync_mode,
                .older_than_this        = NULL,
                .for_kupdate            = args->for_kupdate,
+                .for_background         = args->for_background,
                .range_cyclic           = args->range_cyclic,
        };
        unsigned long oldest_jif;
@@ -787,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb,
                        break;
                wbc.more_io = 0;
-                wbc.encountered_congestion = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                wbc.pages_skipped = 0;
                writeback_inodes_wb(wb, &wbc);
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 9bbb8ce7bea0..864dac20a242 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -54,3 +54,10 @@ config FSCACHE_DEBUG
          enabled by setting bits in /sys/modules/fscache/parameter/debug.
          See Documentation/filesystems/caching/fscache.txt for more information.
+config FSCACHE_OBJECT_LIST
+        bool "Maintain global object list for debugging purposes"
+        depends on FSCACHE && PROC_FS
+        help
+          Maintain a global list of active fscache objects that can be
+          retrieved through /proc/fs/fscache/objects for debugging purposes.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 91571b95aacc..6d561531cb36 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -15,5 +15,6 @@ fscache-y := \
 fscache-$(CONFIG_PROC_FS) += proc.o
 fscache-$(CONFIG_FSCACHE_STATS) += stats.o
 fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
 obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index e21985bbb1fb..6a3c48abd677 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -263,6 +263,7 @@ int fscache_add_cache(struct fscache_cache *cache,
        spin_lock(&cache->object_list_lock);
        list_add_tail(&ifsdef->cache_link, &cache->object_list);
        spin_unlock(&cache->object_list_lock);
+        fscache_objlist_add(ifsdef);
        /* add the cache's netfs definition index object to the top level index
         * cookie as a known backing object */
@@ -380,11 +381,15 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
        /* make sure all pages pinned by operations on behalf of the netfs are
         * written to disk */
+        fscache_stat(&fscache_n_cop_sync_cache);
        cache->ops->sync_cache(cache);
+        fscache_stat_d(&fscache_n_cop_sync_cache);
        /* dissociate all the netfs pages backed by this cache from the block
         * mappings in the cache */
+        fscache_stat(&fscache_n_cop_dissociate_pages);
        cache->ops->dissociate_pages(cache);
+        fscache_stat_d(&fscache_n_cop_dissociate_pages);
        /* we now have to destroy all the active objects pertaining to this
         * cache - which we do by passing them off to thread pool to be
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 72fd18f6c71f..990535071a8a 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -36,6 +36,7 @@ void fscache_cookie_init_once(void *_cookie)
        memset(cookie, 0, sizeof(*cookie));
        spin_lock_init(&cookie->lock);
+        spin_lock_init(&cookie->stores_lock);
        INIT_HLIST_HEAD(&cookie->backing_objects);
 }
@@ -102,7 +103,9 @@ struct fscache_cookie *__fscache_acquire_cookie(
        cookie->netfs_data      = netfs_data;
        cookie->flags           = 0;
-        INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
+        /* radix tree insertion won't use the preallocation pool unless it's
+         * told it may not wait */
+        INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
        switch (cookie->def->type) {
        case FSCACHE_COOKIE_TYPE_INDEX:
@@ -249,7 +252,9 @@ static int fscache_alloc_object(struct fscache_cache *cache,
        /* ask the cache to allocate an object (we may end up with duplicate
         * objects at this stage, but we sort that out later) */
+        fscache_stat(&fscache_n_cop_alloc_object);
        object = cache->ops->alloc_object(cache, cookie);
+        fscache_stat_d(&fscache_n_cop_alloc_object);
        if (IS_ERR(object)) {
                fscache_stat(&fscache_n_object_no_alloc);
                ret = PTR_ERR(object);
@@ -270,8 +275,11 @@ static int fscache_alloc_object(struct fscache_cache *cache,
        /* only attach if we managed to allocate all we needed, otherwise
         * discard the object we just allocated and instead use the one
         * attached to the cookie */
-        if (fscache_attach_object(cookie, object) < 0)
+        if (fscache_attach_object(cookie, object) < 0) {
+                fscache_stat(&fscache_n_cop_put_object);
                cache->ops->put_object(object);
+                fscache_stat_d(&fscache_n_cop_put_object);
+        }
        _leave(" = 0");
        return 0;
@@ -287,7 +295,9 @@ object_already_extant:
        return 0;
 error_put:
+        fscache_stat(&fscache_n_cop_put_object);
        cache->ops->put_object(object);
+        fscache_stat_d(&fscache_n_cop_put_object);
 error:
        _leave(" = %d", ret);
        return ret;
@@ -349,6 +359,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
        object->cookie = cookie;
        atomic_inc(&cookie->usage);
        hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+        fscache_objlist_add(object);
        ret = 0;
 cant_attach_object:
@@ -403,6 +415,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
        unsigned long event;
        fscache_stat(&fscache_n_relinquishes);
+        if (retire)
+                fscache_stat(&fscache_n_relinquishes_retire);
        if (!cookie) {
                fscache_stat(&fscache_n_relinquishes_null);
@@ -428,12 +442,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
        event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
-        /* detach pointers back to the netfs */
        spin_lock(&cookie->lock);
-        cookie->netfs_data      = NULL;
-        cookie->def             = NULL;
        /* break links with all the active objects */
        while (!hlist_empty(&cookie->backing_objects)) {
                object = hlist_entry(cookie->backing_objects.first,
@@ -456,6 +466,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
                        BUG();
        }
+        /* detach pointers back to the netfs */
+        cookie->netfs_data      = NULL;
+        cookie->def             = NULL;
        spin_unlock(&cookie->lock);
        if (cookie->parent) {
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 1c341304621f..edd7434ab6e5 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -17,6 +17,7 @@
 * - cache->object_list_lock
 * - object->lock
 * - object->parent->lock
+ * - cookie->stores_lock
 * - fscache_thread_lock
 *
 */
@@ -88,17 +89,31 @@ extern int fscache_wait_bit_interruptible(void *);
 /*
 * object.c
 */
+extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
 extern void fscache_withdrawing_object(struct fscache_cache *,
                                       struct fscache_object *);
 extern void fscache_enqueue_object(struct fscache_object *);
 /*
+ * object-list.c
+ */
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+extern const struct file_operations fscache_objlist_fops;
+extern void fscache_objlist_add(struct fscache_object *);
+#else
+#define fscache_objlist_add(object) do {} while(0)
+#endif
+/*
 * operation.c
 */
 extern int fscache_submit_exclusive_op(struct fscache_object *,
                                       struct fscache_operation *);
 extern int fscache_submit_op(struct fscache_object *,
                             struct fscache_operation *);
+extern int fscache_cancel_op(struct fscache_operation *);
 extern void fscache_abort_object(struct fscache_object *);
 extern void fscache_start_operations(struct fscache_object *);
 extern void fscache_operation_gc(struct work_struct *);
@@ -127,6 +142,8 @@ extern atomic_t fscache_n_op_enqueue;
 extern atomic_t fscache_n_op_deferred_release;
 extern atomic_t fscache_n_op_release;
 extern atomic_t fscache_n_op_gc;
+extern atomic_t fscache_n_op_cancelled;
+extern atomic_t fscache_n_op_rejected;
 extern atomic_t fscache_n_attr_changed;
 extern atomic_t fscache_n_attr_changed_ok;
@@ -138,6 +155,8 @@ extern atomic_t fscache_n_allocs;
 extern atomic_t fscache_n_allocs_ok;
 extern atomic_t fscache_n_allocs_wait;
 extern atomic_t fscache_n_allocs_nobufs;
+extern atomic_t fscache_n_allocs_intr;
+extern atomic_t fscache_n_allocs_object_dead;
 extern atomic_t fscache_n_alloc_ops;
 extern atomic_t fscache_n_alloc_op_waits;
@@ -148,6 +167,7 @@ extern atomic_t fscache_n_retrievals_nodata;
 extern atomic_t fscache_n_retrievals_nobufs;
 extern atomic_t fscache_n_retrievals_intr;
 extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrievals_object_dead;
 extern atomic_t fscache_n_retrieval_ops;
 extern atomic_t fscache_n_retrieval_op_waits;
@@ -158,6 +178,14 @@ extern atomic_t fscache_n_stores_nobufs;
 extern atomic_t fscache_n_stores_oom;
 extern atomic_t fscache_n_store_ops;
 extern atomic_t fscache_n_store_calls;
+extern atomic_t fscache_n_store_pages;
+extern atomic_t fscache_n_store_radix_deletes;
+extern atomic_t fscache_n_store_pages_over_limit;
+extern atomic_t fscache_n_store_vmscan_not_storing;
+extern atomic_t fscache_n_store_vmscan_gone;
+extern atomic_t fscache_n_store_vmscan_busy;
+extern atomic_t fscache_n_store_vmscan_cancelled;
 extern atomic_t fscache_n_marks;
 extern atomic_t fscache_n_uncaches;
@@ -176,6 +204,7 @@ extern atomic_t fscache_n_updates_run;
 extern atomic_t fscache_n_relinquishes;
 extern atomic_t fscache_n_relinquishes_null;
 extern atomic_t fscache_n_relinquishes_waitcrt;
+extern atomic_t fscache_n_relinquishes_retire;
 extern atomic_t fscache_n_cookie_index;
 extern atomic_t fscache_n_cookie_data;
@@ -186,6 +215,7 @@ extern atomic_t fscache_n_object_no_alloc;
 extern atomic_t fscache_n_object_lookups;
 extern atomic_t fscache_n_object_lookups_negative;
 extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_lookups_timed_out;
 extern atomic_t fscache_n_object_created;
 extern atomic_t fscache_n_object_avail;
 extern atomic_t fscache_n_object_dead;
@@ -195,15 +225,41 @@ extern atomic_t fscache_n_checkaux_okay;
 extern atomic_t fscache_n_checkaux_update;
 extern atomic_t fscache_n_checkaux_obsolete;
+extern atomic_t fscache_n_cop_alloc_object;
+extern atomic_t fscache_n_cop_lookup_object;
+extern atomic_t fscache_n_cop_lookup_complete;
+extern atomic_t fscache_n_cop_grab_object;
+extern atomic_t fscache_n_cop_update_object;
+extern atomic_t fscache_n_cop_drop_object;
+extern atomic_t fscache_n_cop_put_object;
+extern atomic_t fscache_n_cop_sync_cache;
+extern atomic_t fscache_n_cop_attr_changed;
+extern atomic_t fscache_n_cop_read_or_alloc_page;
+extern atomic_t fscache_n_cop_read_or_alloc_pages;
+extern atomic_t fscache_n_cop_allocate_page;
+extern atomic_t fscache_n_cop_allocate_pages;
+extern atomic_t fscache_n_cop_write_page;
+extern atomic_t fscache_n_cop_uncache_page;
+extern atomic_t fscache_n_cop_dissociate_pages;
 static inline void fscache_stat(atomic_t *stat)
 {
        atomic_inc(stat);
 }
+static inline void fscache_stat_d(atomic_t *stat)
+{
+        atomic_dec(stat);
+}
+#define __fscache_stat(stat) (stat)
 extern const struct file_operations fscache_stats_fops;
 #else
+#define __fscache_stat(stat) (NULL)
 #define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
 #endif
 /*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4de41b597499..add6bdb53f04 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -48,7 +48,7 @@ static int __init fscache_init(void)
 {
        int ret;
-        ret = slow_work_register_user();
+        ret = slow_work_register_user(THIS_MODULE);
        if (ret < 0)
                goto error_slow_work;
@@ -80,7 +80,7 @@ error_kobj:
 error_cookie_jar:
        fscache_proc_cleanup();
 error_proc:
-        slow_work_unregister_user();
+        slow_work_unregister_user(THIS_MODULE);
 error_slow_work:
        return ret;
 }
@@ -97,7 +97,7 @@ static void __exit fscache_exit(void)
        kobject_put(fscache_root);
        kmem_cache_destroy(fscache_cookie_jar);
        fscache_proc_cleanup();
-        slow_work_unregister_user();
+        slow_work_unregister_user(THIS_MODULE);
        printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
new file mode 100644
index 000000000000..e590242fa41a
--- /dev/null
+++ b/fs/fscache/object-list.c
@@ -0,0 +1,432 @@
+/* Global fscache object list maintainer and viewer
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/key.h>
+#include <keys/user-type.h>
+#include "internal.h"
+static struct rb_root fscache_object_list;
+static DEFINE_RWLOCK(fscache_object_list_lock);
+struct fscache_objlist_data {
+        unsigned long   config;         /* display configuration */
+#define FSCACHE_OBJLIST_CONFIG_KEY      0x00000001      /* show object keys */
+#define FSCACHE_OBJLIST_CONFIG_AUX      0x00000002      /* show object auxdata */
+#define FSCACHE_OBJLIST_CONFIG_COOKIE   0x00000004      /* show objects with cookies */
+#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008      /* show objects without cookies */
+#define FSCACHE_OBJLIST_CONFIG_BUSY     0x00000010      /* show busy objects */
+#define FSCACHE_OBJLIST_CONFIG_IDLE     0x00000020      /* show idle objects */
+#define FSCACHE_OBJLIST_CONFIG_PENDWR   0x00000040      /* show objects with pending writes */
+#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080      /* show objects without pending writes */
+#define FSCACHE_OBJLIST_CONFIG_READS    0x00000100      /* show objects with active reads */
+#define FSCACHE_OBJLIST_CONFIG_NOREADS  0x00000200      /* show objects without active reads */
+#define FSCACHE_OBJLIST_CONFIG_EVENTS   0x00000400      /* show objects with events */
+#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800      /* show objects without events */
+#define FSCACHE_OBJLIST_CONFIG_WORK     0x00001000      /* show objects with slow work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK   0x00002000      /* show objects without slow work */
+        u8              buf[512];       /* scratch buffer: key bytes followed by aux data */
+};
+/*
+ * Insert an object into the global object tree
+ * - the tree is keyed on the address of the fscache_object structure
+ * - finding the object already in the tree is treated as a bug
+ */
+void fscache_objlist_add(struct fscache_object *obj)
+{
+        struct rb_node **link = &fscache_object_list.rb_node;
+        struct rb_node *above = NULL;
+        struct fscache_object *cur;
+        write_lock(&fscache_object_list_lock);
+        /* walk down to the empty slot where the new node belongs */
+        while (*link != NULL) {
+                above = *link;
+                cur = rb_entry(above, struct fscache_object, objlist_link);
+                if (obj == cur)
+                        BUG();
+                if (obj < cur)
+                        link = &above->rb_left;
+                else
+                        link = &above->rb_right;
+        }
+        rb_link_node(&obj->objlist_link, above, link);
+        rb_insert_color(&obj->objlist_link, &fscache_object_list);
+        write_unlock(&fscache_object_list_lock);
+}
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @obj: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ * The object is unlinked from the global object tree under the list write
+ * lock; calling this while the tree is empty is treated as a bug.
+ */
+void fscache_object_destroy(struct fscache_object *obj)
+{
+        write_lock(&fscache_object_list_lock);
+        BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
+        rb_erase(&obj->objlist_link, &fscache_object_list);
+        write_unlock(&fscache_object_list_lock);
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+/*
+ * find the object in the tree on or after the specified index
+ * - positions 1 and 2 stand for the two banner lines and are returned as
+ *   small integers cast to pointers; position 0 is bumped to 1 first
+ * - any other position is treated as an object address, and the object with
+ *   the lowest address at or above it is returned
+ * - *_pos is moved on to the address of the object found, or set to
+ *   ERR_PTR(-ENOENT) if there is none, in which case NULL is returned
+ * - a position already at or beyond ERR_PTR(-ENOENT) yields NULL at once
+ */
+static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
+{
+        struct fscache_object *pobj, *obj, *minobj = NULL;
+        struct rb_node *p;
+        unsigned long pos;
+        if (*_pos >= (unsigned long) ERR_PTR(-ENOENT))
+                return NULL;
+        pos = *_pos;
+        /* banners (can't represent line 0 by pos 0 as that would involve
+         * returning a NULL pointer) */
+        if (pos == 0)
+                return (struct fscache_object *) ++(*_pos);
+        if (pos < 3)
+                return (struct fscache_object *)pos;
+        pobj = (struct fscache_object *)pos;
+        p = fscache_object_list.rb_node;
+        while (p) {
+                obj = rb_entry(p, struct fscache_object, objlist_link);
+                if (pobj < obj) {
+                        if (!minobj || minobj > obj)
+                                minobj = obj;
+                        p = p->rb_left;
+                } else if (pobj > obj) {
+                        p = p->rb_right;
+                } else {
+                        minobj = obj;
+                        break;
+                }
+                obj = NULL;     /* no exact match at this node */
+        }
+        if (!minobj)
+                *_pos = (unsigned long) ERR_PTR(-ENOENT);
+        else if (minobj != obj)
+                *_pos = (unsigned long) minobj;
+        return minobj;
+}
+/*
+ * set up the iterator to start reading from the first line
+ * - takes the object list read lock, which fscache_objlist_stop() drops
+ * - returns whatever the lookup finds at or after *_pos
+ */
+static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos)
+        __acquires(&fscache_object_list_lock)
+{
+        read_lock(&fscache_object_list_lock);
+        return fscache_objlist_lookup(_pos);
+}
+/*
+ * move to the next line
+ * - the position is stepped on by one and the lookup then rounds it up to
+ *   the next object address present in the tree
+ */
+static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+        (*_pos)++;
+        return fscache_objlist_lookup(_pos);
+}
+/*
+ * clean up after reading
+ * - releases the object list read lock
+ */
+static void fscache_objlist_stop(struct seq_file *m, void *v)
+        __releases(&fscache_object_list_lock)
+{
+        read_unlock(&fscache_object_list_lock);
+}
+/*
+ * display one line of the list
+ * - v == 1 and v == 2 are the two banner lines; anything else is a pointer
+ *   to the object to be displayed
+ * - objects rejected by the filter configuration emit nothing
+ * - the key and auxiliary data are fetched into the per-open buffer under
+ *   the object lock and hex-dumped after the lock has been dropped
+ * - always returns 0
+ */
+static int fscache_objlist_show(struct seq_file *m, void *v)
+{
+        struct fscache_objlist_data *data = m->private;
+        struct fscache_object *obj = v;
+        unsigned long config = data->config;
+        uint16_t keylen, auxlen;
+        char _type[3], *type;
+        bool no_cookie;
+        u8 *buf = data->buf, *p;
+        if ((unsigned long) v == 1) {
+                seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
+                         " EM EV F S"
+                         " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
+                if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+                              FSCACHE_OBJLIST_CONFIG_AUX))
+                        seq_puts(m, "       ");
+                if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+                        seq_puts(m, "OBJECT_KEY");
+                if ((config & (FSCACHE_OBJLIST_CONFIG_KEY |
+                               FSCACHE_OBJLIST_CONFIG_AUX)) ==
+                    (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX))
+                        seq_puts(m, ", ");
+                if (config & FSCACHE_OBJLIST_CONFIG_AUX)
+                        seq_puts(m, "AUX_DATA");
+                seq_puts(m, "\n");
+                return 0;
+        }
+        if ((unsigned long) v == 2) {
+                seq_puts(m, "======== ======== ==== ===== === === === == ====="
+                         " == == = ="
+                         " | ================ == == ================");
+                if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+                              FSCACHE_OBJLIST_CONFIG_AUX))
+                        seq_puts(m, " ================");
+                seq_puts(m, "\n");
+                return 0;
+        }
+        /* filter out any unwanted objects */
+#define FILTER(criterion, _yes, _no)                                    \
+        do {                                                            \
+                unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes;      \
+                unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no;        \
+                if (criterion) {                                        \
+                        if (!(config & yes))                            \
+                                return 0;                               \
+                } else {                                                \
+                        if (!(config & no))                             \
+                                return 0;                               \
+                }                                                       \
+        } while(0)
+        /* an all-ones config means "show everything", so skip the tests */
+        if (~config) {
+                FILTER(obj->cookie,
+                       COOKIE, NOCOOKIE);
+                FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+                       obj->n_ops != 0 ||
+                       obj->n_obj_ops != 0 ||
+                       obj->flags ||
+                       !list_empty(&obj->dependents),
+                       BUSY, IDLE);
+                FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags),
+                       PENDWR, NOPENDWR);
+                FILTER(atomic_read(&obj->n_reads),
+                       READS, NOREADS);
+                FILTER(obj->events & obj->event_mask,
+                       EVENTS, NOEVENTS);
+                FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
+                       WORK, NOWORK);
+        }
+        seq_printf(m,
+                   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+                   obj->debug_id,
+                   obj->parent ? obj->parent->debug_id : -1,
+                   fscache_object_states_short[obj->state],
+                   obj->n_children,
+                   obj->n_ops,
+                   obj->n_obj_ops,
+                   obj->n_in_progress,
+                   obj->n_exclusive,
+                   atomic_read(&obj->n_reads),
+                   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
+                   obj->events,
+                   obj->flags,
+                   obj->work.flags);
+        no_cookie = true;
+        keylen = auxlen = 0;
+        if (obj->cookie) {
+                spin_lock(&obj->lock);
+                /* recheck now that the object lock is held */
+                if (obj->cookie) {
+                        switch (obj->cookie->def->type) {
+                        case 0:
+                                type = "IX";
+                                break;
+                        case 1:
+                                type = "DT";
+                                break;
+                        default:
+                                /* bound the write: a type of 100 or more
+                                 * needs three digits plus the NUL and would
+                                 * otherwise overrun _type[] */
+                                snprintf(_type, sizeof(_type), "%02u",
+                                         obj->cookie->def->type);
+                                type = _type;
+                                break;
+                        }
+                        seq_printf(m, "%-16s %s %2lx %16p",
+                                   obj->cookie->def->name,
+                                   type,
+                                   obj->cookie->flags,
+                                   obj->cookie->netfs_data);
+                        if (obj->cookie->def->get_key &&
+                            config & FSCACHE_OBJLIST_CONFIG_KEY)
+                                keylen = obj->cookie->def->get_key(
+                                        obj->cookie->netfs_data,
+                                        buf, 400);
+                        /* aux data goes straight after the key bytes */
+                        if (obj->cookie->def->get_aux &&
+                            config & FSCACHE_OBJLIST_CONFIG_AUX)
+                                auxlen = obj->cookie->def->get_aux(
+                                        obj->cookie->netfs_data,
+                                        buf + keylen, 512 - keylen);
+                        no_cookie = false;
+                }
+                spin_unlock(&obj->lock);
+                if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+                        seq_printf(m, " ");
+                        for (p = buf; keylen > 0; keylen--)
+                                seq_printf(m, "%02x", *p++);
+                        if (auxlen > 0) {
+                                if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+                                        seq_printf(m, ", ");
+                                for (; auxlen > 0; auxlen--)
+                                        seq_printf(m, "%02x", *p++);
+                        }
+                }
+        }
+        if (no_cookie)
+                seq_printf(m, "<no_cookie>\n");
+        else
+                seq_printf(m, "\n");
+        return 0;
+}
+static const struct seq_operations fscache_objlist_ops = {
+        .start          = fscache_objlist_start,        /* takes the list read lock */
+        .stop           = fscache_objlist_stop,         /* drops the list read lock */
+        .next           = fscache_objlist_next,
+        .show           = fscache_objlist_show,
+};
+/*
+ * get the configuration for filtering the list
+ * - looks for a user-defined key called "fscache:objlist" and turns each
+ *   recognised letter in its payload into a display or filter flag;
+ *   unrecognised letters are ignored
+ * - a filter pair for which neither letter was given is enabled both ways so
+ *   that it does not exclude anything
+ * - if the key cannot be obtained, or CONFIG_KEYS is off, every flag is set
+ */
+static void fscache_objlist_config(struct fscache_objlist_data *data)
+{
+#ifdef CONFIG_KEYS
+        struct user_key_payload *confkey;
+        unsigned long config;
+        struct key *key;
+        const char *buf;
+        int len;
+        key = request_key(&key_type_user, "fscache:objlist", NULL);
+        if (IS_ERR(key))
+                goto no_config;
+        config = 0;
+        rcu_read_lock();
+        /* the payload pointer is only pinned by the RCU read lock here, so
+         * it has to be fetched with rcu_dereference() */
+        confkey = rcu_dereference(key->payload.data);
+        buf = confkey->data;
+        for (len = confkey->datalen - 1; len >= 0; len--) {
+                switch (buf[len]) {
+                case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY;         break;
+                case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX;         break;
+                case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE;      break;
+                case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE;    break;
+                case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY;        break;
+                case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE;        break;
+                case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR;      break;
+                case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR;    break;
+                case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS;       break;
+                case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS;     break;
+                case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK;        break;
+                case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK;      break;
+                }
+        }
+        rcu_read_unlock();
+        key_put(key);
+        /* NOTE(review): no letter selects EVENTS/NOEVENTS above, so that
+         * pair always ends up enabled both ways below - confirm intended */
+        if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE)))
+            config   |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE;
+        if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
+            config   |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
+        if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR)))
+            config   |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR;
+        if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS)))
+            config   |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS;
+        if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS)))
+            config   |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS;
+        if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK)))
+            config   |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK;
+        data->config = config;
+        return;
+no_config:
+#endif
+        data->config = ULONG_MAX;
+}
+/*
+ * open "/proc/fs/fscache/objects" to provide a list of active objects
+ * - the listing can be configured by a user-defined key added to the
+ *   caller's keyrings
+ * - the per-open state holds that configuration plus the key/aux buffer
+ */
+static int fscache_objlist_open(struct inode *inode, struct file *file)
+{
+        struct fscache_objlist_data *cfg;
+        struct seq_file *seq;
+        int err;
+        err = seq_open(file, &fscache_objlist_ops);
+        if (err < 0)
+                return err;
+        /* buffer for key extraction */
+        cfg = kmalloc(sizeof(*cfg), GFP_KERNEL);
+        if (cfg == NULL) {
+                /* undo the seq_open() before reporting the failure */
+                seq_release(inode, file);
+                return -ENOMEM;
+        }
+        /* get the configuration key */
+        fscache_objlist_config(cfg);
+        seq = file->private_data;
+        seq->private = cfg;
+        return 0;
+}
+/*
+ * close "/proc/fs/fscache/objects"
+ * - discards the per-open configuration before tearing down the seq_file
+ */
+static int fscache_objlist_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq = file->private_data;
+        void *cfg = seq->private;
+        seq->private = NULL;
+        kfree(cfg);
+        return seq_release(inode, file);
+}
+const struct file_operations fscache_objlist_fops = {
+        .owner          = THIS_MODULE,
+        .open           = fscache_objlist_open,         /* allocates the per-open config and buffer */
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = fscache_objlist_release,      /* frees what open allocated */
+};
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 392a41b1b79d..e513ac599c8e 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,9 +14,10 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
+#include <linux/seq_file.h>
 #include "internal.h"
-const char *fscache_object_states[] = {
+const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
        [FSCACHE_OBJECT_INIT]           = "OBJECT_INIT",
        [FSCACHE_OBJECT_LOOKING_UP]     = "OBJECT_LOOKING_UP",
        [FSCACHE_OBJECT_CREATING]       = "OBJECT_CREATING",
@@ -33,9 +34,28 @@ const char *fscache_object_states[] = {
 };
 EXPORT_SYMBOL(fscache_object_states);
+const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
+        [FSCACHE_OBJECT_INIT]           = "INIT",
+        [FSCACHE_OBJECT_LOOKING_UP]     = "LOOK",
+        [FSCACHE_OBJECT_CREATING]       = "CRTN",
+        [FSCACHE_OBJECT_AVAILABLE]      = "AVBL",
+        [FSCACHE_OBJECT_ACTIVE]         = "ACTV",
+        [FSCACHE_OBJECT_UPDATING]       = "UPDT",
+        [FSCACHE_OBJECT_DYING]          = "DYNG",
+        [FSCACHE_OBJECT_LC_DYING]       = "LCDY",
+        [FSCACHE_OBJECT_ABORT_INIT]     = "ABTI",
+        [FSCACHE_OBJECT_RELEASING]      = "RELS",
+        [FSCACHE_OBJECT_RECYCLING]      = "RCYC",
+        [FSCACHE_OBJECT_WITHDRAWING]    = "WTHD",
+        [FSCACHE_OBJECT_DEAD]           = "DEAD",
+};
 static void fscache_object_slow_work_put_ref(struct slow_work *);
 static int  fscache_object_slow_work_get_ref(struct slow_work *);
 static void fscache_object_slow_work_execute(struct slow_work *);
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
+#endif
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -45,9 +65,13 @@ static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 const struct slow_work_ops fscache_object_slow_work_ops = {
+        .owner          = THIS_MODULE,
        .get_ref        = fscache_object_slow_work_get_ref,
        .put_ref        = fscache_object_slow_work_put_ref,
        .execute        = fscache_object_slow_work_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+        .desc           = fscache_object_slow_work_desc,
+#endif
 };
 EXPORT_SYMBOL(fscache_object_slow_work_ops);
@@ -81,6 +105,7 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
 static void fscache_object_state_machine(struct fscache_object *object)
 {
        enum fscache_object_state new_state;
+        struct fscache_cookie *cookie;
        ASSERT(object != NULL);
@@ -120,20 +145,31 @@ static void fscache_object_state_machine(struct fscache_object *object)
        case FSCACHE_OBJECT_UPDATING:
                clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
                fscache_stat(&fscache_n_updates_run);
+                fscache_stat(&fscache_n_cop_update_object);
                object->cache->ops->update_object(object);
+                fscache_stat_d(&fscache_n_cop_update_object);
                goto active_transit;
                /* handle an object dying during lookup or creation */
        case FSCACHE_OBJECT_LC_DYING:
                object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+                fscache_stat(&fscache_n_cop_lookup_complete);
                object->cache->ops->lookup_complete(object);
+                fscache_stat_d(&fscache_n_cop_lookup_complete);
                spin_lock(&object->lock);
                object->state = FSCACHE_OBJECT_DYING;
-                if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+                cookie = object->cookie;
-                                       &object->cookie->flags))
+                if (cookie) {
-                        wake_up_bit(&object->cookie->flags,
+                        if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
-                                    FSCACHE_COOKIE_CREATING);
+                                               &cookie->flags))
+                                wake_up_bit(&cookie->flags,
+                                            FSCACHE_COOKIE_LOOKING_UP);
+                        if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+                                               &cookie->flags))
+                                wake_up_bit(&cookie->flags,
+                                            FSCACHE_COOKIE_CREATING);
+                }
                spin_unlock(&object->lock);
                fscache_done_parent_op(object);
@@ -165,6 +201,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
                }
                spin_unlock(&object->lock);
                fscache_enqueue_dependents(object);
+                fscache_start_operations(object);
                goto terminal_transit;
                /* handle an abort during initialisation */
@@ -316,14 +353,29 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
        _enter("{OBJ%x}", object->debug_id);
-        clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
        start = jiffies;
        fscache_object_state_machine(object);
        fscache_hist(fscache_objs_histogram, start);
        if (object->events & object->event_mask)
                fscache_enqueue_object(object);
+        clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+}
+/*
+ * describe an object for slow-work debugging
+ * - work is the slow_work item embedded in a struct fscache_object
+ * - emits "FSC: OBJ<debug_id in hex>: <state tag>" to the seq_file, taking
+ *   the tag from fscache_object_states_short[] indexed by the object's state
+ * - only compiled when CONFIG_SLOW_WORK_PROC is defined
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *work,
+                                          struct seq_file *m)
+{
+        struct fscache_object *object =
+                container_of(work, struct fscache_object, work);
+        seq_printf(m, "FSC: OBJ%x: %s",
+                   object->debug_id,
+                   fscache_object_states_short[object->state]);
 }
+#endif
 /*
 * initialise an object
@@ -376,7 +428,9 @@ static void fscache_initialise_object(struct fscache_object *object)
                         * binding on to us, so we need to make sure we don't
                         * add ourself to the list multiple times */
                        if (list_empty(&object->dep_link)) {
+                                fscache_stat(&fscache_n_cop_grab_object);
                                object->cache->ops->grab_object(object);
+                                fscache_stat_d(&fscache_n_cop_grab_object);
                                list_add(&object->dep_link,
                                         &parent->dependents);
@@ -414,6 +468,7 @@ static void fscache_lookup_object(struct fscache_object *object)
 {
        struct fscache_cookie *cookie = object->cookie;
        struct fscache_object *parent;
+        int ret;
        _enter("");
@@ -438,11 +493,20 @@ static void fscache_lookup_object(struct fscache_object *object)
               object->cache->tag->name);
        fscache_stat(&fscache_n_object_lookups);
-        object->cache->ops->lookup_object(object);
+        fscache_stat(&fscache_n_cop_lookup_object);
+        ret = object->cache->ops->lookup_object(object);
+        fscache_stat_d(&fscache_n_cop_lookup_object);
        if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
                set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+        if (ret == -ETIMEDOUT) {
+                /* probably stuck behind another object, so move this one to
+                 * the back of the queue */
+                fscache_stat(&fscache_n_object_lookups_timed_out);
+                set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+        }
        _leave("");
 }
@@ -546,7 +610,8 @@ static void fscache_object_available(struct fscache_object *object)
        spin_lock(&object->lock);
-        if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
+        if (object->cookie &&
+            test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
                wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
        fscache_done_parent_op(object);
@@ -562,7 +627,9 @@ static void fscache_object_available(struct fscache_object *object)
        }
        spin_unlock(&object->lock);
+        fscache_stat(&fscache_n_cop_lookup_complete);
        object->cache->ops->lookup_complete(object);
+        fscache_stat_d(&fscache_n_cop_lookup_complete);
        fscache_enqueue_dependents(object);
        fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
@@ -581,11 +648,16 @@ static void fscache_drop_object(struct fscache_object *object)
        _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+        ASSERTCMP(object->cookie, ==, NULL);
+        ASSERT(hlist_unhashed(&object->cookie_link));
        spin_lock(&cache->object_list_lock);
        list_del_init(&object->cache_link);
        spin_unlock(&cache->object_list_lock);
+        fscache_stat(&fscache_n_cop_drop_object);
        cache->ops->drop_object(object);
+        fscache_stat_d(&fscache_n_cop_drop_object);
        if (parent) {
                _debug("release parent OBJ%x {%d}",
@@ -600,7 +672,9 @@ static void fscache_drop_object(struct fscache_object *object)
        }
        /* this just shifts the object release to the slow work processor */
+        fscache_stat(&fscache_n_cop_put_object);
        object->cache->ops->put_object(object);
+        fscache_stat_d(&fscache_n_cop_put_object);
        _leave("");
 }
@@ -690,8 +764,12 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
 {
        struct fscache_object *object =
                container_of(work, struct fscache_object, work);
+        int ret;
-        return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+        fscache_stat(&fscache_n_cop_grab_object);
+        ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+        fscache_stat_d(&fscache_n_cop_grab_object);
+        return ret;
 }
 /*
@@ -702,7 +780,9 @@ static void fscache_object_slow_work_put_ref(struct slow_work *work)
        struct fscache_object *object =
                container_of(work, struct fscache_object, work);
-        return object->cache->ops->put_object(object);
+        fscache_stat(&fscache_n_cop_put_object);
+        object->cache->ops->put_object(object);
+        fscache_stat_d(&fscache_n_cop_put_object);
 }
 /*
@@ -739,7 +819,9 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
                /* sort onto appropriate lists */
                fscache_enqueue_object(dep);
+                fscache_stat(&fscache_n_cop_put_object);
                dep->cache->ops->put_object(dep);
+                fscache_stat_d(&fscache_n_cop_put_object);
                if (!list_empty(&object->dependents))
                        cond_resched_lock(&object->lock);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7f8d53b8b6b..313e79a14266 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -13,6 +13,7 @@
 #define FSCACHE_DEBUG_LEVEL OPERATION
 #include <linux/module.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 atomic_t fscache_op_debug_id;
@@ -31,32 +32,33 @@ void fscache_enqueue_operation(struct fscache_operation *op)
        _enter("{OBJ%x OP%x,%u}",
               op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+        fscache_set_op_state(op, "EnQ");
+        ASSERT(list_empty(&op->pend_link));
        ASSERT(op->processor != NULL);
        ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
        ASSERTCMP(atomic_read(&op->usage), >, 0);
-        if (list_empty(&op->pend_link)) {
+        fscache_stat(&fscache_n_op_enqueue);
-                switch (op->flags & FSCACHE_OP_TYPE) {
+        switch (op->flags & FSCACHE_OP_TYPE) {
-                case FSCACHE_OP_FAST:
+        case FSCACHE_OP_FAST:
-                        _debug("queue fast");
+                _debug("queue fast");
-                        atomic_inc(&op->usage);
+                atomic_inc(&op->usage);
-                        if (!schedule_work(&op->fast_work))
+                if (!schedule_work(&op->fast_work))
-                                fscache_put_operation(op);
+                        fscache_put_operation(op);
-                        break;
+                break;
-                case FSCACHE_OP_SLOW:
+        case FSCACHE_OP_SLOW:
-                        _debug("queue slow");
+                _debug("queue slow");
-                        slow_work_enqueue(&op->slow_work);
+                slow_work_enqueue(&op->slow_work);
-                        break;
+                break;
-                case FSCACHE_OP_MYTHREAD:
+        case FSCACHE_OP_MYTHREAD:
-                        _debug("queue for caller's attention");
+                _debug("queue for caller's attention");
-                        break;
+                break;
-                default:
+        default:
-                        printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
+                printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
-                               op->flags);
+                       op->flags);
-                        BUG();
+                BUG();
-                        break;
+                break;
-                }
-                fscache_stat(&fscache_n_op_enqueue);
        }
 }
 EXPORT_SYMBOL(fscache_enqueue_operation);
@@ -67,6 +69,8 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
 static void fscache_run_op(struct fscache_object *object,
                           struct fscache_operation *op)
 {
+        fscache_set_op_state(op, "Run");
        object->n_in_progress++;
        if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
                wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -87,9 +91,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
        _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+        fscache_set_op_state(op, "SubmitX");
        spin_lock(&object->lock);
        ASSERTCMP(object->n_ops, >=, object->n_in_progress);
        ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+        ASSERT(list_empty(&op->pend_link));
        ret = -ENOBUFS;
        if (fscache_object_is_active(object)) {
@@ -190,9 +197,12 @@ int fscache_submit_op(struct fscache_object *object,
        ASSERTCMP(atomic_read(&op->usage), >, 0);
+        fscache_set_op_state(op, "Submit");
        spin_lock(&object->lock);
        ASSERTCMP(object->n_ops, >=, object->n_in_progress);
        ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+        ASSERT(list_empty(&op->pend_link));
        ostate = object->state;
        smp_rmb();
@@ -222,6 +232,11 @@ int fscache_submit_op(struct fscache_object *object,
                list_add_tail(&op->pend_link, &object->pending_ops);
                fscache_stat(&fscache_n_op_pend);
                ret = 0;
+        } else if (object->state == FSCACHE_OBJECT_DYING ||
+                   object->state == FSCACHE_OBJECT_LC_DYING ||
+                   object->state == FSCACHE_OBJECT_WITHDRAWING) {
+                fscache_stat(&fscache_n_op_rejected);
+                ret = -ENOBUFS;
        } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
                fscache_report_unexpected_submission(object, op, ostate);
                ASSERT(!fscache_object_is_active(object));
@@ -264,12 +279,7 @@ void fscache_start_operations(struct fscache_object *object)
                        stop = true;
                }
                list_del_init(&op->pend_link);
-                object->n_in_progress++;
+                fscache_run_op(object, op);
-                if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
-                        wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
-                if (op->processor)
-                        fscache_enqueue_operation(op);
                /* the pending queue was holding a ref on the object */
                fscache_put_operation(op);
@@ -282,6 +292,36 @@ void fscache_start_operations(struct fscache_object *object)
 }
 /*
+ * cancel an operation that's pending on an object
+ * - returns 0 if the op was still on the object's pending list and has been
+ *   taken off it, or -EBUSY if it was no longer pending
+ * - on success the object's n_ops count (and n_exclusive, for an exclusive
+ *   op) is decremented, any waiter on FSCACHE_OP_WAITING is woken and one ref
+ *   on the op is dropped (presumably the one the pending queue was holding)
+ * - all of this is done under object->lock
+ */
+int fscache_cancel_op(struct fscache_operation *op)
+{
+        struct fscache_object *object = op->object;
+        int ret;
+        _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
+        spin_lock(&object->lock);
+        ret = -EBUSY;
+        if (!list_empty(&op->pend_link)) {
+                fscache_stat(&fscache_n_op_cancelled);
+                list_del_init(&op->pend_link);
+                object->n_ops--;
+                if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+                        object->n_exclusive--;
+                if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+                        wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+                fscache_put_operation(op);
+                ret = 0;
+        }
+        spin_unlock(&object->lock);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
 * release an operation
 * - queues pending ops if this is the last in-progress op
 */
@@ -298,6 +338,8 @@ void fscache_put_operation(struct fscache_operation *op)
        if (!atomic_dec_and_test(&op->usage))
                return;
+        fscache_set_op_state(op, "Put");
        _debug("PUT OP");
        if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
                BUG();
@@ -311,6 +353,9 @@ void fscache_put_operation(struct fscache_operation *op)
        object = op->object;
+        if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+                atomic_dec(&object->n_reads);
        /* now... we may get called with the object spinlock held, so we
         * complete the cleanup here only if we can immediately acquire the
         * lock, and defer it otherwise */
@@ -452,8 +497,27 @@ static void fscache_op_execute(struct slow_work *work)
        _leave("");
 }
+/*
+ * describe an operation for slow-work debugging
+ * - work is the slow_work item embedded in a struct fscache_operation
+ * - emits the owning object's debug ID, the op's debug ID, the op's name and
+ *   state strings and its flags word (in hex) to the seq_file
+ * - only compiled when CONFIG_SLOW_WORK_PROC is defined
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
+{
+        struct fscache_operation *op =
+                container_of(work, struct fscache_operation, slow_work);
+        seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
+                   op->object->debug_id, op->debug_id,
+                   op->name, op->state, op->flags);
+}
+#endif
 const struct slow_work_ops fscache_op_slow_work_ops = { /* slow-work callbacks for FS-Cache operations */
+        .owner          = THIS_MODULE,
         .get_ref        = fscache_op_get_ref,
         .put_ref        = fscache_op_put_ref,
         .execute        = fscache_op_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+        .desc           = fscache_op_desc, /* only set when CONFIG_SLOW_WORK_PROC is defined */
+#endif
 };
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 2568e0eb644f..c598ea4c4e7d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -43,18 +43,102 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
 EXPORT_SYMBOL(__fscache_wait_on_page_write);
 /*
- * note that a page has finished being written to the cache
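+ * decide whether a page can be released, possibly by cancelling a store to it
+ * - the caller permits sleeping if __GFP_WAIT is flagged, but we never wait
+ *   for a page that is actually being stored; we return false for it instead
+ * - returns true if the page is no longer tracked for storage and has been
+ *   uncached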
 */
-static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
+bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
+                                  struct page *page,
+                                  gfp_t gfp)
 {
        struct page *xpage;
+        void *val;
+        _enter("%p,%p,%x", cookie, page, gfp);
+        rcu_read_lock();
+        val = radix_tree_lookup(&cookie->stores, page->index);
+        if (!val) {
+                rcu_read_unlock();
+                fscache_stat(&fscache_n_store_vmscan_not_storing);
+                __fscache_uncache_page(cookie, page);
+                return true;
+        }
+        /* see if the page is actually undergoing storage - if so we can't get
+         * rid of it till the cache has finished with it */
+        if (radix_tree_tag_get(&cookie->stores, page->index,
+                               FSCACHE_COOKIE_STORING_TAG)) {
+                rcu_read_unlock();
+                goto page_busy;
+        }
+        /* the page is pending storage, so we attempt to cancel the store and
+         * discard the store request so that the page can be reclaimed */
+        spin_lock(&cookie->stores_lock);
+        rcu_read_unlock();
+        if (radix_tree_tag_get(&cookie->stores, page->index,
+                               FSCACHE_COOKIE_STORING_TAG)) {
+                /* the page started to undergo storage whilst we were looking,
+                 * so now we can only wait or return */
+                spin_unlock(&cookie->stores_lock);
+                goto page_busy;
+        }
-        spin_lock(&cookie->lock);
        xpage = radix_tree_delete(&cookie->stores, page->index);
-        spin_unlock(&cookie->lock);
+        spin_unlock(&cookie->stores_lock);
-        ASSERT(xpage != NULL);
+        if (xpage) {
+                fscache_stat(&fscache_n_store_vmscan_cancelled);
+                fscache_stat(&fscache_n_store_radix_deletes);
+                ASSERTCMP(xpage, ==, page);
+        } else {
+                fscache_stat(&fscache_n_store_vmscan_gone);
+        }
        wake_up_bit(&cookie->flags, 0);
+        if (xpage)
+                page_cache_release(xpage);
+        __fscache_uncache_page(cookie, page);
+        return true;
+page_busy:
+        /* we might want to wait here, but that could deadlock the allocator as
+         * the slow-work threads writing to the cache may all end up sleeping
+         * on memory allocation */
+        fscache_stat(&fscache_n_store_vmscan_busy);
+        return false;
+}
+EXPORT_SYMBOL(__fscache_maybe_release_page);
+/*
+ * note that a page has finished being written to the cache
+ * - clears the STORING tag for the page's index in cookie->stores and, if the
+ *   PENDING tag is not set for it either, deletes the entry from the tree
+ * - wakes anyone waiting on bit 0 of cookie->flags
+ * - calls page_cache_release() on a deleted entry after the locks are dropped
+ * - does nothing if the object no longer has a cookie attached
+ */
+static void fscache_end_page_write(struct fscache_object *object,
+                                   struct page *page)
+{
+        struct fscache_cookie *cookie;
+        struct page *xpage = NULL;
+        spin_lock(&object->lock); /* object->cookie is sampled under this lock */
+        cookie = object->cookie;
+        if (cookie) {
+                /* delete the page from the tree if it is now no longer
+                 * pending */
+                spin_lock(&cookie->stores_lock);
+                radix_tree_tag_clear(&cookie->stores, page->index,
+                                     FSCACHE_COOKIE_STORING_TAG);
+                if (!radix_tree_tag_get(&cookie->stores, page->index,
+                                        FSCACHE_COOKIE_PENDING_TAG)) {
+                        fscache_stat(&fscache_n_store_radix_deletes);
+                        xpage = radix_tree_delete(&cookie->stores, page->index);
+                }
+                spin_unlock(&cookie->stores_lock);
+                wake_up_bit(&cookie->flags, 0);
+        }
+        spin_unlock(&object->lock);
+        if (xpage) /* xpage is only set when the tree entry was deleted above */
+                page_cache_release(xpage);
 }
 /*
@@ -63,14 +147,21 @@ static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *p
 static void fscache_attr_changed_op(struct fscache_operation *op)
 {
        struct fscache_object *object = op->object;
+        int ret;
        _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
        fscache_stat(&fscache_n_attr_changed_calls);
-        if (fscache_object_is_active(object) &&
+        if (fscache_object_is_active(object)) {
-            object->cache->ops->attr_changed(object) < 0)
+                fscache_set_op_state(op, "CallFS");
-                fscache_abort_object(object);
+                fscache_stat(&fscache_n_cop_attr_changed);
+                ret = object->cache->ops->attr_changed(object);
+                fscache_stat_d(&fscache_n_cop_attr_changed);
+                fscache_set_op_state(op, "Done");
+                if (ret < 0)
+                        fscache_abort_object(object);
+        }
        _leave("");
 }
@@ -99,6 +190,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
        fscache_operation_init(op, NULL);
        fscache_operation_init_slow(op, fscache_attr_changed_op);
        op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+        fscache_set_op_name(op, "Attr");
        spin_lock(&cookie->lock);
@@ -184,6 +276,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
        op->start_time  = jiffies;
        INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
        INIT_LIST_HEAD(&op->to_do);
+        fscache_set_op_name(&op->op, "Retr");
        return op;
 }
@@ -221,6 +314,43 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
 }
 /*
+ * wait for an object to become active (or dead)
+ * - if the op is flagged FSCACHE_OP_WAITING, bumps *stat_op_waits and sleeps
+ *   interruptibly until the flag is cleared
+ * - if that sleep returns an error, tries to cancel the op: returns
+ *   -ERESTARTSYS if the cancel succeeds, otherwise waits uninterruptibly for
+ *   the flag to clear
+ * - finally returns -ENOBUFS (bumping *stat_object_dead) if the object is
+ *   dead, or 0 if the caller may go ahead
+ */
+static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
+                                                 struct fscache_retrieval *op,
+                                                 atomic_t *stat_op_waits,
+                                                 atomic_t *stat_object_dead)
+{
+        int ret;
+        if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags))
+                goto check_if_dead;
+        _debug(">>> WT");
+        fscache_stat(stat_op_waits);
+        if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                        fscache_wait_bit_interruptible,
+                        TASK_INTERRUPTIBLE) < 0) {
+                ret = fscache_cancel_op(&op->op); /* 0 means the op was still pending and is now cancelled */
+                if (ret == 0)
+                        return -ERESTARTSYS;
+                /* it's been removed from the pending queue by another party,
+                 * so we should get to run shortly */
+                wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                            fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+        }
+        _debug("<<< GO");
+check_if_dead:
+        if (unlikely(fscache_object_is_dead(object))) {
+                fscache_stat(stat_object_dead);
+                return -ENOBUFS;
+        }
+        return 0;
+}
+/*
 * read a page from the cache or allocate a block in which to store it
 * - we return:
 *   -ENOMEM    - out of memory, nothing done
@@ -257,6 +387,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
                _leave(" = -ENOMEM");
                return -ENOMEM;
        }
+        fscache_set_op_name(&op->op, "RetrRA1");
        spin_lock(&cookie->lock);
@@ -267,6 +398,9 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
        ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+        atomic_inc(&object->n_reads);
+        set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
        if (fscache_submit_op(object, &op->op) < 0)
                goto nobufs_unlock;
        spin_unlock(&cookie->lock);
@@ -279,23 +413,27 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
        /* we wait for the operation to become active, and then process it
         * *here*, in this thread, and not in the thread pool */
-        if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+        ret = fscache_wait_for_retrieval_activation(
-                _debug(">>> WT");
+                object, op,
-                fscache_stat(&fscache_n_retrieval_op_waits);
+                __fscache_stat(&fscache_n_retrieval_op_waits),
-                wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                __fscache_stat(&fscache_n_retrievals_object_dead));
-                            fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+        if (ret < 0)
-                _debug("<<< GO");
+                goto error;
-        }
        /* ask the cache to honour the operation */
        if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+                fscache_stat(&fscache_n_cop_allocate_page);
                ret = object->cache->ops->allocate_page(op, page, gfp);
+                fscache_stat_d(&fscache_n_cop_allocate_page);
                if (ret == 0)
                        ret = -ENODATA;
        } else {
+                fscache_stat(&fscache_n_cop_read_or_alloc_page);
                ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
+                fscache_stat_d(&fscache_n_cop_read_or_alloc_page);
        }
+error:
        if (ret == -ENOMEM)
                fscache_stat(&fscache_n_retrievals_nomem);
        else if (ret == -ERESTARTSYS)
@@ -347,7 +485,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
                                  void *context,
                                  gfp_t gfp)
 {
-        fscache_pages_retrieval_func_t func;
        struct fscache_retrieval *op;
        struct fscache_object *object;
        int ret;
@@ -369,6 +506,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
        op = fscache_alloc_retrieval(mapping, end_io_func, context);
        if (!op)
                return -ENOMEM;
+        fscache_set_op_name(&op->op, "RetrRAN");
        spin_lock(&cookie->lock);
@@ -377,6 +515,9 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
        object = hlist_entry(cookie->backing_objects.first,
                             struct fscache_object, cookie_link);
+        atomic_inc(&object->n_reads);
+        set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
        if (fscache_submit_op(object, &op->op) < 0)
                goto nobufs_unlock;
        spin_unlock(&cookie->lock);
@@ -389,21 +530,27 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
        /* we wait for the operation to become active, and then process it
         * *here*, in this thread, and not in the thread pool */
-        if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+        ret = fscache_wait_for_retrieval_activation(
-                _debug(">>> WT");
+                object, op,
-                fscache_stat(&fscache_n_retrieval_op_waits);
+                __fscache_stat(&fscache_n_retrieval_op_waits),
-                wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                __fscache_stat(&fscache_n_retrievals_object_dead));
-                            fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+        if (ret < 0)
-                _debug("<<< GO");
+                goto error;
-        }
        /* ask the cache to honour the operation */
-        if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
+        if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
-                func = object->cache->ops->allocate_pages;
+                fscache_stat(&fscache_n_cop_allocate_pages);
-        else
+                ret = object->cache->ops->allocate_pages(
-                func = object->cache->ops->read_or_alloc_pages;
+                        op, pages, nr_pages, gfp);
-        ret = func(op, pages, nr_pages, gfp);
+                fscache_stat_d(&fscache_n_cop_allocate_pages);
+        } else {
+                fscache_stat(&fscache_n_cop_read_or_alloc_pages);
+                ret = object->cache->ops->read_or_alloc_pages(
+                        op, pages, nr_pages, gfp);
+                fscache_stat_d(&fscache_n_cop_read_or_alloc_pages);
+        }
+error:
        if (ret == -ENOMEM)
                fscache_stat(&fscache_n_retrievals_nomem);
        else if (ret == -ERESTARTSYS)
@@ -461,6 +608,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
        op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
        if (!op)
                return -ENOMEM;
+        fscache_set_op_name(&op->op, "RetrAL1");
        spin_lock(&cookie->lock);
@@ -475,18 +623,22 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
        fscache_stat(&fscache_n_alloc_ops);
-        if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+        ret = fscache_wait_for_retrieval_activation(
-                _debug(">>> WT");
+                object, op,
-                fscache_stat(&fscache_n_alloc_op_waits);
+                __fscache_stat(&fscache_n_alloc_op_waits),
-                wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                __fscache_stat(&fscache_n_allocs_object_dead));
-                            fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+        if (ret < 0)
-                _debug("<<< GO");
+                goto error;
-        }
        /* ask the cache to honour the operation */
+        fscache_stat(&fscache_n_cop_allocate_page);
        ret = object->cache->ops->allocate_page(op, page, gfp);
+        fscache_stat_d(&fscache_n_cop_allocate_page);
-        if (ret < 0)
+error:
+        if (ret == -ERESTARTSYS)
+                fscache_stat(&fscache_n_allocs_intr);
+        else if (ret < 0)
                fscache_stat(&fscache_n_allocs_nobufs);
        else
                fscache_stat(&fscache_n_allocs_ok);
@@ -521,7 +673,7 @@ static void fscache_write_op(struct fscache_operation *_op)
        struct fscache_storage *op =
                container_of(_op, struct fscache_storage, op);
        struct fscache_object *object = op->op.object;
-        struct fscache_cookie *cookie = object->cookie;
+        struct fscache_cookie *cookie;
        struct page *page;
        unsigned n;
        void *results[1];
@@ -529,16 +681,19 @@ static void fscache_write_op(struct fscache_operation *_op)
        _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
-        spin_lock(&cookie->lock);
+        fscache_set_op_state(&op->op, "GetPage");
        spin_lock(&object->lock);
+        cookie = object->cookie;
-        if (!fscache_object_is_active(object)) {
+        if (!fscache_object_is_active(object) || !cookie) {
                spin_unlock(&object->lock);
-                spin_unlock(&cookie->lock);
                _leave("");
                return;
        }
+        spin_lock(&cookie->stores_lock);
        fscache_stat(&fscache_n_store_calls);
        /* find a page to store */
@@ -549,23 +704,35 @@ static void fscache_write_op(struct fscache_operation *_op)
                goto superseded;
        page = results[0];
        _debug("gang %d [%lx]", n, page->index);
-        if (page->index > op->store_limit)
+        if (page->index > op->store_limit) {
+                fscache_stat(&fscache_n_store_pages_over_limit);
                goto superseded;
+        }
-        radix_tree_tag_clear(&cookie->stores, page->index,
+        if (page) {
-                             FSCACHE_COOKIE_PENDING_TAG);
+                radix_tree_tag_set(&cookie->stores, page->index,
+                                   FSCACHE_COOKIE_STORING_TAG);
+                radix_tree_tag_clear(&cookie->stores, page->index,
+                                     FSCACHE_COOKIE_PENDING_TAG);
+        }
+        spin_unlock(&cookie->stores_lock);
        spin_unlock(&object->lock);
-        spin_unlock(&cookie->lock);
        if (page) {
+                fscache_set_op_state(&op->op, "Store");
+                fscache_stat(&fscache_n_store_pages);
+                fscache_stat(&fscache_n_cop_write_page);
                ret = object->cache->ops->write_page(op, page);
-                fscache_end_page_write(cookie, page);
+                fscache_stat_d(&fscache_n_cop_write_page);
-                page_cache_release(page);
+                fscache_set_op_state(&op->op, "EndWrite");
-                if (ret < 0)
+                fscache_end_page_write(object, page);
+                if (ret < 0) {
+                        fscache_set_op_state(&op->op, "Abort");
                        fscache_abort_object(object);
-                else
+                } else {
                        fscache_enqueue_operation(&op->op);
+                }
        }
        _leave("");
@@ -575,9 +742,9 @@ superseded:
        /* this writer is going away and there aren't any more things to
         * write */
        _debug("cease");
+        spin_unlock(&cookie->stores_lock);
        clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
        spin_unlock(&object->lock);
-        spin_unlock(&cookie->lock);
        _leave("");
 }
@@ -634,6 +801,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
        fscache_operation_init(&op->op, fscache_release_write_op);
        fscache_operation_init_slow(&op->op, fscache_write_op);
        op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+        fscache_set_op_name(&op->op, "Write1");
        ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
        if (ret < 0)
@@ -652,6 +820,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
        /* add the page to the pending-storage radix tree on the backing
         * object */
        spin_lock(&object->lock);
+        spin_lock(&cookie->stores_lock);
        _debug("store limit %llx", (unsigned long long) object->store_limit);
@@ -672,6 +841,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
        if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
                goto already_pending;
+        spin_unlock(&cookie->stores_lock);
        spin_unlock(&object->lock);
        op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
@@ -693,6 +863,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 already_queued:
        fscache_stat(&fscache_n_stores_again);
 already_pending:
+        spin_unlock(&cookie->stores_lock);
        spin_unlock(&object->lock);
        spin_unlock(&cookie->lock);
        radix_tree_preload_end();
@@ -702,7 +873,9 @@ already_pending:
        return 0;
 submit_failed:
+        spin_lock(&cookie->stores_lock);
        radix_tree_delete(&cookie->stores, page->index);
+        spin_unlock(&cookie->stores_lock);
        page_cache_release(page);
        ret = -ENOBUFS;
        goto nobufs;
@@ -763,7 +936,9 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
        if (TestClearPageFsCache(page) &&
            object->cache->ops->uncache_page) {
                /* the cache backend releases the cookie lock */
+                fscache_stat(&fscache_n_cop_uncache_page);
                object->cache->ops->uncache_page(object, page);
+                fscache_stat_d(&fscache_n_cop_uncache_page);
                goto done;
        }
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index beeab44bc31a..1d9e4951a597 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -37,10 +37,20 @@ int __init fscache_proc_init(void)
                goto error_histogram;
 #endif
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+        if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
+                         &fscache_objlist_fops))
+                goto error_objects;
+#endif
        _leave(" = 0");
        return 0;
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+error_objects:
+#endif
 #ifdef CONFIG_FSCACHE_HISTOGRAM
+        remove_proc_entry("fs/fscache/histogram", NULL);
 error_histogram:
 #endif
 #ifdef CONFIG_FSCACHE_STATS
@@ -58,6 +68,9 @@ error_dir:
 */
 void fscache_proc_cleanup(void)
 {
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+        remove_proc_entry("fs/fscache/objects", NULL);
+#endif
 #ifdef CONFIG_FSCACHE_HISTOGRAM
        remove_proc_entry("fs/fscache/histogram", NULL);
 #endif
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 65deb99e756b..46435f3aae68 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -25,6 +25,8 @@ atomic_t fscache_n_op_requeue;
 atomic_t fscache_n_op_deferred_release;
 atomic_t fscache_n_op_release;
 atomic_t fscache_n_op_gc;
+atomic_t fscache_n_op_cancelled;
+atomic_t fscache_n_op_rejected;
 atomic_t fscache_n_attr_changed;
 atomic_t fscache_n_attr_changed_ok;
@@ -36,6 +38,8 @@ atomic_t fscache_n_allocs;
 atomic_t fscache_n_allocs_ok;
 atomic_t fscache_n_allocs_wait;
 atomic_t fscache_n_allocs_nobufs;
+atomic_t fscache_n_allocs_intr;
+atomic_t fscache_n_allocs_object_dead;
 atomic_t fscache_n_alloc_ops;
 atomic_t fscache_n_alloc_op_waits;
@@ -46,6 +50,7 @@ atomic_t fscache_n_retrievals_nodata;
 atomic_t fscache_n_retrievals_nobufs;
 atomic_t fscache_n_retrievals_intr;
 atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrievals_object_dead;
 atomic_t fscache_n_retrieval_ops;
 atomic_t fscache_n_retrieval_op_waits;
@@ -56,6 +61,14 @@ atomic_t fscache_n_stores_nobufs;
 atomic_t fscache_n_stores_oom;
 atomic_t fscache_n_store_ops;
 atomic_t fscache_n_store_calls;
+atomic_t fscache_n_store_pages;
+atomic_t fscache_n_store_radix_deletes;
+atomic_t fscache_n_store_pages_over_limit;
+atomic_t fscache_n_store_vmscan_not_storing;
+atomic_t fscache_n_store_vmscan_gone;
+atomic_t fscache_n_store_vmscan_busy;
+atomic_t fscache_n_store_vmscan_cancelled;
 atomic_t fscache_n_marks;
 atomic_t fscache_n_uncaches;
@@ -74,6 +87,7 @@ atomic_t fscache_n_updates_run;
 atomic_t fscache_n_relinquishes;
 atomic_t fscache_n_relinquishes_null;
 atomic_t fscache_n_relinquishes_waitcrt;
+atomic_t fscache_n_relinquishes_retire;
 atomic_t fscache_n_cookie_index;
 atomic_t fscache_n_cookie_data;
@@ -84,6 +98,7 @@ atomic_t fscache_n_object_no_alloc;
 atomic_t fscache_n_object_lookups;
 atomic_t fscache_n_object_lookups_negative;
 atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_lookups_timed_out;
 atomic_t fscache_n_object_created;
 atomic_t fscache_n_object_avail;
 atomic_t fscache_n_object_dead;
@@ -93,6 +108,23 @@ atomic_t fscache_n_checkaux_okay;
 atomic_t fscache_n_checkaux_update;
 atomic_t fscache_n_checkaux_obsolete;
+atomic_t fscache_n_cop_alloc_object;
+atomic_t fscache_n_cop_lookup_object;
+atomic_t fscache_n_cop_lookup_complete;
+atomic_t fscache_n_cop_grab_object;
+atomic_t fscache_n_cop_update_object;
+atomic_t fscache_n_cop_drop_object;
+atomic_t fscache_n_cop_put_object;
+atomic_t fscache_n_cop_sync_cache;
+atomic_t fscache_n_cop_attr_changed;
+atomic_t fscache_n_cop_read_or_alloc_page;
+atomic_t fscache_n_cop_read_or_alloc_pages;
+atomic_t fscache_n_cop_allocate_page;
+atomic_t fscache_n_cop_allocate_pages;
+atomic_t fscache_n_cop_write_page;
+atomic_t fscache_n_cop_uncache_page;
+atomic_t fscache_n_cop_dissociate_pages;
 /*
 * display the general statistics
 */
@@ -129,10 +161,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_acquires_nobufs),
                   atomic_read(&fscache_n_acquires_oom));
-        seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
+        seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n",
                   atomic_read(&fscache_n_object_lookups),
                   atomic_read(&fscache_n_object_lookups_negative),
                   atomic_read(&fscache_n_object_lookups_positive),
+                   atomic_read(&fscache_n_object_lookups_timed_out),
                   atomic_read(&fscache_n_object_created));
        seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
@@ -140,10 +173,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_updates_null),
                   atomic_read(&fscache_n_updates_run));
-        seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
+        seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n",
                   atomic_read(&fscache_n_relinquishes),
                   atomic_read(&fscache_n_relinquishes_null),
-                   atomic_read(&fscache_n_relinquishes_waitcrt));
+                   atomic_read(&fscache_n_relinquishes_waitcrt),
+                   atomic_read(&fscache_n_relinquishes_retire));
        seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
                   atomic_read(&fscache_n_attr_changed),
@@ -152,14 +186,16 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_attr_changed_nomem),
                   atomic_read(&fscache_n_attr_changed_calls));
-        seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
+        seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n",
                   atomic_read(&fscache_n_allocs),
                   atomic_read(&fscache_n_allocs_ok),
                   atomic_read(&fscache_n_allocs_wait),
-                   atomic_read(&fscache_n_allocs_nobufs));
+                   atomic_read(&fscache_n_allocs_nobufs),
-        seq_printf(m, "Allocs : ops=%u owt=%u\n",
+                   atomic_read(&fscache_n_allocs_intr));
+        seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n",
                   atomic_read(&fscache_n_alloc_ops),
-                   atomic_read(&fscache_n_alloc_op_waits));
+                   atomic_read(&fscache_n_alloc_op_waits),
+                   atomic_read(&fscache_n_allocs_object_dead));
        seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
                   " int=%u oom=%u\n",
@@ -170,9 +206,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_retrievals_nobufs),
                   atomic_read(&fscache_n_retrievals_intr),
                   atomic_read(&fscache_n_retrievals_nomem));
-        seq_printf(m, "Retrvls: ops=%u owt=%u\n",
+        seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n",
                   atomic_read(&fscache_n_retrieval_ops),
-                   atomic_read(&fscache_n_retrieval_op_waits));
+                   atomic_read(&fscache_n_retrieval_op_waits),
+                   atomic_read(&fscache_n_retrievals_object_dead));
        seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
                   atomic_read(&fscache_n_stores),
@@ -180,18 +217,49 @@ static int fscache_stats_show(struct seq_file *m, void *v)
                   atomic_read(&fscache_n_stores_again),
                   atomic_read(&fscache_n_stores_nobufs),
                   atomic_read(&fscache_n_stores_oom));
-        seq_printf(m, "Stores : ops=%u run=%u\n",
+        seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n",
                   atomic_read(&fscache_n_store_ops),
-                   atomic_read(&fscache_n_store_calls));
+                   atomic_read(&fscache_n_store_calls),
+                   atomic_read(&fscache_n_store_pages),
+                   atomic_read(&fscache_n_store_radix_deletes),
+                   atomic_read(&fscache_n_store_pages_over_limit));
-        seq_printf(m, "Ops    : pend=%u run=%u enq=%u\n",
+        seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+                   atomic_read(&fscache_n_store_vmscan_not_storing),
+                   atomic_read(&fscache_n_store_vmscan_gone),
+                   atomic_read(&fscache_n_store_vmscan_busy),
+                   atomic_read(&fscache_n_store_vmscan_cancelled));
+        seq_printf(m, "Ops    : pend=%u run=%u enq=%u can=%u rej=%u\n",
                   atomic_read(&fscache_n_op_pend),
                   atomic_read(&fscache_n_op_run),
-                   atomic_read(&fscache_n_op_enqueue));
+                   atomic_read(&fscache_n_op_enqueue),
+                   atomic_read(&fscache_n_op_cancelled),
+                   atomic_read(&fscache_n_op_rejected));
        seq_printf(m, "Ops    : dfr=%u rel=%u gc=%u\n",
                   atomic_read(&fscache_n_op_deferred_release),
                   atomic_read(&fscache_n_op_release),
                   atomic_read(&fscache_n_op_gc));
+        seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n",
+                   atomic_read(&fscache_n_cop_alloc_object),
+                   atomic_read(&fscache_n_cop_lookup_object),
+                   atomic_read(&fscache_n_cop_lookup_complete),
+                   atomic_read(&fscache_n_cop_grab_object));
+        seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+                   atomic_read(&fscache_n_cop_update_object),
+                   atomic_read(&fscache_n_cop_drop_object),
+                   atomic_read(&fscache_n_cop_put_object),
+                   atomic_read(&fscache_n_cop_attr_changed),
+                   atomic_read(&fscache_n_cop_sync_cache));
+        seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n",
+                   atomic_read(&fscache_n_cop_read_or_alloc_page),
+                   atomic_read(&fscache_n_cop_read_or_alloc_pages),
+                   atomic_read(&fscache_n_cop_allocate_page),
+                   atomic_read(&fscache_n_cop_allocate_pages),
+                   atomic_read(&fscache_n_cop_write_page),
+                   atomic_read(&fscache_n_cop_uncache_page),
+                   atomic_read(&fscache_n_cop_dissociate_pages));
        return 0;
 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8ada78aade58..4787ae6c5c1c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -385,6 +385,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (fc->no_create)
                return -ENOSYS;
+        if (flags & O_DIRECT)
+                return -EINVAL;
        forget_req = fuse_get_req(fc);
        if (IS_ERR(forget_req))
                return PTR_ERR(forget_req);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5971359d2090..4dcddf83326f 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,6 +8,8 @@ config GFS2_FS
        select FS_POSIX_ACL
        select CRC32
        select SLOW_WORK
+        select QUOTA
+        select QUOTACTL
        help
          A cluster filesystem.
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3fc4e3ac7d84..3eb1ea846173 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
+#include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfs2_ondisk.h>
@@ -26,108 +27,44 @@
 #include "trans.h"
 #include "util.h"
-#define ACL_ACCESS 1
+static const char *gfs2_acl_name(int type)
-#define ACL_DEFAULT 0
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-                          struct gfs2_ea_request *er, int *remove, mode_t *mode)
 {
-        struct posix_acl *acl;
+        switch (type) {
-        int error;
+        case ACL_TYPE_ACCESS:
+                return GFS2_POSIX_ACL_ACCESS;
-        error = gfs2_acl_validate_remove(ip, access);
+        case ACL_TYPE_DEFAULT:
-        if (error)
+                return GFS2_POSIX_ACL_DEFAULT;
-                return error;
-        if (!er->er_data)
-                return -EINVAL;
-        acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
-        if (IS_ERR(acl))
-                return PTR_ERR(acl);
-        if (!acl) {
-                *remove = 1;
-                return 0;
-        }
-        error = posix_acl_valid(acl);
-        if (error)
-                goto out;
-        if (access) {
-                error = posix_acl_equiv_mode(acl, mode);
-                if (!error)
-                        *remove = 1;
-                else if (error > 0)
-                        error = 0;
        }
+        return NULL;
-out:
-        posix_acl_release(acl);
-        return error;
-}
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
-{
-        if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
-                return -EOPNOTSUPP;
-        if (!is_owner_or_cap(&ip->i_inode))
-                return -EPERM;
-        if (S_ISLNK(ip->i_inode.i_mode))
-                return -EOPNOTSUPP;
-        if (!access && !S_ISDIR(ip->i_inode.i_mode))
-                return -EACCES;
-        return 0;
 }
-static int acl_get(struct gfs2_inode *ip, const char *name,
+static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
-                   struct posix_acl **acl, struct gfs2_ea_location *el,
-                   char **datap, unsigned int *lenp)
 {
+        struct posix_acl *acl;
+        const char *name;
        char *data;
-        unsigned int len;
+        int len;
-        int error;
-        el->el_bh = NULL;
        if (!ip->i_eattr)
-                return 0;
+                return NULL;
-        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
-        if (error)
-                return error;
-        if (!el->el_ea)
-                return 0;
-        if (!GFS2_EA_DATA_LEN(el->el_ea))
-                goto out;
-        len = GFS2_EA_DATA_LEN(el->el_ea);
+        acl = get_cached_acl(&ip->i_inode, type);
-        data = kmalloc(len, GFP_NOFS);
+        if (acl != ACL_NOT_CACHED)
-        error = -ENOMEM;
+                return acl;
-        if (!data)
-                goto out;
-        error = gfs2_ea_get_copy(ip, el, data, len);
+        name = gfs2_acl_name(type);
-        if (error < 0)
+        if (name == NULL)
-                goto out_kfree;
+                return ERR_PTR(-EINVAL);
-        error = 0;
-        if (acl) {
+        len = gfs2_xattr_acl_get(ip, name, &data);
-                *acl = posix_acl_from_xattr(data, len);
+        if (len < 0)
-                if (IS_ERR(*acl))
+                return ERR_PTR(len);
-                        error = PTR_ERR(*acl);
+        if (len == 0)
-        }
+                return NULL;
-out_kfree:
+        acl = posix_acl_from_xattr(data, len);
-        if (error || !datap) {
+        kfree(data);
-                kfree(data);
+        return acl;
-        } else {
-                *datap = data;
-                *lenp = len;
-        }
-out:
-        return error;
 }
 /**
@@ -140,14 +77,12 @@ out:
 int gfs2_check_acl(struct inode *inode, int mask)
 {
-        struct gfs2_ea_location el;
+        struct posix_acl *acl;
-        struct posix_acl *acl = NULL;
        int error;
-        error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
+        acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
-        brelse(el.el_bh);
+        if (IS_ERR(acl))
-        if (error)
+                return PTR_ERR(acl);
-                return error;
        if (acl) {
                error = posix_acl_permission(inode, acl, mask);
@@ -158,57 +93,75 @@ int gfs2_check_acl(struct inode *inode, int mask)
        return -EAGAIN;
 }
-static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+static int gfs2_set_mode(struct inode *inode, mode_t mode)
 {
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        int error = 0;
-        struct buffer_head *dibh;
-        int error;
-        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (mode != inode->i_mode) {
-        if (error)
+                struct iattr iattr;
-                return error;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
+                iattr.ia_valid = ATTR_MODE;
-        if (!error) {
+                iattr.ia_mode = mode;
-                gfs2_assert_withdraw(sdp,
-                                (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT));
+                error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
-                ip->i_inode.i_mode = mode;
-                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                gfs2_dinode_out(ip, dibh->b_data);
-                brelse(dibh);
        }
-        gfs2_trans_end(sdp);
+        return error;
+}
+static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
+{
+        int error;
+        int len;
+        char *data;
+        const char *name = gfs2_acl_name(type);
-        return 0;
+        BUG_ON(name == NULL);
+        len = posix_acl_to_xattr(acl, NULL, 0);
+        if (len == 0)
+                return 0;
+        data = kmalloc(len, GFP_NOFS);
+        if (data == NULL)
+                return -ENOMEM;
+        error = posix_acl_to_xattr(acl, data, len);
+        if (error < 0)
+                goto out;
+        error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0);
+        if (!error)
+                set_cached_acl(inode, type, acl);
+out:
+        kfree(data);
+        return error;
 }
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
 {
-        struct gfs2_ea_location el;
        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-        struct posix_acl *acl = NULL, *clone;
+        struct posix_acl *acl, *clone;
-        mode_t mode = ip->i_inode.i_mode;
+        mode_t mode = inode->i_mode;
-        char *data = NULL;
+        int error = 0;
-        unsigned int len;
-        int error;
        if (!sdp->sd_args.ar_posix_acl)
                return 0;
-        if (S_ISLNK(ip->i_inode.i_mode))
+        if (S_ISLNK(inode->i_mode))
                return 0;
-        error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
+        acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
-        brelse(el.el_bh);
+        if (IS_ERR(acl))
-        if (error)
+                return PTR_ERR(acl);
-                return error;
        if (!acl) {
                mode &= ~current_umask();
-                if (mode != ip->i_inode.i_mode)
+                if (mode != inode->i_mode)
-                        error = munge_mode(ip, mode);
+                        error = gfs2_set_mode(inode, mode);
                return error;
        }
+        if (S_ISDIR(inode->i_mode)) {
+                error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
+                if (error)
+                        goto out;
+        }
        clone = posix_acl_clone(acl, GFP_NOFS);
        error = -ENOMEM;
        if (!clone)
@@ -216,43 +169,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
        posix_acl_release(acl);
        acl = clone;
-        if (S_ISDIR(ip->i_inode.i_mode)) {
-                error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
-                                       GFS2_POSIX_ACL_DEFAULT, data, len, 0);
-                if (error)
-                        goto out;
-        }
        error = posix_acl_create_masq(acl, &mode);
        if (error < 0)
                goto out;
        if (error == 0)
                goto munge;
-        posix_acl_to_xattr(acl, data, len);
+        error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
-        error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
-                               GFS2_POSIX_ACL_ACCESS, data, len, 0);
        if (error)
                goto out;
 munge:
-        error = munge_mode(ip, mode);
+        error = gfs2_set_mode(inode, mode);
 out:
        posix_acl_release(acl);
-        kfree(data);
        return error;
 }
 int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 {
-        struct posix_acl *acl = NULL, *clone;
+        struct posix_acl *acl, *clone;
-        struct gfs2_ea_location el;
        char *data;
        unsigned int len;
        int error;
-        error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
+        acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
-        if (error)
+        if (IS_ERR(acl))
-                goto out_brelse;
+                return PTR_ERR(acl);
        if (!acl)
                return gfs2_setattr_simple(ip, attr);
@@ -265,15 +207,134 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
        error = posix_acl_chmod_masq(acl, attr->ia_mode);
        if (!error) {
+                len = posix_acl_to_xattr(acl, NULL, 0);
+                data = kmalloc(len, GFP_NOFS);
+                error = -ENOMEM;
+                if (data == NULL)
+                        goto out;
                posix_acl_to_xattr(acl, data, len);
-                error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+                error = gfs2_xattr_acl_chmod(ip, attr, data);
+                kfree(data);
+                set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
        }
 out:
        posix_acl_release(acl);
-        kfree(data);
-out_brelse:
-        brelse(el.el_bh);
        return error;
 }
+static int gfs2_acl_type(const char *name)
+{
+        if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
+                return ACL_TYPE_ACCESS;
+        if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
+                return ACL_TYPE_DEFAULT;
+        return -EINVAL;
+}
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+                                 void *buffer, size_t size)
+{
+        struct posix_acl *acl;
+        int type;
+        int error;
+        type = gfs2_acl_type(name);
+        if (type < 0)
+                return type;
+        acl = gfs2_acl_get(GFS2_I(inode), type);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (acl == NULL)
+                return -ENODATA;
+        error = posix_acl_to_xattr(acl, buffer, size);
+        posix_acl_release(acl);
+        return error;
+}
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+                                 const void *value, size_t size, int flags)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct posix_acl *acl = NULL;
+        int error = 0, type;
+        if (!sdp->sd_args.ar_posix_acl)
+                return -EOPNOTSUPP;
+        type = gfs2_acl_type(name);
+        if (type < 0)
+                return type;
+        if (flags & XATTR_CREATE)
+                return -EINVAL;
+        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+                return value ? -EACCES : 0;
+        if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+                return -EPERM;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (!value)
+                goto set_acl;
+        acl = posix_acl_from_xattr(value, size);
+        if (!acl) {
+                /*
+                 * acl_set_file(3) may request that we set default ACLs with
+                 * zero length -- defend (gracefully) against that here.
+                 */
+                goto out;
+        }
+        if (IS_ERR(acl)) {
+                error = PTR_ERR(acl);
+                goto out;
+        }
+        error = posix_acl_valid(acl);
+        if (error)
+                goto out_release;
+        error = -EINVAL;
+        if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
+                goto out_release;
+        if (type == ACL_TYPE_ACCESS) {
+                mode_t mode = inode->i_mode;
+                error = posix_acl_equiv_mode(acl, &mode);
+                if (error <= 0) {
+                        posix_acl_release(acl);
+                        acl = NULL;
+                        if (error < 0)
+                                return error;
+                }
+                error = gfs2_set_mode(inode, mode);
+                if (error)
+                        goto out_release;
+        }
+set_acl:
+        error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0);
+        if (!error) {
+                if (acl)
+                        set_cached_acl(inode, type, acl);
+                else
+                        forget_cached_acl(inode, type);
+        }
+out_release:
+        posix_acl_release(acl);
+out:
+        return error;
+}
+struct xattr_handler gfs2_xattr_system_handler = {
+        .prefix = XATTR_SYSTEM_PREFIX,
+        .get    = gfs2_xattr_system_get,
+        .set    = gfs2_xattr_system_set,
+};
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb64..9306a2e6620c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
 #include "incore.h"
 #define GFS2_POSIX_ACL_ACCESS           "posix_acl_access"
-#define GFS2_POSIX_ACL_ACCESS_LEN       16
 #define GFS2_POSIX_ACL_DEFAULT          "posix_acl_default"
-#define GFS2_POSIX_ACL_DEFAULT_LEN      17
+#define GFS2_ACL_MAX_ENTRIES            25
-#define GFS2_ACL_IS_ACCESS(name, len) \
+extern int gfs2_check_acl(struct inode *inode, int mask);
-         ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
+extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
-         !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
+extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern struct xattr_handler gfs2_xattr_system_handler;
-#define GFS2_ACL_IS_DEFAULT(name, len) \
-         ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
-         !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
-struct gfs2_ea_request;
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-                          struct gfs2_ea_request *er,
-                          int *remove, mode_t *mode);
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
-int gfs2_check_acl(struct inode *inode, int mask);
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
-int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 694b5d48f036..7b8da9415267 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
        unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
        unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int i;
        int ret;
@@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
                if (ret || (--(wbc->nr_to_write) <= 0))
                        ret = 1;
-                if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                        wbc->encountered_congestion = 1;
-                        ret = 1;
-                }
        }
        gfs2_trans_end(sdp);
        return ret;
@@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 static int gfs2_write_cache_jdata(struct address_space *mapping,
                                  struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
@@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
        int scanned = 0;
        int range_whole = 0;
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                wbc->encountered_congestion = 1;
-                return 0;
-        }
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
@@ -819,8 +807,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
                mark_inode_dirty(inode);
        }
-        if (inode == sdp->sd_rindex)
+        if (inode == sdp->sd_rindex) {
                adjust_fs_space(inode);
+                ip->i_gh.gh_flags |= GL_NOCACHE;
+        }
        brelse(dibh);
        gfs2_trans_end(sdp);
@@ -889,8 +879,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
                mark_inode_dirty(inode);
        }
-        if (inode == sdp->sd_rindex)
+        if (inode == sdp->sd_rindex) {
                adjust_fs_space(inode);
+                ip->i_gh.gh_flags |= GL_NOCACHE;
+        }
        brelse(dibh);
        gfs2_trans_end(sdp);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5cebad..25fddc100f18 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -525,38 +525,6 @@ consist_inode:
        return ERR_PTR(-EIO);
 }
-/**
- * dirent_first - Return the first dirent
- * @dip: the directory
- * @bh: The buffer
- * @dent: Pointer to list of dirents
- *
- * return first dirent whether bh points to leaf or stuffed dinode
- *
- * Returns: IS_LEAF, IS_DINODE, or -errno
- */
-static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
-                        struct gfs2_dirent **dent)
-{
-        struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
-        if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
-                if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
-                        return -EIO;
-                *dent = (struct gfs2_dirent *)(bh->b_data +
-                                               sizeof(struct gfs2_leaf));
-                return IS_LEAF;
-        } else {
-                if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
-                        return -EIO;
-                *dent = (struct gfs2_dirent *)(bh->b_data +
-                                               sizeof(struct gfs2_dinode));
-                return IS_DINODE;
-        }
-}
 static int dirent_check_reclen(struct gfs2_inode *dip,
                               const struct gfs2_dirent *d, const void *end_p)
 {
@@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
        divider = (start + half_len) << (32 - dip->i_depth);
        /*  Copy the entries  */
-        dirent_first(dip, obh, &dent);
+        dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
        do {
                next = dent;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8b674b1f3a55..f455a03a09e2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -241,15 +241,14 @@ int gfs2_glock_put(struct gfs2_glock *gl)
        int rv = 0;
        write_lock(gl_lock_addr(gl->gl_hash));
-        if (atomic_dec_and_test(&gl->gl_ref)) {
+        if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
                hlist_del(&gl->gl_list);
-                write_unlock(gl_lock_addr(gl->gl_hash));
-                spin_lock(&lru_lock);
                if (!list_empty(&gl->gl_lru)) {
                        list_del_init(&gl->gl_lru);
                        atomic_dec(&lru_count);
                }
                spin_unlock(&lru_lock);
+                write_unlock(gl_lock_addr(gl->gl_hash));
                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
                glock_free(gl);
                rv = 1;
@@ -513,7 +512,6 @@ retry:
                        GLOCK_BUG_ON(gl, 1);
                }
                spin_unlock(&gl->gl_spin);
-                gfs2_glock_put(gl);
                return;
        }
@@ -524,8 +522,6 @@ retry:
                if (glops->go_xmote_bh) {
                        spin_unlock(&gl->gl_spin);
                        rv = glops->go_xmote_bh(gl, gh);
-                        if (rv == -EAGAIN)
-                                return;
                        spin_lock(&gl->gl_spin);
                        if (rv) {
                                do_error(gl, rv);
@@ -540,7 +536,6 @@ out:
        clear_bit(GLF_LOCK, &gl->gl_flags);
 out_locked:
        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
 }
 static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
@@ -600,7 +595,6 @@ __acquires(&gl->gl_spin)
        if (!(ret & LM_OUT_ASYNC)) {
                finish_xmote(gl, ret);
-                gfs2_glock_hold(gl);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
        } else {
@@ -672,12 +666,17 @@ out:
        return;
 out_sched:
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        smp_mb__after_clear_bit();
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put_nolock(gl);
+        return;
 out_unlock:
        clear_bit(GLF_LOCK, &gl->gl_flags);
-        goto out;
+        smp_mb__after_clear_bit();
+        return;
 }
 static void delete_work_func(struct work_struct *work)
@@ -707,9 +706,12 @@ static void glock_work_func(struct work_struct *work)
 {
        unsigned long delay = 0;
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+        int drop_ref = 0;
-        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
                finish_xmote(gl, gl->gl_reply);
+                drop_ref = 1;
+        }
        down_read(&gfs2_umount_flush_sem);
        spin_lock(&gl->gl_spin);
        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -727,6 +729,8 @@ static void glock_work_func(struct work_struct *work)
        if (!delay ||
            queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
+        if (drop_ref)
+                gfs2_glock_put(gl);
 }
 /**
@@ -1361,10 +1365,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
                list_del_init(&gl->gl_lru);
                atomic_dec(&lru_count);
-                /* Check if glock is about to be freed */
-                if (atomic_read(&gl->gl_ref) == 0)
-                        continue;
                /* Test for being demotable */
                if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
                        gfs2_glock_hold(gl);
@@ -1375,10 +1375,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
                                handle_callback(gl, LM_ST_UNLOCKED, 0);
                                nr--;
                        }
+                        clear_bit(GLF_LOCK, &gl->gl_flags);
+                        smp_mb__after_clear_bit();
                        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                                gfs2_glock_put_nolock(gl);
                        spin_unlock(&gl->gl_spin);
-                        clear_bit(GLF_LOCK, &gl->gl_flags);
                        spin_lock(&lru_lock);
                        continue;
                }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c609894ec0d0..13f0bd228132 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,15 +180,6 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
        return gl->gl_state == LM_ST_SHARED;
 }
-static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
-{
-        int ret;
-        spin_lock(&gl->gl_spin);
-        ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
-        spin_unlock(&gl->gl_spin);
-        return ret;
-}
 int gfs2_glock_get(struct gfs2_sbd *sdp,
                   u64 number, const struct gfs2_glock_operations *glops,
                   int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 6985eef06c39..78554acc0605 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
+#include <linux/posix_acl.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -184,8 +185,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
        if (flags & DIO_METADATA) {
                struct address_space *mapping = gl->gl_aspace->i_mapping;
                truncate_inode_pages(mapping, 0);
-                if (ip)
+                if (ip) {
                        set_bit(GIF_INVALID, &ip->i_flags);
+                        forget_all_cached_acls(&ip->i_inode);
+                }
        }
        if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6edb423f90b3..4792200978c8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -429,7 +429,11 @@ struct gfs2_args {
        unsigned int ar_meta:1;                 /* mount metafs */
        unsigned int ar_discard:1;              /* discard requests */
        unsigned int ar_errors:2;               /* errors=withdraw | panic */
+        unsigned int ar_nobarrier:1;            /* do not send barriers */
        int ar_commit;                          /* Commit interval */
+        int ar_statfs_quantum;                  /* The fast statfs interval */
+        int ar_quota_quantum;                   /* The quota interval */
+        int ar_statfs_percent;                  /* The % change to force sync */
 };
 struct gfs2_tune {
@@ -558,6 +562,7 @@ struct gfs2_sbd {
        spinlock_t sd_statfs_spin;
        struct gfs2_statfs_change_host sd_statfs_master;
        struct gfs2_statfs_change_host sd_statfs_local;
+        int sd_statfs_force_sync;
        /* Resource group stuff */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fb15d3b1f409..26ba2a4c4a2d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -871,7 +871,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
        if (error)
                goto fail_gunlock2;
-        error = gfs2_acl_create(dip, GFS2_I(inode));
+        error = gfs2_acl_create(dip, inode);
        if (error)
                goto fail_gunlock2;
@@ -947,9 +947,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
        str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
        str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
-        str->di_header.__pad0 = 0;
        str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
-        str->di_header.__pad1 = 0;
        str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
        str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
        str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f67..4511b08fc451 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
        memset(lh, 0, sizeof(struct gfs2_log_header));
        lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
        lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+        lh->lh_header.__pad0 = cpu_to_be64(0);
        lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+        lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
        lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
        lh->lh_flags = cpu_to_be32(flags);
        lh->lh_tail = cpu_to_be32(tail);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5b..de97632ba32f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
 static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
        struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+        struct gfs2_meta_header *mh;
        struct gfs2_trans *tr;
        lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
        set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
        gfs2_meta_check(sdp, bd->bd_bh);
        gfs2_pin(sdp, bd->bd_bh);
+        mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+        mh->__pad0 = cpu_to_be64(0);
+        mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
        sdp->sd_log_num_buf++;
        list_add(&le->le_list, &sdp->sd_log_le_buf);
        tr->tr_num_buf_new++;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index eacd78a5d082..5b31f7741a8f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void)
        if (error)
                goto fail_unregister;
-        error = slow_work_register_user();
+        error = slow_work_register_user(THIS_MODULE);
        if (error)
                goto fail_slow;
@@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void)
        gfs2_unregister_debugfs();
        unregister_filesystem(&gfs2_fs_type);
        unregister_filesystem(&gfs2meta_fs_type);
-        slow_work_unregister_user();
+        slow_work_unregister_user(THIS_MODULE);
        kmem_cache_destroy(gfs2_quotad_cachep);
        kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fb6c048981..edfee24f3636 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/slow-work.h>
+#include <linux/quotaops.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -62,13 +63,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
        gt->gt_quota_warn_period = 10;
        gt->gt_quota_scale_num = 1;
        gt->gt_quota_scale_den = 1;
-        gt->gt_quota_quantum = 60;
        gt->gt_new_files_jdata = 0;
        gt->gt_max_readahead = 1 << 18;
        gt->gt_stall_secs = 600;
        gt->gt_complain_secs = 10;
-        gt->gt_statfs_quantum = 30;
-        gt->gt_statfs_slow = 0;
 }
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
@@ -1114,7 +1112,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp)
 * Returns: errno
 */
-static int fill_super(struct super_block *sb, void *data, int silent)
+static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
 {
        struct gfs2_sbd *sdp;
        struct gfs2_holder mount_gh;
@@ -1125,17 +1123,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
                printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
                return -ENOMEM;
        }
+        sdp->sd_args = *args;
-        sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
-        sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
-        sdp->sd_args.ar_commit = 60;
-        sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
-        error = gfs2_mount_args(sdp, &sdp->sd_args, data);
-        if (error) {
-                printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
-                goto fail;
-        }
        if (sdp->sd_args.ar_spectator) {
                sb->s_flags |= MS_RDONLY;
@@ -1143,11 +1131,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
        }
        if (sdp->sd_args.ar_posix_acl)
                sb->s_flags |= MS_POSIXACL;
+        if (sdp->sd_args.ar_nobarrier)
+                set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
        sb->s_magic = GFS2_MAGIC;
        sb->s_op = &gfs2_super_ops;
        sb->s_export_op = &gfs2_export_ops;
        sb->s_xattr = gfs2_xattr_handlers;
+        sb->s_qcop = &gfs2_quotactl_ops;
+        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
        sb->s_time_gran = 1;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1160,6 +1152,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
        sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
        sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+        sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
+        if (sdp->sd_args.ar_statfs_quantum) {
+                sdp->sd_tune.gt_statfs_slow = 0;
+                sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
+        }
+        else {
+                sdp->sd_tune.gt_statfs_slow = 1;
+                sdp->sd_tune.gt_statfs_quantum = 30;
+        }
        error = init_names(sdp, silent);
        if (error)
@@ -1243,18 +1244,127 @@ fail:
        return error;
 }
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+static int set_gfs2_super(struct super_block *s, void *data)
-                       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+        s->s_bdev = data; /* data is the block_device that gfs2_get_sb() hands to sget() */
+        s->s_dev = s->s_bdev->bd_dev;
+        /*
+         * Default the bdi to the backing_dev_info of the device's request
+         * queue; a file system can overwrite this in ->fill_super()
+         */
+        s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+        return 0;
 }
-static int test_meta_super(struct super_block *s, void *ptr)
+static int test_gfs2_super(struct super_block *s, void *ptr)
 {
        struct block_device *bdev = ptr;
        return (bdev == s->s_bdev);
 }
+/**
+ * gfs2_get_sb - Get the GFS2 superblock
+ * @fs_type: The GFS2 filesystem type
+ * @flags: Mount flags
+ * @dev_name: The name of the device
+ * @data: The mount arguments
+ * @mnt: The vfsmnt for this mount; mnt_sb and mnt_root are set on success
+ *
+ * Q. Why not use get_sb_bdev() ?
+ * A. We need to select one of two root directories to mount, independent
+ *    of whether this is the initial, or subsequent, mount of this sb
+ *
+ * The device is opened exclusively (read-only when MS_RDONLY is set in
+ * @flags, read-write otherwise) and looked up with sget(). The mount
+ * arguments are parsed into a local struct gfs2_args that starts out
+ * zeroed with the defaults assigned below. If the superblock returned by
+ * sget() already has s_root set, it is reused; otherwise it is initialised
+ * through fill_super(). The root dentry handed back is sd_master_dir when
+ * ar_meta is set in the parsed arguments and sd_root_dir otherwise.
+ *
+ * Returns: 0 or -ve on error. -EBUSY is returned when the device's
+ *          bd_fsfreeze_count is non-zero, or when the superblock is already
+ *          mounted and its MS_RDONLY setting differs from @flags.
+ */
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+                       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+        struct block_device *bdev;
+        struct super_block *s;
+        fmode_t mode = FMODE_READ;
+        int error;
+        struct gfs2_args args;
+        struct gfs2_sbd *sdp;
+        if (!(flags & MS_RDONLY))
+                mode |= FMODE_WRITE;
+        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+        if (IS_ERR(bdev))
+                return PTR_ERR(bdev);
+        /*
+         * once the super is inserted into the list by sget, s_umount
+         * will protect the lockfs code from trying to start a snapshot
+         * while we are mounting
+         */
+        mutex_lock(&bdev->bd_fsfreeze_mutex);
+        if (bdev->bd_fsfreeze_count > 0) {
+                mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                error = -EBUSY;
+                goto error_bdev;
+        }
+        s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev);
+        mutex_unlock(&bdev->bd_fsfreeze_mutex);
+        error = PTR_ERR(s); /* only meaningful if IS_ERR(s) below */
+        if (IS_ERR(s))
+                goto error_bdev;
+        memset(&args, 0, sizeof(args)); /* defaults are set before @data is parsed */
+        args.ar_quota = GFS2_QUOTA_DEFAULT;
+        args.ar_data = GFS2_DATA_DEFAULT;
+        args.ar_commit = 60;
+        args.ar_statfs_quantum = 30;
+        args.ar_quota_quantum = 60;
+        args.ar_errors = GFS2_ERRORS_DEFAULT;
+        error = gfs2_mount_args(&args, data);
+        if (error) {
+                printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+                if (s->s_root)
+                        goto error_super;
+                deactivate_locked_super(s); /* NOTE(review): no close_bdev_exclusive() on this path - presumably done when the new sb is torn down; confirm */
+                return error;
+        }
+        if (s->s_root) { /* subsequent mount: the sb already has a root */
+                error = -EBUSY;
+                if ((flags ^ s->s_flags) & MS_RDONLY) /* ro/rw must match the existing mount */
+                        goto error_super;
+                close_bdev_exclusive(bdev, mode); /* release this call's own open of the device */
+        } else { /* initial mount: set up the new sb */
+                char b[BDEVNAME_SIZE];
+                s->s_flags = flags;
+                s->s_mode = mode;
+                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+                sb_set_blocksize(s, block_size(bdev));
+                error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+                if (error) {
+                        deactivate_locked_super(s);
+                        return error;
+                }
+                s->s_flags |= MS_ACTIVE;
+                bdev->bd_super = s;
+        }
+        sdp = s->s_fs_info;
+        mnt->mnt_sb = s;
+        if (args.ar_meta) /* pick which of the two roots this mount exposes */
+                mnt->mnt_root = dget(sdp->sd_master_dir);
+        else
+                mnt->mnt_root = dget(sdp->sd_root_dir);
+        return 0;
+error_super: /* drop the sb, then fall through to close the device */
+        deactivate_locked_super(s);
+error_bdev:
+        close_bdev_exclusive(bdev, mode);
+        return error;
+}
 static int set_meta_super(struct super_block *s, void *ptr)
 {
        return -EINVAL;
@@ -1274,13 +1384,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
                       dev_name, error);
                return error;
        }
-        s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
+        s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
                 path.dentry->d_inode->i_sb->s_bdev);
        path_put(&path);
        if (IS_ERR(s)) {
                printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
                return PTR_ERR(s);
        }
+        if ((flags ^ s->s_flags) & MS_RDONLY) {
+                deactivate_locked_super(s);
+                return -EBUSY;
+        }
        sdp = s->s_fs_info;
        mnt->mnt_sb = s;
        mnt->mnt_root = dget(sdp->sd_master_dir);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e9b9326bfc9..e3bf6eab8750 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -15,7 +15,7 @@
 * fuzziness in the current usage value of IDs that are being used on different
 * nodes in the cluster simultaneously.  So, it is possible for a user on
 * multiple nodes to overrun their quota, but that overrun is controlable.
- * Since quota tags are part of transactions, there is no need to a quota check
+ * Since quota tags are part of transactions, there is no need for a quota check
 * program to be run on node crashes or anything like that.
 *
 * There are couple of knobs that let the administrator manage the quota
@@ -47,6 +47,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/quota.h>
+#include <linux/dqblk_xfs.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -65,13 +67,6 @@
 #define QUOTA_USER 1
 #define QUOTA_GROUP 0
-struct gfs2_quota_host {
-        u64 qu_limit;
-        u64 qu_warn;
-        s64 qu_value;
-        u32 qu_ll_next;
-};
 struct gfs2_quota_change_host {
        u64 qc_change;
        u32 qc_flags; /* GFS2_QCF_... */
@@ -164,7 +159,7 @@ fail:
        return error;
 }
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
                  struct gfs2_quota_data **qdp)
 {
        struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -202,7 +197,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
                spin_unlock(&qd_lru_lock);
-                if (qd || !create) {
+                if (qd) {
                        if (new_qd) {
                                gfs2_glock_put(new_qd->qd_gl);
                                kmem_cache_free(gfs2_quotad_cachep, new_qd);
@@ -461,12 +456,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
        qd_put(qd);
 }
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
                    struct gfs2_quota_data **qdp)
 {
        int error;
-        error = qd_get(sdp, user, id, create, qdp);
+        error = qd_get(sdp, user, id, qdp);
        if (error)
                return error;
@@ -508,20 +503,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
                return 0;
-        error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd);
+        error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
        if (error)
                goto out;
        al->al_qd_num++;
        qd++;
-        error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd);
+        error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
        if (error)
                goto out;
        al->al_qd_num++;
        qd++;
        if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
-                error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+                error = qdsb_get(sdp, QUOTA_USER, uid, qd);
                if (error)
                        goto out;
                al->al_qd_num++;
@@ -529,7 +524,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
        }
        if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
-                error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+                error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
                if (error)
                        goto out;
                al->al_qd_num++;
@@ -617,48 +612,36 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
        mutex_unlock(&sdp->sd_quota_mutex);
 }
-static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
-{
-        const struct gfs2_quota *str = buf;
-        qu->qu_limit = be64_to_cpu(str->qu_limit);
-        qu->qu_warn = be64_to_cpu(str->qu_warn);
-        qu->qu_value = be64_to_cpu(str->qu_value);
-        qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
-}
-static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
-{
-        struct gfs2_quota *str = buf;
-        str->qu_limit = cpu_to_be64(qu->qu_limit);
-        str->qu_warn = cpu_to_be64(qu->qu_warn);
-        str->qu_value = cpu_to_be64(qu->qu_value);
-        str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
-        memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
-}
 /**
- * gfs2_adjust_quota
+ * gfs2_adjust_quota - adjust record of current block usage
+ * @ip: The quota inode
+ * @loc: Offset of the entry in the quota file
+ * @change: The amount of usage change to record
+ * @qd: The quota data
+ * @fdq: The updated limits to record
 *
 * This function was mostly borrowed from gfs2_block_truncate_page which was
 * in turn mostly borrowed from ext3
+ *
+ * Returns: 0 or -ve on error
 */
 static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
-                             s64 change, struct gfs2_quota_data *qd)
+                             s64 change, struct gfs2_quota_data *qd,
+                             struct fs_disk_quota *fdq)
 {
        struct inode *inode = &ip->i_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index = loc >> PAGE_CACHE_SHIFT;
        unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
        unsigned blocksize, iblock, pos;
-        struct buffer_head *bh;
+        struct buffer_head *bh, *dibh;
        struct page *page;
        void *kaddr;
-        char *ptr;
+        struct gfs2_quota *qp;
-        struct gfs2_quota_host qp;
        s64 value;
        int err = -EIO;
+        u64 size;
        if (gfs2_is_stuffed(ip))
                gfs2_unstuff_dinode(ip, NULL);
@@ -700,18 +683,38 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        gfs2_trans_add_bh(ip->i_gl, bh, 0);
        kaddr = kmap_atomic(page, KM_USER0);
-        ptr = kaddr + offset;
+        qp = kaddr + offset;
-        gfs2_quota_in(&qp, ptr);
+        value = (s64)be64_to_cpu(qp->qu_value) + change;
-        qp.qu_value += change;
+        qp->qu_value = cpu_to_be64(value);
-        value = qp.qu_value;
+        qd->qd_qb.qb_value = qp->qu_value;
-        gfs2_quota_out(&qp, ptr);
+        if (fdq) {
+                if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+                        qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+                        qd->qd_qb.qb_warn = qp->qu_warn;
+                }
+                if (fdq->d_fieldmask & FS_DQ_BHARD) {
+                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+                        qd->qd_qb.qb_limit = qp->qu_limit;
+                }
+        }
        flush_dcache_page(page);
        kunmap_atomic(kaddr, KM_USER0);
-        err = 0;
-        qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
+        err = gfs2_meta_inode_buffer(ip, &dibh);
-        qd->qd_qb.qb_value = cpu_to_be64(value);
+        if (err)
-        ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC);
+                goto unlock;
-        ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value);
+        size = loc + sizeof(struct gfs2_quota);
+        if (size > inode->i_size) {
+                ip->i_disksize = size;
+                i_size_write(inode, size);
+        }
+        inode->i_mtime = inode->i_atime = CURRENT_TIME;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(ip, dibh->b_data);
+        brelse(dibh);
+        mark_inode_dirty(inode);
 unlock:
        unlock_page(page);
        page_cache_release(page);
@@ -739,9 +742,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                return -ENOMEM;
        sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+        mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
        for (qx = 0; qx < num_qd; qx++) {
-                error = gfs2_glock_nq_init(qda[qx]->qd_gl,
+                error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
-                                           LM_ST_EXCLUSIVE,
                                           GL_NOCACHE, &ghs[qx]);
                if (error)
                        goto out;
@@ -795,9 +798,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
        for (x = 0; x < num_qd; x++) {
                qd = qda[x];
                offset = qd2offset(qd);
-                error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
+                error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
-                                          (struct gfs2_quota_data *)
-                                          qd);
                if (error)
                        goto out_end_trans;
@@ -817,21 +818,44 @@ out_gunlock:
 out:
        while (qx--)
                gfs2_glock_dq_uninit(&ghs[qx]);
+        mutex_unlock(&ip->i_inode.i_mutex);
        kfree(ghs);
        gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
        return error;
 }
+static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+{ /* Re-read qd's entry from the quota inode and copy it into the glock LVB and qd->qd_qb. */
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+        struct gfs2_quota q;
+        struct gfs2_quota_lvb *qlvb;
+        loff_t pos;
+        int error;
+        memset(&q, 0, sizeof(struct gfs2_quota)); /* start from an all-zero record */
+        pos = qd2offset(qd); /* offset of this entry in the quota file */
+        error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q));
+        if (error < 0) /* only a negative result is treated as failure */
+                return error;
+        qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+        qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+        qlvb->__pad = 0;
+        qlvb->qb_limit = q.qu_limit; /* fields are copied as read, with no endian conversion */
+        qlvb->qb_warn = q.qu_warn;
+        qlvb->qb_value = q.qu_value;
+        qd->qd_qb = *qlvb; /* keep the in-core copy identical to the LVB */
+        return 0; /* success is always 0, whatever positive value the read returned */
+}
 static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
                    struct gfs2_holder *q_gh)
 {
        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
        struct gfs2_holder i_gh;
-        struct gfs2_quota_host q;
-        char buf[sizeof(struct gfs2_quota)];
        int error;
-        struct gfs2_quota_lvb *qlvb;
 restart:
        error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
@@ -841,11 +865,9 @@ restart:
        qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
        if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
-                loff_t pos;
                gfs2_glock_dq_uninit(q_gh);
-                error = gfs2_glock_nq_init(qd->qd_gl,
+                error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
-                                           LM_ST_EXCLUSIVE, GL_NOCACHE,
+                                           GL_NOCACHE, q_gh);
-                                           q_gh);
                if (error)
                        return error;
@@ -853,29 +875,14 @@ restart:
                if (error)
                        goto fail;
-                memset(buf, 0, sizeof(struct gfs2_quota));
+                error = update_qd(sdp, qd);
-                pos = qd2offset(qd);
+                if (error)
-                error = gfs2_internal_read(ip, NULL, buf, &pos,
-                                           sizeof(struct gfs2_quota));
-                if (error < 0)
                        goto fail_gunlock;
                gfs2_glock_dq_uninit(&i_gh);
+                gfs2_glock_dq_uninit(q_gh);
-                gfs2_quota_in(&q, buf);
+                force_refresh = 0;
-                qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+                goto restart;
-                qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
-                qlvb->__pad = 0;
-                qlvb->qb_limit = cpu_to_be64(q.qu_limit);
-                qlvb->qb_warn = cpu_to_be64(q.qu_warn);
-                qlvb->qb_value = cpu_to_be64(q.qu_value);
-                qd->qd_qb = *qlvb;
-                if (gfs2_glock_is_blocking(qd->qd_gl)) {
-                        gfs2_glock_dq_uninit(q_gh);
-                        force_refresh = 0;
-                        goto restart;
-                }
        }
        return 0;
@@ -995,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
 {
        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-        printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+        printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
               sdp->sd_fsname, type,
               (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
               qd->qd_id);
@@ -1032,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
                if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
                        print_message(qd, "exceeded");
+                        quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+                                           USRQUOTA : GRPQUOTA, qd->qd_id,
+                                           sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
                        error = -EDQUOT;
                        break;
                } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
@@ -1039,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
                           time_after_eq(jiffies, qd->qd_last_warn +
                                         gfs2_tune_get(sdp,
                                                gt_quota_warn_period) * HZ)) {
+                        quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+                                           USRQUOTA : GRPQUOTA, qd->qd_id,
+                                           sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
                        error = print_message(qd, "warning");
                        qd->qd_last_warn = jiffies;
                }
@@ -1069,8 +1083,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
        }
 }
-int gfs2_quota_sync(struct gfs2_sbd *sdp)
+int gfs2_quota_sync(struct super_block *sb, int type)
 {
+        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_quota_data **qda;
        unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
        unsigned int num_qd;
@@ -1118,7 +1133,7 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
        struct gfs2_holder q_gh;
        int error;
-        error = qd_get(sdp, user, id, CREATE, &qd);
+        error = qd_get(sdp, user, id, &qd);
        if (error)
                return error;
@@ -1127,7 +1142,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
                gfs2_glock_dq_uninit(&q_gh);
        qd_put(qd);
        return error;
 }
@@ -1298,12 +1312,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
 }
 static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
-                               int (*fxn)(struct gfs2_sbd *sdp),
+                               int (*fxn)(struct super_block *sb, int type),
                               unsigned long t, unsigned long *timeo,
                               unsigned int *new_timeo)
 {
        if (t >= *timeo) {
-                int error = fxn(sdp);
+                int error = fxn(sdp->sd_vfs, 0);
                quotad_error(sdp, msg, error);
                *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
        } else {
@@ -1330,6 +1344,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
        }
 }
+/*
+ * gfs2_wake_up_statfs - request an immediate statfs sync
+ * @sdp: the filesystem
+ *
+ * Sets sd_statfs_force_sync and wakes the waiters on sd_quota_wait.
+ * Does nothing when the flag is already set.
+ */
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp)
+{
+        if (sdp->sd_statfs_force_sync)
+                return;
+        sdp->sd_statfs_force_sync = 1;
+        wake_up(&sdp->sd_quota_wait);
+}
 /**
 * gfs2_quotad - Write cached quota changes into the quota file
 * @sdp: Pointer to GFS2 superblock
@@ -1349,8 +1371,15 @@ int gfs2_quotad(void *data)
        while (!kthread_should_stop()) {
                /* Update the master statfs file */
-                quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+                if (sdp->sd_statfs_force_sync) {
-                                   &statfs_timeo, &tune->gt_statfs_quantum);
+                        int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+                        quotad_error(sdp, "statfs", error);
+                        statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+                }
+                else
+                        quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+                                           &statfs_timeo,
+                                           &tune->gt_statfs_quantum);
                /* Update quota file */
                quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
@@ -1367,7 +1396,7 @@ int gfs2_quotad(void *data)
                spin_lock(&sdp->sd_trunc_lock);
                empty = list_empty(&sdp->sd_trunc_list);
                spin_unlock(&sdp->sd_trunc_lock);
-                if (empty)
+                if (empty && !sdp->sd_statfs_force_sync)
                        t -= schedule_timeout(t);
                else
                        t = 0;
@@ -1377,3 +1406,181 @@ int gfs2_quotad(void *data)
        return 0;
 }
+/*
+ * gfs2_quota_get_xstate - fill in the XFS-style quota state structure
+ * @sb: the superblock
+ * @fqs: structure to fill; it is zeroed first
+ *
+ * Reports enforcement/accounting flags according to the quota mount
+ * argument, describes the quota inode (when there is one) identically for
+ * user and group quota, and reports qd_lru_count as the in-core count.
+ *
+ * Returns: always 0
+ */
+static int gfs2_quota_get_xstate(struct super_block *sb,
+                                 struct fs_quota_stat *fqs)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        memset(fqs, 0, sizeof(*fqs));
+        fqs->qs_version = FS_QSTAT_VERSION;
+        switch (sdp->sd_args.ar_quota) {
+        case GFS2_QUOTA_ON:
+                fqs->qs_flags = XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD;
+                break;
+        case GFS2_QUOTA_ACCOUNT:
+                fqs->qs_flags = XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT;
+                break;
+        default:
+                /* any other setting leaves qs_flags at zero */
+                break;
+        }
+        if (sdp->sd_quota_inode) {
+                fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+                fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
+        }
+        /* extent counts are unsupported; a fixed 1 is reported */
+        fqs->qs_uquota.qfs_nextents = 1;
+        /* user and group quota share one inode, so the group entry is a copy */
+        fqs->qs_gquota = fqs->qs_uquota;
+        fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+        return 0;
+}
+/*
+ * gfs2_xquota_get - report one quota entry in XFS disk-quota format
+ * @sb: the superblock
+ * @type: USRQUOTA or GRPQUOTA
+ * @id: the user or group id
+ * @fdq: result structure; it is zeroed before anything else happens
+ *
+ * Takes the quota glock with a forced refresh and copies the hard limit,
+ * soft limit and current value out of the lock value block.
+ *
+ * Returns: 0 on success, -ESRCH when quotas are off, -EINVAL for an
+ *          unknown @type, or the error from qd_get()/do_glock()
+ */
+static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
+                           struct fs_disk_quota *fdq)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_quota_data *qd;
+        struct gfs2_quota_lvb *lvb;
+        struct gfs2_holder gh;
+        int rc;
+        memset(fdq, 0, sizeof(*fdq));
+        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+                return -ESRCH; /* Crazy XFS error code */
+        /* Translate the VFS quota type into the GFS2 one */
+        switch (type) {
+        case USRQUOTA:
+                type = QUOTA_USER;
+                break;
+        case GRPQUOTA:
+                type = QUOTA_GROUP;
+                break;
+        default:
+                return -EINVAL;
+        }
+        rc = qd_get(sdp, type, id, &qd);
+        if (rc)
+                return rc;
+        rc = do_glock(qd, FORCE, &gh);
+        if (!rc) {
+                lvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+                fdq->d_version = FS_DQUOT_VERSION;
+                if (type == QUOTA_USER)
+                        fdq->d_flags = XFS_USER_QUOTA;
+                else
+                        fdq->d_flags = XFS_GROUP_QUOTA;
+                fdq->d_id = id;
+                fdq->d_blk_hardlimit = be64_to_cpu(lvb->qb_limit);
+                fdq->d_blk_softlimit = be64_to_cpu(lvb->qb_warn);
+                fdq->d_bcount = be64_to_cpu(lvb->qb_value);
+                gfs2_glock_dq_uninit(&gh);
+        }
+        qd_put(qd);
+        return rc;
+}
+/* GFS2 only supports a subset of the XFS fields */
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+/*
+ * gfs2_xquota_set - set block soft/hard limits for one quota entry
+ * @sb: the superblock
+ * @type: USRQUOTA or GRPQUOTA; must agree with @fdq->d_flags
+ * @id: the user or group id; must equal @fdq->d_id
+ * @fdq: requested limits; only the FS_DQ_BSOFT and FS_DQ_BHARD bits of
+ *       d_fieldmask are accepted.  Bits whose requested value already
+ *       matches the current value are cleared in @fdq->d_fieldmask.
+ *
+ * Holds the quota inode's i_mutex plus exclusive glocks on the quota and
+ * on the quota inode while the entry is re-read and, if anything actually
+ * changes, rewritten inside a transaction.
+ *
+ * Returns: 0 on success (including the nothing-to-change case), -ESRCH
+ *          when quotas are off, -EINVAL for inconsistent arguments,
+ *          -ENOMEM when the allocation context cannot be obtained, or the
+ *          error from the failing helper
+ */
+static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
+                           struct fs_disk_quota *fdq)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+        struct gfs2_quota_data *qd;
+        struct gfs2_holder q_gh, i_gh;
+        unsigned int data_blocks, ind_blocks;
+        unsigned int blocks = 0;
+        int alloc_required;
+        struct gfs2_alloc *al;
+        loff_t offset;
+        int error;
+        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+                return -ESRCH; /* Crazy XFS error code */
+        switch(type) {
+        case USRQUOTA:
+                type = QUOTA_USER;
+                if (fdq->d_flags != XFS_USER_QUOTA)
+                        return -EINVAL;
+                break;
+        case GRPQUOTA:
+                type = QUOTA_GROUP;
+                if (fdq->d_flags != XFS_GROUP_QUOTA)
+                        return -EINVAL;
+                break;
+        default:
+                return -EINVAL;
+        }
+        if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
+                return -EINVAL;
+        if (fdq->d_id != id)
+                return -EINVAL;
+        error = qd_get(sdp, type, id, &qd);
+        if (error)
+                return error;
+        mutex_lock(&ip->i_inode.i_mutex);
+        error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
+        if (error)
+                goto out_put;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+        if (error)
+                goto out_q;
+        /* Check for existing entry, if none then alloc new blocks */
+        error = update_qd(sdp, qd);
+        if (error)
+                goto out_i;
+        /* If nothing has changed, this is a no-op */
+        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
+            (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+                fdq->d_fieldmask ^= FS_DQ_BSOFT;
+        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
+            (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+                fdq->d_fieldmask ^= FS_DQ_BHARD;
+        if (fdq->d_fieldmask == 0)
+                goto out_i;
+        offset = qd2offset(qd);
+        error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
+                                          &alloc_required);
+        if (error)
+                goto out_i;
+        if (alloc_required) {
+                al = gfs2_alloc_get(ip);
+                if (al == NULL) {
+                        /*
+                         * error is still 0 here; without setting it the
+                         * caller would be told the limits were stored.
+                         */
+                        error = -ENOMEM;
+                        goto out_i;
+                }
+                gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+                                       &data_blocks, &ind_blocks);
+                blocks = al->al_requested = 1 + data_blocks + ind_blocks;
+                error = gfs2_inplace_reserve(ip);
+                if (error)
+                        goto out_alloc;
+        }
+        error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
+        if (error)
+                goto out_release;
+        /* Apply changes */
+        error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+        gfs2_trans_end(sdp);
+out_release:
+        if (alloc_required) {
+                gfs2_inplace_release(ip);
+out_alloc:
+                gfs2_alloc_put(ip);
+        }
+out_i:
+        gfs2_glock_dq_uninit(&i_gh);
+out_q:
+        gfs2_glock_dq_uninit(&q_gh);
+out_put:
+        /* i_mutex was taken before the first glock, so every exit drops it */
+        mutex_unlock(&ip->i_inode.i_mutex);
+        qd_put(qd);
+        return error;
+}
+/* quotactl entry points for GFS2: quota sync plus the XFS-style state/get/set calls */
+const struct quotactl_ops gfs2_quotactl_ops = {
+        .quota_sync     = gfs2_quota_sync,
+        .get_xstate     = gfs2_quota_get_xstate,
+        .get_xquota     = gfs2_xquota_get,
+        .set_xquota     = gfs2_xquota_set,
+};
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0fa5fa63d0e8..e271fa07ad02 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,13 +25,15 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
 extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
                              u32 uid, u32 gid);
-extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_sync(struct super_block *sb, int type);
 extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
 extern int gfs2_quota_init(struct gfs2_sbd *sdp);
 extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
 extern int gfs2_quotad(void *data);
+extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
 static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -50,5 +52,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 }
 extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern const struct quotactl_ops gfs2_quotactl_ops;
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d3..4b9bece3d437 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -7,6 +7,7 @@
 * of the GNU General Public License version 2.
 */
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
@@ -409,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
        memset(lh, 0, sizeof(struct gfs2_log_header));
        lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
        lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+        lh->lh_header.__pad0 = cpu_to_be64(0);
        lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+        lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
        lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
        lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
        lh->lh_blkno = cpu_to_be32(lblock);
@@ -593,6 +596,7 @@ fail:
 }
 struct slow_work_ops gfs2_recover_ops = {
+        .owner   = THIS_MODULE,
        .get_ref = gfs2_recover_get_ref,
        .put_ref = gfs2_recover_put_ref,
        .execute = gfs2_recover_work,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8f1cfb02a6cb..0608f490c295 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1710,11 +1710,16 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 {
        struct gfs2_rgrpd *rgd;
        struct gfs2_holder ri_gh, rgd_gh;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+        int ri_locked = 0;
        int error;
-        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
-        if (error)
+                error = gfs2_rindex_hold(sdp, &ri_gh);
-                goto fail;
+                if (error)
+                        goto fail;
+                ri_locked = 1;
+        }
        error = -EINVAL;
        rgd = gfs2_blk2rgrpd(sdp, no_addr);
@@ -1730,7 +1735,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
        gfs2_glock_dq_uninit(&rgd_gh);
 fail_rindex:
-        gfs2_glock_dq_uninit(&ri_gh);
+        if (ri_locked)
+                gfs2_glock_dq_uninit(&ri_gh);
 fail:
        return error;
 }
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0ec3ec672de1..c282ad41f3d1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -70,6 +70,11 @@ enum {
        Opt_commit,
        Opt_err_withdraw,
        Opt_err_panic,
+        Opt_statfs_quantum,
+        Opt_statfs_percent,
+        Opt_quota_quantum,
+        Opt_barrier,
+        Opt_nobarrier,
        Opt_error,
 };
@@ -101,18 +106,23 @@ static const match_table_t tokens = {
        {Opt_commit, "commit=%d"},
        {Opt_err_withdraw, "errors=withdraw"},
        {Opt_err_panic, "errors=panic"},
+        {Opt_statfs_quantum, "statfs_quantum=%d"},
+        {Opt_statfs_percent, "statfs_percent=%d"},
+        {Opt_quota_quantum, "quota_quantum=%d"},
+        {Opt_barrier, "barrier"},
+        {Opt_nobarrier, "nobarrier"},
        {Opt_error, NULL}
 };
 /**
 * gfs2_mount_args - Parse mount options
- * @sdp:
+ * @args: The structure into which the parsed options will be written
- * @data:
+ * @options: The options to parse
 *
 * Return: errno
 */
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+int gfs2_mount_args(struct gfs2_args *args, char *options)
 {
        char *o;
        int token;
@@ -157,7 +167,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
                        break;
                case Opt_debug:
                        if (args->ar_errors == GFS2_ERRORS_PANIC) {
-                                fs_info(sdp, "-o debug and -o errors=panic "
+                                printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
                                       "are mutually exclusive.\n");
                                return -EINVAL;
                        }
@@ -210,7 +220,29 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
                case Opt_commit:
                        rv = match_int(&tmp[0], &args->ar_commit);
                        if (rv || args->ar_commit <= 0) {
-                                fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+                                printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
+                                return rv ? rv : -EINVAL;
+                        }
+                        break;
+                case Opt_statfs_quantum:
+                        rv = match_int(&tmp[0], &args->ar_statfs_quantum);
+                        if (rv || args->ar_statfs_quantum < 0) {
+                                printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
+                                return rv ? rv : -EINVAL;
+                        }
+                        break;
+                case Opt_quota_quantum:
+                        rv = match_int(&tmp[0], &args->ar_quota_quantum);
+                        if (rv || args->ar_quota_quantum <= 0) {
+                                printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
+                                return rv ? rv : -EINVAL;
+                        }
+                        break;
+                case Opt_statfs_percent:
+                        rv = match_int(&tmp[0], &args->ar_statfs_percent);
+                        if (rv || args->ar_statfs_percent < 0 ||
+                            args->ar_statfs_percent > 100) {
+                                printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
                                return rv ? rv : -EINVAL;
                        }
                        break;
@@ -219,15 +251,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
                        break;
                case Opt_err_panic:
                        if (args->ar_debug) {
-                                fs_info(sdp, "-o debug and -o errors=panic "
+                                printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
                                        "are mutually exclusive.\n");
                                return -EINVAL;
                        }
                        args->ar_errors = GFS2_ERRORS_PANIC;
                        break;
+                case Opt_barrier:
+                        args->ar_nobarrier = 0;
+                        break;
+                case Opt_nobarrier:
+                        args->ar_nobarrier = 1;
+                        break;
                case Opt_error:
                default:
-                        fs_info(sdp, "invalid mount option: %s\n", o);
+                        printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
                        return -EINVAL;
                }
        }
@@ -442,7 +480,10 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 {
        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
        struct buffer_head *l_bh;
+        s64 x, y;
+        int need_sync = 0;
        int error;
        error = gfs2_meta_inode_buffer(l_ip, &l_bh);
@@ -456,9 +497,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
        l_sc->sc_free += free;
        l_sc->sc_dinodes += dinodes;
        gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+        if (sdp->sd_args.ar_statfs_percent) {
+                x = 100 * l_sc->sc_free;
+                y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
+                if (x >= y || x <= -y)
+                        need_sync = 1;
+        }
        spin_unlock(&sdp->sd_statfs_spin);
        brelse(l_bh);
+        if (need_sync)
+                gfs2_wake_up_statfs(sdp);
 }
 void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
@@ -484,8 +533,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
        gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
 }
-int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+int gfs2_statfs_sync(struct super_block *sb, int type)
 {
+        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -521,6 +571,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
                goto out_bh2;
        update_statfs(sdp, m_bh, l_bh);
+        sdp->sd_statfs_force_sync = 0;
        gfs2_trans_end(sdp);
@@ -712,8 +763,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
        int error;
        flush_workqueue(gfs2_delete_workqueue);
-        gfs2_quota_sync(sdp);
+        gfs2_quota_sync(sdp->sd_vfs, 0);
-        gfs2_statfs_sync(sdp);
+        gfs2_statfs_sync(sdp->sd_vfs, 0);
        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
                                   &t_gh);
@@ -1061,8 +1112,13 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
        spin_lock(&gt->gt_spin);
        args.ar_commit = gt->gt_log_flush_secs;
+        args.ar_quota_quantum = gt->gt_quota_quantum;
+        if (gt->gt_statfs_slow)
+                args.ar_statfs_quantum = 0;
+        else
+                args.ar_statfs_quantum = gt->gt_statfs_quantum;
        spin_unlock(&gt->gt_spin);
-        error = gfs2_mount_args(sdp, &args, data);
+        error = gfs2_mount_args(&args, data);
        if (error)
                return error;
@@ -1097,8 +1153,21 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
                sb->s_flags |= MS_POSIXACL;
        else
                sb->s_flags &= ~MS_POSIXACL;
+        if (sdp->sd_args.ar_nobarrier)
+                set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+        else
+                clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
        spin_lock(&gt->gt_spin);
        gt->gt_log_flush_secs = args.ar_commit;
+        gt->gt_quota_quantum = args.ar_quota_quantum;
+        if (args.ar_statfs_quantum) {
+                gt->gt_statfs_slow = 0;
+                gt->gt_statfs_quantum = args.ar_statfs_quantum;
+        }
+        else {
+                gt->gt_statfs_slow = 1;
+                gt->gt_statfs_quantum = 30;
+        }
        spin_unlock(&gt->gt_spin);
        gfs2_online_uevent(sdp);
@@ -1179,7 +1248,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
        struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
        struct gfs2_args *args = &sdp->sd_args;
-        int lfsecs;
+        int val;
        if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
                seq_printf(s, ",meta");
@@ -1240,9 +1309,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        }
        if (args->ar_discard)
                seq_printf(s, ",discard");
-        lfsecs = sdp->sd_tune.gt_log_flush_secs;
+        val = sdp->sd_tune.gt_log_flush_secs;
-        if (lfsecs != 60)
+        if (val != 60)
-                seq_printf(s, ",commit=%d", lfsecs);
+                seq_printf(s, ",commit=%d", val);
+        val = sdp->sd_tune.gt_statfs_quantum;
+        if (val != 30)
+                seq_printf(s, ",statfs_quantum=%d", val);
+        val = sdp->sd_tune.gt_quota_quantum;
+        if (val != 60)
+                seq_printf(s, ",quota_quantum=%d", val);
+        if (args->ar_statfs_percent)
+                seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
        if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
                const char *state;
@@ -1259,6 +1336,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
                }
                seq_printf(s, ",errors=%s", state);
        }
+        if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+                seq_printf(s, ",nobarrier");
        return 0;
 }
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 235db3682885..3df60f2d84e3 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -27,7 +27,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
-extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
+extern int gfs2_mount_args(struct gfs2_args *args, char *data);
 extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
 extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
@@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
                                  const void *buf);
 extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
                          struct buffer_head *l_bh);
-extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_sync(struct super_block *sb, int type);
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 446329728d52..c5dad1eb7b91 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -158,7 +158,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
        if (simple_strtol(buf, NULL, 0) != 1)
                return -EINVAL;
-        gfs2_statfs_sync(sdp);
+        gfs2_statfs_sync(sdp->sd_vfs, 0);
        return len;
 }
@@ -171,13 +171,14 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
        if (simple_strtol(buf, NULL, 0) != 1)
                return -EINVAL;
-        gfs2_quota_sync(sdp);
+        gfs2_quota_sync(sdp->sd_vfs, 0);
        return len;
 }
 static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
                                        size_t len)
 {
+        int error;
        u32 id;
        if (!capable(CAP_SYS_ADMIN))
@@ -185,13 +186,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
        id = simple_strtoul(buf, NULL, 0);
-        gfs2_quota_refresh(sdp, 1, id);
+        error = gfs2_quota_refresh(sdp, 1, id);
-        return len;
+        return error ? error : len;
 }
 static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
                                         size_t len)
 {
+        int error;
        u32 id;
        if (!capable(CAP_SYS_ADMIN))
@@ -199,8 +201,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
        id = simple_strtoul(buf, NULL, 0);
-        gfs2_quota_refresh(sdp, 0, id);
+        error = gfs2_quota_refresh(sdp, 0, id);
-        return len;
+        return error ? error : len;
 }
 static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a0f8ef6ee27..912f5cbc4740 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -186,8 +186,8 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
        return 0;
 }
-int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-                 struct gfs2_ea_location *el)
+                        struct gfs2_ea_location *el)
 {
        struct ea_find ef;
        int error;
@@ -516,8 +516,8 @@ out:
        return error;
 }
-int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-                     char *data, size_t size)
+                            char *data, size_t size)
 {
        int ret;
        size_t len = GFS2_EA_DATA_LEN(el->el_ea);
@@ -534,6 +534,36 @@ int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
        return len;
 }
+/*
+ * gfs2_xattr_acl_get - read a system xattr into a freshly allocated buffer
+ * @ip: the inode
+ * @name: name of the GFS2_EATYPE_SYS attribute to look up
+ * @ppdata: on success with a positive return, set to a kmalloc()ed copy of
+ *          the attribute data which the caller must kfree()
+ *
+ * Returns: the data length (> 0) when *@ppdata has been filled in, 0 when
+ *          the attribute does not exist or is empty, or a negative errno.
+ *          *@ppdata is left untouched unless a positive value is returned.
+ */
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
+{
+        struct gfs2_ea_location el;
+        int error;
+        int len;
+        char *data;
+        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
+        if (error)
+                return error;
+        if (!el.el_ea)
+                goto out;
+        if (!GFS2_EA_DATA_LEN(el.el_ea))
+                goto out;
+        len = GFS2_EA_DATA_LEN(el.el_ea);
+        data = kmalloc(len, GFP_NOFS);
+        error = -ENOMEM;
+        if (data == NULL)
+                goto out;
+        error = gfs2_ea_get_copy(ip, &el, data, len);
+        if (error < 0) {
+                /*
+                 * Do not hand a half-filled buffer to the caller together
+                 * with an error code: nobody would free it.
+                 */
+                kfree(data);
+                goto out;
+        }
+        if (error == 0)
+                error = len;
+        *ppdata = data;
+out:
+        brelse(el.el_bh);
+        return error;
+}
 /**
 * gfs2_xattr_get - Get a GFS2 extended attribute
 * @inode: The inode
@@ -1259,22 +1289,26 @@ fail:
        return error;
 }
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
-                      struct iattr *attr, char *data)
 {
+        struct gfs2_ea_location el;
        struct buffer_head *dibh;
        int error;
-        if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
+        if (error)
+                return error;
+        if (GFS2_EA_IS_STUFFED(el.el_ea)) {
                error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
                if (error)
                        return error;
-                gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+                gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
-                memcpy(GFS2_EA2DATA(el->el_ea), data,
+                memcpy(GFS2_EA2DATA(el.el_ea), data,
-                       GFS2_EA_DATA_LEN(el->el_ea));
+                       GFS2_EA_DATA_LEN(el.el_ea));
        } else
-                error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+                error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
        if (error)
                return error;
@@ -1507,18 +1541,6 @@ static int gfs2_xattr_user_set(struct inode *inode, const char *name,
        return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
 }
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
-                                 void *buffer, size_t size)
-{
-        return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
-}
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
-                                 const void *value, size_t size, int flags)
-{
-        return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
-}
 static int gfs2_xattr_security_get(struct inode *inode, const char *name,
                                   void *buffer, size_t size)
 {
@@ -1543,12 +1565,6 @@ static struct xattr_handler gfs2_xattr_security_handler = {
        .set    = gfs2_xattr_security_set,
 };
-static struct xattr_handler gfs2_xattr_system_handler = {
-        .prefix = XATTR_SYSTEM_PREFIX,
-        .get    = gfs2_xattr_system_get,
-        .set    = gfs2_xattr_system_set,
-};
 struct xattr_handler *gfs2_xattr_handlers[] = {
        &gfs2_xattr_user_handler,
        &gfs2_xattr_security_handler,
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index cbdfd7743733..8d6ae5813c4d 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,11 +62,7 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
 /* Exported to acl.c */
-extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
-                        struct gfs2_ea_location *el);
+extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
-extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-                            char *data, size_t size);
-extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-                             struct iattr *attr, char *data);
 #endif /* __EATTR_DOT_H__ */
diff --git a/fs/inode.c b/fs/inode.c
index 4d8e3be55976..06c1f02de611 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,7 +18,6 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
@@ -157,11 +156,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        if (security_inode_alloc(inode))
                goto out;
-        /* allocate and initialize an i_integrity */
-        if (ima_inode_alloc(inode))
-                goto out_free_security;
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -201,9 +195,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 #endif
        return 0;
-out_free_security:
-        security_inode_free(inode);
 out:
        return -ENOMEM;
 }
@@ -235,7 +226,6 @@ static struct inode *alloc_inode(struct super_block *sb)
 void __destroy_inode(struct inode *inode)
 {
        BUG_ON(inode_has_buffers(inode));
-        ima_inode_free(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
 #ifdef CONFIG_FS_POSIX_ACL
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d2779e..8896c1d4febe 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
+                if (flags < 0) {
+                        jbd2_journal_abort(journal, flags);
+                        continue;
+                }
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fed85388ee86..b7ca3a92a4db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -358,6 +359,10 @@ repeat:
                jbd_unlock_bh_state(bh_in);
                tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+                if (!tmp) {
+                        jbd2_journal_put_journal_head(new_jh);
+                        return -ENOMEM;
+                }
                jbd_lock_bh_state(bh_in);
                if (jh_in->b_frozen_data) {
                        jbd2_free(tmp, bh_in->b_size);
@@ -1248,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal)
        if (jbd2_journal_recover(journal))
                goto recovery_error;
+        if (journal->j_failed_commit) {
+                printk(KERN_ERR "JBD2: journal transaction %u on %s "
+                       "is corrupt.\n", journal->j_failed_commit,
+                       journal->j_devname);
+                return -EIO;
+        }
        /* OK, we've finished with the dynamic journal bits:
         * reinitialise the dynamic contents of the superblock in memory
         * and reset them on disk. */
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f25e70c1b51c..f0294410868d 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -177,7 +177,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                spin_unlock(&jffs2_compressor_list_lock);
                break;
        default:
-                printk(KERN_ERR "JFFS2: unknow compression mode.\n");
+                printk(KERN_ERR "JFFS2: unknown compression mode.\n");
        }
 out:
        if (ret == JFFS2_COMPR_NONE) {
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index cfe05c1966a5..3f39be1b0455 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -164,12 +164,15 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
        /* XXX FIXME: Where a single physical node actually shows up in two
           frags, we read it twice. Don't do that. */
-        /* Now we're pointing at the first frag which overlaps our page */
+        /* Now we're pointing at the first frag which overlaps our page
+         * (or perhaps is before it, if we've been asked to read off the
+         * end of the file). */
        while(offset < end) {
                D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end));
-                if (unlikely(!frag || frag->ofs > offset)) {
+                if (unlikely(!frag || frag->ofs > offset ||
+                             frag->ofs + frag->size <= offset)) {
                        uint32_t holesize = end - offset;
-                        if (frag) {
+                        if (frag && frag->ofs > offset) {
                                D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset));
                                holesize = min(holesize, frag->ofs - offset);
                        }
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1a80301004b8..378991cfe40f 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -931,7 +931,7 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
 * Helper function for jffs2_get_inode_nodes().
 * The function detects whether more data should be read and reads it if yes.
 *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
 *          negative error code on failure.
 */
 static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 082e844ab2db..4b107881acd5 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
 *   is used to release xattr name/value pair and detach from c->xattrindex.
 * reclaim_xattr_datum(c)
 *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
- *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 
 *   is hard coded as 32KiB.
 * do_verify_xattr_datum(c, xd)
 *   is used to load the xdatum informations without name/value pair from the medium.
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2bc7d8aa5740..d9b031cf69f5 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -755,7 +755,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
         * allocation group.
         */
        if ((blkno & (bmp->db_agsize - 1)) == 0)
-                /* check if the AG is currenly being written to.
+                /* check if the AG is currently being written to.
                 * if so, call dbNextAG() to find a non-busy
                 * AG with sufficient free space.
                 */
@@ -3337,7 +3337,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
        for (i = 0, n = 0; i < agno; n++) {
                bmp->db_agfree[n] = 0;  /* init collection point */
-                /* coalesce cotiguous k AGs; */
+                /* coalesce contiguous k AGs; */
                for (j = 0; j < k && i < agno; j++, i++) {
                        /* merge AGi to AGn */
                        bmp->db_agfree[n] += bmp->db_agfree[i];
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1a54ae14a192..e50cfa3d9654 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -371,82 +371,74 @@ EXPORT_SYMBOL_GPL(lockd_down);
 static ctl_table nlm_sysctls[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_grace_period",
                .data           = &nlm_grace_period,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
                .extra1         = (unsigned long *) &nlm_grace_period_min,
                .extra2         = (unsigned long *) &nlm_grace_period_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_timeout",
                .data           = &nlm_timeout,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
                .extra1         = (unsigned long *) &nlm_timeout_min,
                .extra2         = (unsigned long *) &nlm_timeout_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_udpport",
                .data           = &nlm_udpport,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = (int *) &nlm_port_min,
                .extra2         = (int *) &nlm_port_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_tcpport",
                .data           = &nlm_tcpport,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = (int *) &nlm_port_min,
                .extra2         = (int *) &nlm_port_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nsm_use_hostnames",
                .data           = &nsm_use_hostnames,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nsm_local_state",
                .data           = &nsm_local_state,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nlm_sysctl_dir[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nfs",
                .mode           = 0555,
                .child          = nlm_sysctls,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nlm_sysctl_root[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = nlm_sysctl_dir,
        },
-        { .ctl_name = 0 }
+        { }
 };
 #endif  /* CONFIG_SYSCTL */
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4fd222..7d70d63ceb29 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1921,6 +1921,16 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;
+        /* ... and get the mountpoint */
+        retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
+        if (retval)
+                return retval;
+        retval = security_sb_mount(dev_name, &path,
+                                   type_page, flags, data_page);
+        if (retval)
+                goto dput_out;
        /* Default to relatime unless overriden */
        if (!(flags & MS_NOATIME))
                mnt_flags |= MNT_RELATIME;
@@ -1945,16 +1955,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
                   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
                   MS_STRICTATIME);
-        /* ... and get the mountpoint */
-        retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
-        if (retval)
-                return retval;
-        retval = security_sb_mount(dev_name, &path,
-                                   type_page, flags, data_page);
-        if (retval)
-                goto dput_out;
        if (flags & MS_REMOUNT)
                retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
                                    data_page);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0d58caf4a6e1..ec8f45f12e05 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -835,7 +835,7 @@ static int ncp_ioctl_need_write(unsigned int cmd)
        case NCP_IOC_SETROOT:
                return 0;
        default:
-                /* unkown IOCTL command, assume write */
+                /* unknown IOCTL command, assume write */
                return 1;
        }
 }
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 70fad69eb959..fa588006588d 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -359,17 +359,13 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
        BUG_ON(!cookie);
-        if (fscache_check_page_write(cookie, page)) {
-                if (!(gfp & __GFP_WAIT))
-                        return 0;
-                fscache_wait_on_page_write(cookie, page);
-        }
        if (PageFsCache(page)) {
                dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
                         cookie, page, nfsi);
-                fscache_uncache_page(cookie, page);
+                if (!fscache_maybe_release_page(cookie, page, gfp))
+                        return 0;
                nfs_add_fscache_stats(page->mapping->host,
                                      NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
        }
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index b62481dabae9..70e1fbbaaeab 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -22,63 +22,55 @@ static struct ctl_table_header *nfs_callback_sysctl_table;
 static ctl_table nfs_cb_sysctls[] = {
 #ifdef CONFIG_NFS_V4
        {
-                .ctl_name = CTL_UNNUMBERED,
                .procname = "nfs_callback_tcpport",
                .data = &nfs_callback_set_tcpport,
                .maxlen = sizeof(int),
                .mode = 0644,
-                .proc_handler = &proc_dointvec_minmax,
+                .proc_handler = proc_dointvec_minmax,
                .extra1 = (int *)&nfs_set_port_min,
                .extra2 = (int *)&nfs_set_port_max,
        },
        {
-                .ctl_name = CTL_UNNUMBERED,
                .procname = "idmap_cache_timeout",
                .data = &nfs_idmap_cache_timeout,
                .maxlen = sizeof(int),
                .mode = 0644,
-                .proc_handler = &proc_dointvec_jiffies,
+                .proc_handler = proc_dointvec_jiffies,
-                .strategy = &sysctl_jiffies,
        },
 #endif
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nfs_mountpoint_timeout",
                .data           = &nfs_mountpoint_expiry_timeout,
                .maxlen         = sizeof(nfs_mountpoint_expiry_timeout),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_jiffies,
+                .proc_handler   = proc_dointvec_jiffies,
-                .strategy       = &sysctl_jiffies,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nfs_congestion_kb",
                .data           = &nfs_congestion_kb,
                .maxlen         = sizeof(nfs_congestion_kb),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nfs_cb_sysctl_dir[] = {
        {
-                .ctl_name = CTL_UNNUMBERED,
                .procname = "nfs",
                .mode = 0555,
                .child = nfs_cb_sysctls,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nfs_cb_sysctl_root[] = {
        {
-                .ctl_name = CTL_FS,
                .procname = "fs",
                .mode = 0555,
                .child = nfs_cb_sysctl_dir,
        },
-        { .ctl_name = 0 }
+        { }
 };
 int nfs_register_sysctl(void)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..c84b5cc1a943 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
 {
        if (wbc->for_reclaim)
                return FLUSH_HIGHPRI | FLUSH_STABLE;
-        if (wbc->for_kupdate)
+        if (wbc->for_kupdate || wbc->for_background)
                return FLUSH_LOWPRI;
        return 0;
 }
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dcd2040d330c..5ef5f365a5c8 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -69,36 +69,30 @@ static int zero;
 ctl_table inotify_table[] = {
        {
-                .ctl_name       = INOTIFY_MAX_USER_INSTANCES,
                .procname       = "max_user_instances",
                .data           = &inotify_max_user_instances,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
        {
-                .ctl_name       = INOTIFY_MAX_USER_WATCHES,
                .procname       = "max_user_watches",
                .data           = &inotify_max_user_watches,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
        {
-                .ctl_name       = INOTIFY_MAX_QUEUED_EVENTS,
                .procname       = "max_queued_events",
                .data           = &inotify_max_queued_events,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero
        },
-        { .ctl_name = 0 }
+        { }
 };
 #endif /* CONFIG_SYSCTL */
@@ -747,10 +741,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
        /* create/update an inode mark */
        ret = inotify_update_watch(group, inode, mask);
-        if (unlikely(ret))
-                goto path_put_and_out;
-path_put_and_out:
        path_put(&path);
 fput_and_out:
        fput_light(filp, fput_needed);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 9669541d0119..08f7530e9341 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -927,7 +927,7 @@ lock_retry_remap:
                return 0;
        ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
-                        "EOVERFLOW" : (!err ? "EIO" : "unkown error"));
+                        "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
        return err < 0 ? err : -EIO;
 read_err:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 663c0e341f8b..43179ddd336f 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -399,7 +399,7 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
 * @cached_page: allocated but as yet unused page
 * @lru_pvec:   lru-buffering pagevec of caller
 *
- * Obtain @nr_pages locked page cache pages from the mapping @maping and
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping and
 * starting at index @index.
 *
 * If a page is newly created, increment its refcount and add it to the
@@ -1281,7 +1281,7 @@ rl_not_mapped_enoent:
 /*
 * Copy as much as we can into the pages and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the pages
+ * were successfully copied.  If a fault is encountered then clear the pages
 * out to (ofs + bytes) and return the number of bytes which were copied.
 */
 static inline size_t ntfs_copy_from_user(struct page **pages,
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 89b02985c054..4dadcdf3d451 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -338,7 +338,7 @@ err_out:
 * copy of the complete multi sector transfer deprotected page.  On failure,
 * *@wrp is undefined.
 *
- * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current
+ * Similarly, if @lsn is not NULL, on success *@lsn will be set to the current
 * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
 *
 * The following error codes are defined:
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 9ef85e628fe1..79a89184cb5e 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -36,12 +36,11 @@
 /* Definition of the ntfs sysctl. */
 static ctl_table ntfs_sysctls[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,       /* Binary and text IDs. */
                .procname       = "ntfs-debug",
                .data           = &debug_msgs,          /* Data pointer and size. */
                .maxlen         = sizeof(debug_msgs),
                .mode           = 0644,                 /* Mode, proc handler. */
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {}
 };
@@ -49,7 +48,6 @@ static ctl_table ntfs_sysctls[] = {
 /* Define the parent directory /proc/sys/fs. */
 static ctl_table sysctls_root[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = ntfs_sysctls
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..7c7198a5bc90 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2398,7 +2398,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
 *
 * The array is assumed to be large enough to hold an entire path (tree depth).
 *
- * Upon succesful return from this function:
+ * Upon successful return from this function:
 *
 * - The 'right_path' array will contain a path to the leaf block
 *   whose range contains e_cpos.
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index a1163b8b417c..b7428c5d0d3b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -47,7 +47,7 @@
 * Calculate the bit offset in the hamming code buffer based on the bit's
 * offset in the data buffer.  Since the hamming code reserves all
 * power-of-two bits for parity, the data bit number and the code bit
- * number are offest by all the parity bits beforehand.
+ * number are offset by all the parity bits beforehand.
 *
 * Recall that bit numbers in hamming code are 1-based.  This function
 * takes the 0-based data bit from the caller.
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index da794bc07a6c..a3f150e52b02 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -294,10 +294,10 @@ static int sc_seq_show(struct seq_file *seq, void *v)
                if (sc->sc_sock) {
                        inet = inet_sk(sc->sc_sock->sk);
                        /* the stack's structs aren't sparse endian clean */
-                        saddr = (__force __be32)inet->saddr;
+                        saddr = (__force __be32)inet->inet_saddr;
-                        daddr = (__force __be32)inet->daddr;
+                        daddr = (__force __be32)inet->inet_daddr;
-                        sport = (__force __be16)inet->sport;
+                        sport = (__force __be16)inet->inet_sport;
-                        dport = (__force __be16)inet->dport;
+                        dport = (__force __be16)inet->inet_dport;
                }
                /* XXX sigh, inet-> doesn't have sparse annotation so any
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..03ccf9a7b1f4 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2586,7 +2586,7 @@ fail:
         * is complete everywhere.  if the target dies while this is
         * going on, some nodes could potentially see the target as the
         * master, so it is important that my recovery finds the migration
-         * mle and sets the master to UNKNONWN. */
+         * mle and sets the master to UNKNOWN. */
        /* wait for new node to assert master */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194cb..c5e4a49e3a12 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1855,7 +1855,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
                 * outstanding lock request, so a cancel convert is
                 * required. We intentionally overwrite 'ret' - if the
                 * cancel fails and the lock was granted, it's easier
-                 * to just bubble sucess back up to the user.
+                 * to just bubble success back up to the user.
                 */
                ret = ocfs2_flock_handle_signal(lockres, level);
        } else if (!ret && (level > lockres->l_level)) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327e..bf34c491ae96 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
        default:
                status = -EINVAL;
-                mlog(ML_ERROR, "Uknown access type!\n");
+                mlog(ML_ERROR, "Unknown access type!\n");
        }
        if (!status && ocfs2_meta_ecc(osb) && triggers)
                jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3a0df7a1b810..30967e3f5e43 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2431,7 +2431,7 @@ out:
 * we gonna touch and whether we need to create new blocks.
 *
 * Normally the refcount blocks store these refcount should be
- * continguous also, so that we can get the number easily.
+ * contiguous also, so that we can get the number easily.
 * As for meta_ac, we will at most add split 2 refcount record and
 * 2 more refcount block, so just check it in a rough way.
 *
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 3f2f1c45b7b6..f3df0baa9a48 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -620,51 +620,46 @@ error:
 static ctl_table ocfs2_nm_table[] = {
        {
-                .ctl_name       = 1,
                .procname       = "hb_ctl_path",
                .data           = ocfs2_hb_ctl_path,
                .maxlen         = OCFS2_MAX_HB_CTL_PATH,
                .mode           = 0644,
-                .proc_handler   = &proc_dostring,
+                .proc_handler   = proc_dostring,
-                .strategy       = &sysctl_string,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table ocfs2_mod_table[] = {
        {
-                .ctl_name       = FS_OCFS2_NM,
                .procname       = "nm",
                .data           = NULL,
                .maxlen         = 0,
                .mode           = 0555,
                .child          = ocfs2_nm_table
        },
-        { .ctl_name = 0}
+        { }
 };
 static ctl_table ocfs2_kern_table[] = {
        {
-                .ctl_name       = FS_OCFS2,
                .procname       = "ocfs2",
                .data           = NULL,
                .maxlen         = 0,
                .mode           = 0555,
                .child          = ocfs2_mod_table
        },
-        { .ctl_name = 0}
+        { }
 };
 static ctl_table ocfs2_root_table[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .data           = NULL,
                .maxlen         = 0,
                .mode           = 0555,
                .child          = ocfs2_kern_table
        },
-        { .ctl_name = 0 }
+        { }
 };
 static struct ctl_table_header *ocfs2_table_header = NULL;
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index e1c0ec0ae989..082234581d05 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -85,7 +85,7 @@ out:
 }
 /*
- * Tries to allocate exactly one block.  Returns true if sucessful.
+ * Tries to allocate exactly one block.  Returns true if successful.
 */
 int omfs_allocate_block(struct super_block *sb, u64 block)
 {
diff --git a/fs/open.c b/fs/open.c
index 4f01e06227c6..b4b31d277f3a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
        error = -EPERM;
        if (!capable(CAP_SYS_CHROOT))
                goto dput_and_out;
+        error = security_path_chroot(&path);
+        if (error)
+                goto dput_and_out;
        set_fs_root(current->fs, &path);
        error = 0;
@@ -617,11 +620,15 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
        if (err)
                goto out_putf;
        mutex_lock(&inode->i_mutex);
+        err = security_path_chmod(dentry, file->f_vfsmnt, mode);
+        if (err)
+                goto out_unlock;
        if (mode == (mode_t) -1)
                mode = inode->i_mode;
        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
        err = notify_change(dentry, &newattrs);
+out_unlock:
        mutex_unlock(&inode->i_mutex);
        mnt_drop_write(file->f_path.mnt);
 out_putf:
@@ -646,11 +653,15 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
        if (error)
                goto dput_and_out;
        mutex_lock(&inode->i_mutex);
+        error = security_path_chmod(path.dentry, path.mnt, mode);
+        if (error)
+                goto out_unlock;
        if (mode == (mode_t) -1)
                mode = inode->i_mode;
        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
        error = notify_change(path.dentry, &newattrs);
+out_unlock:
        mutex_unlock(&inode->i_mutex);
        mnt_drop_write(path.mnt);
 dput_and_out:
@@ -664,9 +675,9 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
        return sys_fchmodat(AT_FDCWD, filename, mode);
 }
-static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
+static int chown_common(struct path *path, uid_t user, gid_t group)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = path->dentry->d_inode;
        int error;
        struct iattr newattrs;
@@ -683,7 +694,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
                newattrs.ia_valid |=
                        ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
        mutex_lock(&inode->i_mutex);
-        error = notify_change(dentry, &newattrs);
+        error = security_path_chown(path, user, group);
+        if (!error)
+                error = notify_change(path->dentry, &newattrs);
        mutex_unlock(&inode->i_mutex);
        return error;
@@ -700,7 +713,7 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
        error = mnt_want_write(path.mnt);
        if (error)
                goto out_release;
-        error = chown_common(path.dentry, user, group);
+        error = chown_common(&path, user, group);
        mnt_drop_write(path.mnt);
 out_release:
        path_put(&path);
@@ -725,7 +738,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
        error = mnt_want_write(path.mnt);
        if (error)
                goto out_release;
-        error = chown_common(path.dentry, user, group);
+        error = chown_common(&path, user, group);
        mnt_drop_write(path.mnt);
 out_release:
        path_put(&path);
@@ -744,7 +757,7 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
        error = mnt_want_write(path.mnt);
        if (error)
                goto out_release;
-        error = chown_common(path.dentry, user, group);
+        error = chown_common(&path, user, group);
        mnt_drop_write(path.mnt);
 out_release:
        path_put(&path);
@@ -767,7 +780,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
                goto out_fput;
        dentry = file->f_path.dentry;
        audit_inode(NULL, dentry);
-        error = chown_common(dentry, user, group);
+        error = chown_common(&file->f_path, user, group);
        mnt_drop_write(file->f_path.mnt);
 out_fput:
        fput(file);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..64bc8998ac9a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
        return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
 }
+/*
+ * Show handler registered through DEVICE_ATTR(discard_alignment, ...).
+ * Formats the discard_alignment field of the partition behind @dev into
+ * @buf as an unsigned decimal plus newline; returns sprintf()'s result.
+ */
+ssize_t part_discard_alignment_show(struct device *dev,
+                                   struct device_attribute *attr, char *buf)
+{
+        return sprintf(buf, "%u\n", dev_to_part(dev)->discard_alignment);
+}
 ssize_t part_stat_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
 {
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+                   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
        &dev_attr_start.attr,
        &dev_attr_size.attr,
        &dev_attr_alignment_offset.attr,
+        &dev_attr_discard_alignment.attr,
        &dev_attr_stat.attr,
        &dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        p->start_sect = start;
        p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
+        p->discard_alignment = queue_sector_discard_alignment(disk->queue,
+                                                              start);
        p->nr_sects = len;
        p->partno = partno;
        p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152f..49cfd5f54238 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
 /************************************************************
 * EFI GUID Partition Table handling
- * Per Intel EFI Specification v1.02
+ *
- * http://developer.intel.com/technology/efi/efi.htm
+ * http://www.uefi.org/specs/
+ * http://www.intel.com/technology/efi/
+ *
 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
 *   Copyright 2000,2001,2002,2004 Dell Inc.
 *
@@ -92,6 +94,7 @@
 *
 ************************************************************/
 #include <linux/crc32.h>
+#include <linux/math64.h>
 #include "check.h"
 #include "efi.h"
@@ -141,7 +144,8 @@ last_lba(struct block_device *bdev)
 {
        if (!bdev || !bdev->bd_inode)
                return 0;
-        return (bdev->bd_inode->i_size >> 9) - 1ULL;
+        return div_u64(bdev->bd_inode->i_size,
+                       bdev_logical_block_size(bdev)) - 1ULL;
 }
 static inline int
@@ -188,6 +192,7 @@ static size_t
 read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 {
        size_t totalreadcount = 0;
+        sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
        if (!bdev || !buffer || lba > last_lba(bdev))
                return 0;
@@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
        while (count) {
                int copied = 512;
                Sector sect;
-                unsigned char *data = read_dev_sector(bdev, lba++, &sect);
+                unsigned char *data = read_dev_sector(bdev, n++, &sect);
                if (!data)
                        break;
                if (copied > count)
@@ -257,15 +262,16 @@ static gpt_header *
 alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 {
        gpt_header *gpt;
+        unsigned ssz = bdev_logical_block_size(bdev);
        if (!bdev)
                return NULL;
-        gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
+        gpt = kzalloc(ssz, GFP_KERNEL);
        if (!gpt)
                return NULL;
-        if (read_lba(bdev, lba, (u8 *) gpt,
+        if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
-                     sizeof (gpt_header)) < sizeof (gpt_header)) {
                kfree(gpt);
                gpt=NULL;
                return NULL;
@@ -601,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
        gpt_header *gpt = NULL;
        gpt_entry *ptes = NULL;
        u32 i;
+        unsigned ssz = bdev_logical_block_size(bdev) / 512;
        if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
                kfree(gpt);
@@ -611,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
        pr_debug("GUID Partition Table is valid!  Yea!\n");
        for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+                u64 start = le64_to_cpu(ptes[i].starting_lba);
+                u64 size = le64_to_cpu(ptes[i].ending_lba) -
+                           le64_to_cpu(ptes[i].starting_lba) + 1ULL;
                if (!is_pte_valid(&ptes[i], last_lba(bdev)))
                        continue;
-                put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba),
+                put_partition(state, i+1, start * ssz, size * ssz);
-                                 (le64_to_cpu(ptes[i].ending_lba) -
-                                  le64_to_cpu(ptes[i].starting_lba) +
-                                  1ULL));
                /* If this is a RAID volume, tell md */
                if (!efi_guidcmp(ptes[i].partition_type_guid,
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475bf..6998b589abf9 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
 #define EFI_PMBR_OSTYPE_EFI 0xEF
 #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
-#define GPT_BLOCK_SIZE 512
 #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
 #define GPT_HEADER_REVISION_V1 0x00010000
 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
        __le32 num_partition_entries;
        __le32 sizeof_partition_entry;
        __le32 partition_entry_array_crc32;
-        u8 reserved2[GPT_BLOCK_SIZE - 92];
+        /* The rest of the logical block is reserved by UEFI and must be zero.
+         * EFI standard handles this by:
+         *
+         * uint8_t              reserved2[ BlockSize - 92 ];
+         */
 } __attribute__ ((packed)) gpt_header;
 typedef struct _gpt_entry_attributes {
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 822c2d506518..4badde179b18 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -410,6 +410,16 @@ static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
 }
 #endif          /* CONFIG_MMU */
+/*
+ * Write the task's cpus_allowed set to @m twice: once through
+ * seq_cpumask() on a "Cpus_allowed:" line and once through
+ * seq_cpumask_list() on a "Cpus_allowed_list:" line.
+ */
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+        /* Constant strings carry no conversions, so skip format parsing. */
+        seq_puts(m, "Cpus_allowed:\t");
+        seq_cpumask(m, &task->cpus_allowed);
+        seq_putc(m, '\n');
+        seq_puts(m, "Cpus_allowed_list:\t");
+        seq_cpumask_list(m, &task->cpus_allowed);
+        seq_putc(m, '\n');
+}
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
 {
@@ -424,6 +434,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
        }
        task_sig(m, task);
        task_cap(m, task);
+        task_cpus_allowed(m, task);
        cpuset_task_status_allowed(m, task);
 #if defined(CONFIG_S390)
        task_show_regs(m, task);
@@ -495,20 +506,17 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                /* add up live thread stats at the group level */
                if (whole) {
-                        struct task_cputime cputime;
                        struct task_struct *t = task;
                        do {
                                min_flt += t->min_flt;
                                maj_flt += t->maj_flt;
-                                gtime = cputime_add(gtime, task_gtime(t));
+                                gtime = cputime_add(gtime, t->gtime);
                                t = next_thread(t);
                        } while (t != task);
                        min_flt += sig->min_flt;
                        maj_flt += sig->maj_flt;
-                        thread_group_cputime(task, &cputime);
+                        thread_group_times(task, &utime, &stime);
-                        utime = cputime.utime;
-                        stime = cputime.stime;
                        gtime = cputime_add(gtime, sig->gtime);
                }
@@ -524,9 +532,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
        if (!whole) {
                min_flt = task->min_flt;
                maj_flt = task->maj_flt;
-                utime = task_utime(task);
+                task_times(task, &utime, &stime);
-                stime = task_stime(task);
+                gtime = task->gtime;
-                gtime = task_gtime(task);
        }
        /* scale priority and nice values from timeslices to -20..20 */
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8aeabdf..6ff9981f0a18 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -48,7 +48,7 @@ out:
 static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
 {
        int len;
-        for ( ; p->ctl_name || p->procname; p++) {
+        for ( ; p->procname; p++) {
                if (!p->procname)
                        continue;
@@ -218,7 +218,7 @@ static int scan(struct ctl_table_header *head, ctl_table *table,
                void *dirent, filldir_t filldir)
 {
-        for (; table->ctl_name || table->procname; table++, (*pos)++) {
+        for (; table->procname; table++, (*pos)++) {
                int res;
                /* Can't do anything without a proc name */
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..b9b7aad2003d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
        int i, j;
        unsigned long jif;
        cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-        cputime64_t guest;
+        cputime64_t guest, guest_nice;
        u64 sum = 0;
        u64 sum_softirq = 0;
        unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
        user = nice = system = idle = iowait =
                irq = softirq = steal = cputime64_zero;
-        guest = cputime64_zero;
+        guest = guest_nice = cputime64_zero;
        getboottime(&boottime);
        jif = boottime.tv_sec;
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
                softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
                steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
                guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+                guest_nice = cputime64_add(guest_nice,
+                        kstat_cpu(i).cpustat.guest_nice);
                for_each_irq_nr(j) {
                        sum += kstat_irqs_cpu(j, i);
                }
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
        }
        sum += arch_irq_stat();
-        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+                "%llu\n",
                (unsigned long long)cputime64_to_clock_t(user),
                (unsigned long long)cputime64_to_clock_t(nice),
                (unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
                (unsigned long long)cputime64_to_clock_t(irq),
                (unsigned long long)cputime64_to_clock_t(softirq),
                (unsigned long long)cputime64_to_clock_t(steal),
-                (unsigned long long)cputime64_to_clock_t(guest));
+                (unsigned long long)cputime64_to_clock_t(guest),
+                (unsigned long long)cputime64_to_clock_t(guest_nice));
        for_each_online_cpu(i) {
                /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
                softirq = kstat_cpu(i).cpustat.softirq;
                steal = kstat_cpu(i).cpustat.steal;
                guest = kstat_cpu(i).cpustat.guest;
+                guest_nice = kstat_cpu(i).cpustat.guest_nice;
                seq_printf(p,
-                        "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+                        "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+                        "%llu\n",
                        i,
                        (unsigned long long)cputime64_to_clock_t(user),
                        (unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
                        (unsigned long long)cputime64_to_clock_t(irq),
                        (unsigned long long)cputime64_to_clock_t(softirq),
                        (unsigned long long)cputime64_to_clock_t(steal),
-                        (unsigned long long)cputime64_to_clock_t(guest));
+                        (unsigned long long)cputime64_to_clock_t(guest),
+                        (unsigned long long)cputime64_to_clock_t(guest_nice));
        }
        seq_printf(p, "intr %llu", (unsigned long long)sum);
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 0afba069d567..32f5d131a644 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -67,7 +67,7 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
        while (total < size) {
                if ((bh = sb_bread(sb, start + offset)) == NULL) {
-                        printk("qnx4: I/O error in counting free blocks\n");
+                        printk(KERN_ERR "qnx4: I/O error in counting free blocks\n");
                        break;
                }
                count_bits(bh->b_data, size - total, &total_free);
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 86cc39cb1398..6f30c3d5bcbf 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -26,8 +26,8 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
        int ix, ino;
        int size;
-        QNX4DEBUG(("qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
+        QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
-        QNX4DEBUG(("filp->f_pos         = %ld\n", (long) filp->f_pos));
+        QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
        lock_kernel();
@@ -50,7 +50,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
                                        size = QNX4_NAME_MAX;
                                if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
-                                        QNX4DEBUG(("qnx4_readdir:%.*s\n", size, de->di_fname));
+                                        QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
                                        if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
                                                ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
                                        else {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d2cd1798d8c4..449f5a66dd34 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -107,7 +107,7 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
 {
        unsigned long phys;
-        QNX4DEBUG(("qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
+        QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
        phys = qnx4_block_map( inode, iblock );
        if ( phys ) {
@@ -142,12 +142,12 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
                                // read next xtnt block.
                                bh = sb_bread(inode->i_sb, i_xblk - 1);
                                if ( !bh ) {
-                                        QNX4DEBUG(("qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
+                                        QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
                                        return -EIO;
                                }
                                xblk = (struct qnx4_xblk*)bh->b_data;
                                if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) {
-                                        QNX4DEBUG(("qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
+                                        QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
                                        return -EIO;
                                }
                        }
@@ -168,7 +168,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
                        brelse( bh );
        }
-        QNX4DEBUG(("qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
+        QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
        return block;
 }
@@ -209,7 +209,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
        if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
                return "no qnx4 filesystem (no root dir).";
        } else {
-                QNX4DEBUG(("QNX4 filesystem found on dev %s.\n", sb->s_id));
+                QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
                rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
                rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
                for (j = 0; j < rl; j++) {
@@ -220,7 +220,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
                        for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
                                rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
                                if (rootdir->di_fname != NULL) {
-                                        QNX4DEBUG(("Rootdir entry found : [%s]\n", rootdir->di_fname));
+                                        QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
                                        if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
                                                found = 1;
                                                qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
@@ -265,12 +265,12 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
           if we don't belong here... */
        bh = sb_bread(s, 1);
        if (!bh) {
-                printk("qnx4: unable to read the superblock\n");
+                printk(KERN_ERR "qnx4: unable to read the superblock\n");
                goto outnobh;
        }
        if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
                if (!silent)
-                        printk("qnx4: wrong fsid in superblock.\n");
+                        printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
                goto out;
        }
        s->s_op = &qnx4_sops;
@@ -284,14 +284,14 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
        errmsg = qnx4_checkroot(s);
        if (errmsg != NULL) {
                if (!silent)
-                        printk("qnx4: %s\n", errmsg);
+                        printk(KERN_ERR "qnx4: %s\n", errmsg);
                goto out;
        }
        /* does root not have inode number QNX4_ROOT_INO ?? */
        root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
        if (IS_ERR(root)) {
-                printk("qnx4: get inode failed\n");
+                printk(KERN_ERR "qnx4: get inode failed\n");
                ret = PTR_ERR(root);
                goto out;
        }
@@ -374,7 +374,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
        qnx4_inode = qnx4_raw_inode(inode);
        inode->i_mode = 0;
-        QNX4DEBUG(("Reading inode : [%d]\n", ino));
+        QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino));
        if (!ino) {
                printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is "
                                "out of range\n",
@@ -385,7 +385,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
        block = ino / QNX4_INODES_PER_BLOCK;
        if (!(bh = sb_bread(sb, block))) {
-                printk("qnx4: major problem: unable to read inode from dev "
+                printk(KERN_ERR "qnx4: major problem: unable to read inode from dev "
                       "%s\n", sb->s_id);
                iget_failed(inode);
                return ERR_PTR(-EIO);
@@ -499,7 +499,7 @@ static int __init init_qnx4_fs(void)
                return err;
        }
-        printk("QNX4 filesystem 0.2.3 registered.\n");
+        printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n");
        return 0;
 }
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index ae1e7edbacd6..58703ebba879 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -30,7 +30,7 @@ static int qnx4_match(int len, const char *name,
        int namelen, thislen;
        if (bh == NULL) {
-                printk("qnx4: matching unassigned buffer !\n");
+                printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
                return 0;
        }
        de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
@@ -66,7 +66,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
        *res_dir = NULL;
        if (!dir->i_sb) {
-                printk("qnx4: no superblock on dir.\n");
+                printk(KERN_WARNING "qnx4: no superblock on dir.\n");
                return NULL;
        }
        bh = NULL;
@@ -124,7 +124,7 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
        foundinode = qnx4_iget(dir->i_sb, ino);
        if (IS_ERR(foundinode)) {
                unlock_kernel();
-                QNX4DEBUG(("qnx4: lookup->iget -> error %ld\n",
+                QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
                           PTR_ERR(foundinode)));
                return ERR_CAST(foundinode);
        }
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46b..353e78a9ebee 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
 config QUOTA_NETLINK_INTERFACE
        bool "Report quota messages through netlink interface"
-        depends on QUOTA && NET
+        depends on QUOTACTL && NET
        help
          If you say Y here, quota warnings (about exceeding softlimit, reaching
          hardlimit, etc.) will be reported through netlink interface. If unsure,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7ed..eb5a755718f6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h> /* for inode_lock, oddly enough.. */
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-#include <net/netlink.h>
-#include <net/genetlink.h>
-#endif
 #include <asm/uaccess.h>
@@ -1071,73 +1067,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
 }
 #endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
-        .id = GENL_ID_GENERATE,
-        .hdrsize = 0,
-        .name = "VFS_DQUOT",
-        .version = 1,
-        .maxattr = QUOTA_NL_A_MAX,
-};
-/* Send warning to userspace about user which exceeded quota */
-static void send_warning(const struct dquot *dquot, const char warntype)
-{
-        static atomic_t seq;
-        struct sk_buff *skb;
-        void *msg_head;
-        int ret;
-        int msg_size = 4 * nla_total_size(sizeof(u32)) +
-                       2 * nla_total_size(sizeof(u64));
-        /* We have to allocate using GFP_NOFS as we are called from a
-         * filesystem performing write and thus further recursion into
-         * the fs to free some data could cause deadlocks. */
-        skb = genlmsg_new(msg_size, GFP_NOFS);
-        if (!skb) {
-                printk(KERN_ERR
-                  "VFS: Not enough memory to send quota warning.\n");
-                return;
-        }
-        msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
-                        &quota_genl_family, 0, QUOTA_NL_C_WARNING);
-        if (!msg_head) {
-                printk(KERN_ERR
-                  "VFS: Cannot store netlink header in quota warning.\n");
-                goto err_out;
-        }
-        ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
-                MAJOR(dquot->dq_sb->s_dev));
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
-                MINOR(dquot->dq_sb->s_dev));
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
-        if (ret)
-                goto attr_err_out;
-        genlmsg_end(skb, msg_head);
-        genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
-        return;
-attr_err_out:
-        printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
-        kfree_skb(skb);
-}
-#endif
 /*
 * Write warnings to the console and send warning messages over netlink.
 *
@@ -1145,18 +1074,20 @@ err_out:
 */
 static void flush_warnings(struct dquot *const *dquots, char *warntype)
 {
+        struct dquot *dq;
        int i;
-        for (i = 0; i < MAXQUOTAS; i++)
+        for (i = 0; i < MAXQUOTAS; i++) {
-                if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
+                dq = dquots[i]; /* slot may be NULL; tested just below */
-                    !warning_issued(dquots[i], warntype[i])) {
+                if (dq && warntype[i] != QUOTA_NL_NOWARN &&
+                    !warning_issued(dq, warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
-                        print_warning(dquots[i], warntype[i]);
+                        print_warning(dq, warntype[i]);
-#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-                        send_warning(dquots[i], warntype[i]);
 #endif
+                        quota_send_warning(dq->dq_type, dq->dq_id,
+                                           dq->dq_sb->s_dev, warntype[i]); /* no #ifdef guard at this call site */
                }
+        }
 }
 static int ignore_hardlimit(struct dquot *dquot)
@@ -2473,100 +2404,89 @@ const struct quotactl_ops vfs_quotactl_ops = {
 static ctl_table fs_dqstats_table[] = {
        {
-                .ctl_name       = FS_DQ_LOOKUPS,
                .procname       = "lookups",
                .data           = &dqstats.lookups,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_DROPS,
                .procname       = "drops",
                .data           = &dqstats.drops,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_READS,
                .procname       = "reads",
                .data           = &dqstats.reads,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_WRITES,
                .procname       = "writes",
                .data           = &dqstats.writes,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_CACHE_HITS,
                .procname       = "cache_hits",
                .data           = &dqstats.cache_hits,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_ALLOCATED,
                .procname       = "allocated_dquots",
                .data           = &dqstats.allocated_dquots,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_FREE,
                .procname       = "free_dquots",
                .data           = &dqstats.free_dquots,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_SYNCS,
                .procname       = "syncs",
                .data           = &dqstats.syncs,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #ifdef CONFIG_PRINT_QUOTA_WARNING
        {
-                .ctl_name       = FS_DQ_WARNINGS,
                .procname       = "warnings",
                .data           = &flag_print_warnings,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
-        { .ctl_name = 0 },
+        { },
 };
 static ctl_table fs_table[] = {
        {
-                .ctl_name       = FS_DQSTATS,
                .procname       = "quota",
                .mode           = 0555,
                .child          = fs_dqstats_table,
        },
-        { .ctl_name = 0 },
+        { },
 };
 static ctl_table sys_table[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = fs_table,
        },
-        { .ctl_name = 0 },
+        { },
 };
 static int __init dquot_init(void)
@@ -2607,12 +2527,6 @@ static int __init dquot_init(void)
        register_shrinker(&dqcache_shrinker);
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-        if (genl_register_family(&quota_genl_family) != 0)
-                printk(KERN_ERR
-                       "VFS: Failed to create quota netlink interface.\n");
-#endif
        return 0;
 }
 module_init(dquot_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b2..ee91e2756950 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/types.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
 /* Check validity of generic quotactl commands */
 static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
@@ -525,3 +527,94 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
        return ret;
 }
 #endif
+#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+        .id = GENL_ID_GENERATE,
+        .hdrsize = 0,
+        .name = "VFS_DQUOT",
+        .version = 1,
+        .maxattr = QUOTA_NL_A_MAX,
+};
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+                        const char warntype)
+{
+        static atomic_t seq;
+        struct sk_buff *skb;
+        void *msg_head;
+        int ret;
+        int msg_size = 4 * nla_total_size(sizeof(u32)) +
+                       2 * nla_total_size(sizeof(u64));
+        /* We have to allocate using GFP_NOFS as we are called from a
+         * filesystem performing write and thus further recursion into
+         * the fs to free some data could cause deadlocks. */
+        skb = genlmsg_new(msg_size, GFP_NOFS);
+        if (!skb) {
+                printk(KERN_ERR
+                  "VFS: Not enough memory to send quota warning.\n");
+                return;
+        }
+        msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+                        &quota_genl_family, 0, QUOTA_NL_C_WARNING);
+        if (!msg_head) {
+                printk(KERN_ERR
+                  "VFS: Cannot store netlink header in quota warning.\n");
+                goto err_out;
+        }
+        ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+        if (ret)
+                goto attr_err_out;
+        genlmsg_end(skb, msg_head);
+        genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+        return;
+attr_err_out:
+        printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+        kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+static int __init quota_init(void)
+{
+        if (genl_register_family(&quota_genl_family) != 0)
+                printk(KERN_ERR
+                       "VFS: Failed to create quota netlink interface.\n");
+        return 0;
+};
+module_init(quota_init);
+#endif
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac28987f22a..b7f4a1f94d48 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
        if (!(out_file->f_mode & FMODE_WRITE))
                goto fput_out;
        retval = -EINVAL;
-        if (!out_file->f_op || !out_file->f_op->sendpage)
-                goto fput_out;
        in_inode = in_file->f_path.dentry->d_inode;
        out_inode = out_file->f_path.dentry->d_inode;
        retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 7c5ab6330dd6..6a9e30c041dd 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
                 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
                 hashes.o tail_conversion.o journal.o resize.o \
-                 item_ops.o ioctl.o procfs.o xattr.o
+                 item_ops.o ioctl.o procfs.o xattr.o lock.o
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
 reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index e716161ab325..685495707181 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
        else if (bitmap == 0)
                block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
+        reiserfs_write_unlock(sb);
        bh = sb_bread(sb, block);
+        reiserfs_write_lock(sb);
        if (bh == NULL)
                reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
                                 "reading failed", __func__, block);
        else {
                if (buffer_locked(bh)) {
                        PROC_INFO_INC(sb, scan_bitmap.wait);
+                        reiserfs_write_unlock(sb);
                        __wait_on_buffer(bh);
+                        reiserfs_write_lock(sb);
                }
                BUG_ON(!buffer_uptodate(bh));
                BUG_ON(atomic_read(&bh->b_count) == 0);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc384..c094f58c7448 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -20,7 +20,7 @@ const struct file_operations reiserfs_dir_operations = {
        .read = generic_read_dir,
        .readdir = reiserfs_readdir,
        .fsync = reiserfs_dir_fsync,
-        .ioctl = reiserfs_ioctl,
+        .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
@@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
                                // user space buffer is swapped out. At that time
                                // entry can move to somewhere else
                                memcpy(local_buf, d_name, d_reclen);
+                                /*
+                                 * Since filldir might sleep, we can release
+                                 * the write lock here for other waiters
+                                 */
+                                reiserfs_write_unlock(inode->i_sb);
                                if (filldir
                                    (dirent, local_buf, d_reclen, d_off, d_ino,
                                     DT_UNKNOWN) < 0) {
+                                        reiserfs_write_lock(inode->i_sb);
                                        if (local_buf != small_buf) {
                                                kfree(local_buf);
                                        }
                                        goto end;
                                }
+                                reiserfs_write_lock(inode->i_sb);
                                if (local_buf != small_buf) {
                                        kfree(local_buf);
                                }
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 128d3f7c8aa5..60c080440661 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -21,14 +21,6 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
-#ifdef CONFIG_REISERFS_CHECK
-struct tree_balance *cur_tb = NULL;     /* detects whether more than one
-                                           copy of tb exists as a means
-                                           of checking whether schedule
-                                           is interrupting do_balance */
-#endif
 static inline void buffer_info_init_left(struct tree_balance *tb,
                                         struct buffer_info *bi)
 {
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
 {
        int retval = 0;
-        if (cur_tb) {
+        if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
                               "occurred based on cur_tb not being null at "
                               "this point in code. do_balance cannot properly "
-                               "handle schedule occurring while it runs.");
+                               "handle concurrent tree accesses on a same "
+                               "mount point.");
        }
        /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
             "check");*/
        RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
 #ifdef CONFIG_REISERFS_CHECK
-        cur_tb = tb;
+        REISERFS_SB(tb->tb_sb)->cur_tb = tb;
 #endif
 }
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
 #ifdef CONFIG_REISERFS_CHECK
        check_leaf_level(tb);
        check_internal_levels(tb);
-        cur_tb = NULL;
+        REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
 #endif
        /* reiserfs_free_block is no longer schedule safe.  So, we need to
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9f436668b7f8..da2dba082e2d 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -284,7 +284,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 const struct file_operations reiserfs_file_operations = {
        .read = do_sync_read,
        .write = reiserfs_file_write,
-        .ioctl = reiserfs_ioctl,
+        .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5e5a4e6fbaf8..6591cb21edf6 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
        return needed_nodes;
 }
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 /* Set parameters for balancing.
 * Performs write of results of analysis of balancing into structure tb,
@@ -834,7 +831,7 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
                RFALSE(buffer_dirty(new_bh) ||
                       buffer_journaled(new_bh) ||
                       buffer_journal_dirty(new_bh),
-                       "PAP-8140: journlaled or dirty buffer %b for the new block",
+                       "PAP-8140: journaled or dirty buffer %b for the new block",
                       new_bh);
                /* Put empty buffers into the array. */
@@ -1022,7 +1019,11 @@ static int get_far_parent(struct tree_balance *tb,
        /* Check whether the common parent is locked. */
        if (buffer_locked(*pcom_father)) {
+                /* Release the write lock while the buffer is busy */
+                reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(*pcom_father);
+                reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb)) {
                        brelse(*pcom_father);
                        return REPEAT_SEARCH;
@@ -1927,7 +1928,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
                return REPEAT_SEARCH;
        if (buffer_locked(bh)) {
+                reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(bh);
+                reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
@@ -1965,7 +1968,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
                                                                       FL[h]);
                son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
+                reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+                reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2008,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                child_position =
                    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
                son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+                reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+                reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2285,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
                                    REPEAT_SEARCH : CARRY_ON;
                        }
 #endif
+                        reiserfs_write_unlock(tb->tb_sb);
                        __wait_on_buffer(locked);
+                        reiserfs_write_lock(tb->tb_sb);
                        if (FILESYSTEM_CHANGED_TB(tb))
                                return REPEAT_SEARCH;
                }
@@ -2349,12 +2358,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
        /* if it possible in indirect_to_direct conversion */
        if (buffer_locked(tbS0)) {
+                reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(tbS0);
+                reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
 #ifdef CONFIG_REISERFS_CHECK
-        if (cur_tb) {
+        if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                print_cur_tb("fix_nodes");
                reiserfs_panic(tb->tb_sb, "PAP-8305",
                               "there is pending do_balance");
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..3a28e7751b3c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -251,7 +251,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        struct cpu_key key;
        struct buffer_head *bh;
        struct item_head *ih, tmp_ih;
-        int fs_gen;
        b_blocknr_t blocknr;
        char *p = NULL;
        int chars;
@@ -265,7 +264,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
                     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
                     3);
-      research:
        result = search_for_position_by_key(inode->i_sb, &key, &path);
        if (result != POSITION_FOUND) {
                pathrelse(&path);
@@ -340,7 +338,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        }
        // read file tail into part of page
        offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
-        fs_gen = get_generation(inode->i_sb);
        copy_item_head(&tmp_ih, ih);
        /* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +345,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
         ** sure we need to.  But, this means the item might move if
         ** kmap schedules
         */
-        if (!p) {
+        if (!p)
                p = (char *)kmap(bh_result->b_page);
-                if (fs_changed(fs_gen, inode->i_sb)
-                    && item_moved(&tmp_ih, &path)) {
-                        goto research;
-                }
-        }
        p += offset;
        memset(p, 0, inode->i_sb->s_blocksize);
        do {
@@ -489,10 +482,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
           disappeared */
        if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
                int err;
-                lock_kernel();
+                reiserfs_write_lock(inode->i_sb);
                err = reiserfs_commit_for_inode(inode);
                REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-                unlock_kernel();
+                reiserfs_write_unlock(inode->i_sb);
                if (err < 0)
                        ret = err;
        }
@@ -601,6 +598,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        __le32 *item;
        int done;
        int fs_gen;
+        int lock_depth;
        struct reiserfs_transaction_handle *th = NULL;
        /* space reserved in transaction batch:
           . 3 balancings in direct->indirect conversion
@@ -616,12 +614,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        loff_t new_offset =
            (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
-        /* bad.... */
+        lock_depth = reiserfs_write_lock_once(inode->i_sb);
-        reiserfs_write_lock(inode->i_sb);
        version = get_inode_item_key_version(inode);
        if (!file_capable(inode, block)) {
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return -EFBIG;
        }
@@ -633,7 +630,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                /* find number of block-th logical block of the file */
                ret = _get_block_create_0(inode, block, bh_result,
                                          create | GET_BLOCK_READ_DIRECT);
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return ret;
        }
        /*
@@ -751,7 +748,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                if (!dangle && th)
                        retval = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                /* the item was found, so new blocks were not added to the file
                 ** there is no need to make sure the inode is updated with this
@@ -935,7 +932,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (blocks_needed == 1) {
                                un = &unf_single;
                        } else {
-                                un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);      // We need to avoid scheduling.
+                                un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
                                if (!un) {
                                        un = &unf_single;
                                        blocks_needed = 1;
@@ -997,10 +994,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (retval)
                                goto failure;
                }
-                /* inserting indirect pointers for a hole can take a
+                /*
-                 ** long time.  reschedule if needed
+                 * inserting indirect pointers for a hole can take a
+                 * long time.  reschedule if needed and also release the write
+                 * lock for others.
                 */
-                cond_resched();
+                if (need_resched()) {
+                        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+                        schedule();
+                        lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                }
                retval = search_for_position_by_key(inode->i_sb, &key, &path);
                if (retval == IO_ERROR) {
@@ -1035,7 +1038,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        retval = err;
        }
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        reiserfs_check_path(&path);
        return retval;
 }
@@ -2072,8 +2075,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
        int error;
        struct buffer_head *bh = NULL;
        int err2;
+        int lock_depth;
-        reiserfs_write_lock(inode->i_sb);
+        lock_depth = reiserfs_write_lock_once(inode->i_sb);
        if (inode->i_size > 0) {
                error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2146,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
                page_cache_release(page);
        }
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        return 0;
      out:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
        }
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        return error;
 }
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
        int ret;
        int old_ref = 0;
+        reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+        reiserfs_write_lock(inode->i_sb);
        fix_tail_page_for_writing(page);
        if (reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th;
@@ -2664,6 +2674,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th;
        unsigned start;
+        int lock_depth = 0;
+        bool locked = false;
        if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
                pos ++;
@@ -2690,9 +2702,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
         ** to do the i_size updates here.
         */
        pos += copied;
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-                reiserfs_write_lock(inode->i_sb);
+                lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                locked = true;
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2703,10 +2717,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
                ret = journal_begin(&myth, inode->i_sb, 1);
-                if (ret) {
+                if (ret)
-                        reiserfs_write_unlock(inode->i_sb);
                        goto journal_error;
-                }
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2718,34 +2731,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
+                if (!locked) {
+                        lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                        locked = true;
+                }
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
      out:
+        if (locked)
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        unlock_page(page);
        page_cache_release(page);
        return ret == 0 ? copied : ret;
      journal_error:
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+        locked = false;
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
        }
        goto out;
 }
@@ -2758,7 +2773,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th = NULL;
+        reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+        reiserfs_write_lock(inode->i_sb);
        if (reiserfs_transaction_running(inode->i_sb)) {
                th = current->journal_info;
        }
@@ -2770,7 +2788,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
         */
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-                reiserfs_write_lock(inode->i_sb);
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2781,10 +2798,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
                ret = journal_begin(&myth, inode->i_sb, 1);
-                if (ret) {
+                if (ret)
-                        reiserfs_write_unlock(inode->i_sb);
                        goto journal_error;
-                }
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2796,16 +2812,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
@@ -2815,11 +2828,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
      journal_error:
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
        }
        return ret;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0ccc3fdda7bf..ace77451ceb1 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -13,44 +13,52 @@
 #include <linux/compat.h>
 /*
-** reiserfs_ioctl - handler for ioctl for inode
+ * reiserfs_ioctl - handler for ioctl for inode
-** supported commands:
+ * supported commands:
-**  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
+ *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
-**                           and prevent packing file (argument arg has to be non-zero)
+ *                           and prevent packing file (argument arg has to be non-zero)
-**  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
+ *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
-**  3) That's all for a while ...
+ *  3) That's all for a while ...
-*/
+ */
-int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-                   unsigned long arg)
 {
+        struct inode *inode = filp->f_path.dentry->d_inode;
        unsigned int flags;
        int err = 0;
+        reiserfs_write_lock(inode->i_sb);
        switch (cmd) {
        case REISERFS_IOC_UNPACK:
                if (S_ISREG(inode->i_mode)) {
                        if (arg)
-                                return reiserfs_unpack(inode, filp);
+                                err = reiserfs_unpack(inode, filp);
-                        else
-                                return 0;
                } else
-                        return -ENOTTY;
+                        err = -ENOTTY;
-                /* following two cases are taken from fs/ext2/ioctl.c by Remy
+                break;
-                   Card (card@masi.ibp.fr) */
+                /*
+                 * following two cases are taken from fs/ext2/ioctl.c by Remy
+                 * Card (card@masi.ibp.fr)
+                 */
        case REISERFS_IOC_GETFLAGS:
-                if (!reiserfs_attrs(inode->i_sb))
+                if (!reiserfs_attrs(inode->i_sb)) {
-                        return -ENOTTY;
+                        err = -ENOTTY;
+                        break;
+                }
                flags = REISERFS_I(inode)->i_attrs;
                i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
-                return put_user(flags, (int __user *)arg);
+                err = put_user(flags, (int __user *)arg);
+                break;
        case REISERFS_IOC_SETFLAGS:{
-                        if (!reiserfs_attrs(inode->i_sb))
+                        if (!reiserfs_attrs(inode->i_sb)) {
-                                return -ENOTTY;
+                                err = -ENOTTY;
+                                break;
+                        }
                        err = mnt_want_write(filp->f_path.mnt);
                        if (err)
-                                return err;
+                                break;
                        if (!is_owner_or_cap(inode)) {
                                err = -EPERM;
@@ -90,16 +98,19 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                        mark_inode_dirty(inode);
 setflags_out:
                        mnt_drop_write(filp->f_path.mnt);
-                        return err;
+                        break;
                }
        case REISERFS_IOC_GETVERSION:
-                return put_user(inode->i_generation, (int __user *)arg);
+                err = put_user(inode->i_generation, (int __user *)arg);
+                break;
        case REISERFS_IOC_SETVERSION:
-                if (!is_owner_or_cap(inode))
+                if (!is_owner_or_cap(inode)) {
-                        return -EPERM;
+                        err = -EPERM;
+                        break;
+                }
                err = mnt_want_write(filp->f_path.mnt);
                if (err)
-                        return err;
+                        break;
                if (get_user(inode->i_generation, (int __user *)arg)) {
                        err = -EFAULT;
                        goto setversion_out;
@@ -108,19 +118,20 @@ setflags_out:
                mark_inode_dirty(inode);
 setversion_out:
                mnt_drop_write(filp->f_path.mnt);
-                return err;
+                break;
        default:
-                return -ENOTTY;
+                err = -ENOTTY;
        }
+        reiserfs_write_unlock(inode->i_sb);
+        return err;
 }
 #ifdef CONFIG_COMPAT
 long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
-        int ret;
        /* These are just misnamed, they actually get/put from/to user an int */
        switch (cmd) {
        case REISERFS_IOC32_UNPACK:
@@ -141,10 +152,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
        default:
                return -ENOIOCTLCMD;
        }
-        lock_kernel();
-        ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+        return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-        unlock_kernel();
-        return ret;
 }
 #endif
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39c..2f8a7e7b8dab 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
        clear_buffer_journal_restore_dirty(bh);
 }
-/* utility function to force a BUG if it is called without the big
-** kernel lock held.  caller is the string printed just before calling BUG()
-*/
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
-#ifdef CONFIG_SMP
-        if (current->lock_depth < 0) {
-                reiserfs_panic(sb, "journal-1", "%s called without kernel "
-                               "lock held", caller);
-        }
-#else
-        ;
-#endif
-}
 /* return a cnode with same dev, block number and size in table, or null if not found */
 static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
                                                                  super_block
@@ -556,7 +541,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
 static inline void lock_journal(struct super_block *sb)
 {
        PROC_INFO_INC(sb, journal.lock_journal);
-        mutex_lock(&SB_JOURNAL(sb)->j_mutex);
+        reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
 }
 /* unlock the current transaction */
@@ -708,7 +694,9 @@ static void check_barrier_completion(struct super_block *s,
                disable_barrier(s);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
+                reiserfs_write_unlock(s);
                sync_dirty_buffer(bh);
+                reiserfs_write_lock(s);
        }
 }
@@ -996,8 +984,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
 {
        DEFINE_WAIT(wait);
        struct reiserfs_journal *j = SB_JOURNAL(s);
-        if (atomic_read(&j->j_async_throttle))
+        if (atomic_read(&j->j_async_throttle)) {
+                reiserfs_write_unlock(s);
                congestion_wait(BLK_RW_ASYNC, HZ / 10);
+                reiserfs_write_lock(s);
+        }
        return 0;
 }
@@ -1043,7 +1036,8 @@ static int flush_commit_list(struct super_block *s,
        }
        /* make sure nobody is trying to flush this one at the same time */
-        mutex_lock(&jl->j_commit_mutex);
+        reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
        if (!journal_list_still_alive(s, trans_id)) {
                mutex_unlock(&jl->j_commit_mutex);
                goto put_jl;
@@ -1061,12 +1055,17 @@ static int flush_commit_list(struct super_block *s,
        if (!list_empty(&jl->j_bh_list)) {
                int ret;
-                unlock_kernel();
+                /*
+                 * We might sleep in numerous places inside
+                 * write_ordered_buffers. Relax the write lock.
+                 */
+                reiserfs_write_unlock(s);
                ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                            journal, jl, &jl->j_bh_list);
                if (ret < 0 && retval == 0)
                        retval = ret;
-                lock_kernel();
+                reiserfs_write_lock(s);
        }
        BUG_ON(!list_empty(&jl->j_bh_list));
        /*
@@ -1085,8 +1084,11 @@ static int flush_commit_list(struct super_block *s,
                    SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
                if (tbh) {
-                        if (buffer_dirty(tbh))
+                        if (buffer_dirty(tbh)) {
-                            ll_rw_block(WRITE, 1, &tbh) ;
+                            reiserfs_write_unlock(s);
+                            ll_rw_block(WRITE, 1, &tbh);
+                            reiserfs_write_lock(s);
+                        }
                        put_bh(tbh) ;
                }
        }
@@ -1114,12 +1116,19 @@ static int flush_commit_list(struct super_block *s,
                bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
                    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
+                reiserfs_write_unlock(s);
                wait_on_buffer(tbh);
+                reiserfs_write_lock(s);
                // since we're using ll_rw_blk above, it might have skipped over
                // a locked buffer.  Double check here
                //
-                if (buffer_dirty(tbh))  /* redundant, sync_dirty_buffer() checks */
+                /* redundant, sync_dirty_buffer() checks */
+                if (buffer_dirty(tbh)) {
+                        reiserfs_write_unlock(s);
                        sync_dirty_buffer(tbh);
+                        reiserfs_write_lock(s);
+                }
                if (unlikely(!buffer_uptodate(tbh))) {
 #ifdef CONFIG_REISERFS_CHECK
                        reiserfs_warning(s, "journal-601",
@@ -1143,10 +1152,15 @@ static int flush_commit_list(struct super_block *s,
                        if (buffer_dirty(jl->j_commit_bh))
                                BUG();
                        mark_buffer_dirty(jl->j_commit_bh) ;
+                        reiserfs_write_unlock(s);
                        sync_dirty_buffer(jl->j_commit_bh) ;
+                        reiserfs_write_lock(s);
                }
-        } else
+        } else {
+                reiserfs_write_unlock(s);
                wait_on_buffer(jl->j_commit_bh);
+                reiserfs_write_lock(s);
+        }
        check_barrier_completion(s, jl->j_commit_bh);
@@ -1286,7 +1300,9 @@ static int _update_journal_header_block(struct super_block *sb,
        if (trans_id >= journal->j_last_flush_trans_id) {
                if (buffer_locked((journal->j_header_bh))) {
+                        reiserfs_write_unlock(sb);
                        wait_on_buffer((journal->j_header_bh));
+                        reiserfs_write_lock(sb);
                        if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
                                reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1328,16 @@ static int _update_journal_header_block(struct super_block *sb,
                                disable_barrier(sb);
                                goto sync;
                        }
+                        reiserfs_write_unlock(sb);
                        wait_on_buffer(journal->j_header_bh);
+                        reiserfs_write_lock(sb);
                        check_barrier_completion(sb, journal->j_header_bh);
                } else {
                      sync:
                        set_buffer_dirty(journal->j_header_bh);
+                        reiserfs_write_unlock(sb);
                        sync_dirty_buffer(journal->j_header_bh);
+                        reiserfs_write_lock(sb);
                }
                if (!buffer_uptodate(journal->j_header_bh)) {
                        reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
        /* if flushall == 0, the lock is already held */
        if (flushall) {
-                mutex_lock(&journal->j_flush_mutex);
+                reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        } else if (mutex_trylock(&journal->j_flush_mutex)) {
                BUG();
        }
@@ -1553,7 +1573,11 @@ static int flush_journal_list(struct super_block *s,
                                        reiserfs_panic(s, "journal-1011",
                                                       "cn->bh is NULL");
                                }
+                                reiserfs_write_unlock(s);
                                wait_on_buffer(cn->bh);
+                                reiserfs_write_lock(s);
                                if (!cn->bh) {
                                        reiserfs_panic(s, "journal-1012",
                                                       "cn->bh is NULL");
@@ -1769,7 +1793,7 @@ static int kupdate_transactions(struct super_block *s,
        struct reiserfs_journal *journal = SB_JOURNAL(s);
        chunk.nr = 0;
-        mutex_lock(&journal->j_flush_mutex);
+        reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        if (!journal_list_still_alive(s, orig_trans_id)) {
                goto done;
        }
@@ -1973,11 +1997,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
        reiserfs_mounted_fs_count--;
        /* wait for all commits to finish */
        cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
+        /*
+         * We must release the write lock here because
+         * the workqueue job (flush_async_commit) needs this lock
+         */
+        reiserfs_write_unlock(sb);
        flush_workqueue(commit_wq);
        if (!reiserfs_mounted_fs_count) {
                destroy_workqueue(commit_wq);
                commit_wq = NULL;
        }
+        reiserfs_write_lock(sb);
        free_journal_ram(sb);
@@ -2243,7 +2275,11 @@ static int journal_read_transaction(struct super_block *sb,
        /* read in the log blocks, memcpy to the corresponding real block */
        ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
        for (i = 0; i < get_desc_trans_len(desc); i++) {
+                reiserfs_write_unlock(sb);
                wait_on_buffer(log_blocks[i]);
+                reiserfs_write_lock(sb);
                if (!buffer_uptodate(log_blocks[i])) {
                        reiserfs_warning(sb, "journal-1212",
                                         "REPLAY FAILURE fsck required! "
@@ -2765,11 +2801,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
                goto free_and_return;
        }
+        /*
+         * We need to unlock here to avoid creating the following
+         * dependency:
+         * reiserfs_lock -> sysfs_mutex
+         * Because the reiserfs mmap path creates the following dependency:
+         * mm->mmap -> reiserfs_lock, hence we have
+         * mm->mmap -> reiserfs_lock -> sysfs_mutex
+         * This would end up in a circular dependency with the sysfs readdir
+         * path, which does sysfs_mutex -> mm->mmap_sem
+         * This is fine because the reiserfs lock is useless in mount path,
+         * at least until we call journal_begin. We keep it for paranoid
+         * reasons.
+         */
+        reiserfs_write_unlock(sb);
        if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+                reiserfs_write_lock(sb);
                reiserfs_warning(sb, "sh-462",
                                 "unable to initialize jornal device");
                goto free_and_return;
        }
+        reiserfs_write_lock(sb);
        rs = SB_DISK_SUPER_BLOCK(sb);
@@ -2881,8 +2933,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        }
        reiserfs_mounted_fs_count++;
-        if (reiserfs_mounted_fs_count <= 1)
+        if (reiserfs_mounted_fs_count <= 1) {
+                reiserfs_write_unlock(sb);
                commit_wq = create_workqueue("reiserfs");
+                reiserfs_write_lock(sb);
+        }
        INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
        journal->j_work_sb = sb;
@@ -2964,8 +3019,11 @@ static void queue_log_writer(struct super_block *s)
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&journal->j_join_wait, &wait);
        set_current_state(TASK_UNINTERRUPTIBLE);
-        if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
+        if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+                reiserfs_write_unlock(s);
                schedule();
+                reiserfs_write_lock(s);
+        }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&journal->j_join_wait, &wait);
 }
@@ -2982,7 +3040,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        unsigned long bcount = journal->j_bcount;
        while (1) {
+                reiserfs_write_unlock(sb);
                schedule_timeout_uninterruptible(1);
+                reiserfs_write_lock(sb);
                journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
                while ((atomic_read(&journal->j_wcount) > 0 ||
                        atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3093,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
        if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
                unlock_journal(sb);
+                reiserfs_write_unlock(sb);
                reiserfs_wait_on_write_block(sb);
+                reiserfs_write_lock(sb);
                PROC_INFO_INC(sb, journal.journal_relock_writers);
                goto relock;
        }
@@ -3506,14 +3568,14 @@ static void flush_async_commits(struct work_struct *work)
        struct reiserfs_journal_list *jl;
        struct list_head *entry;
-        lock_kernel();
+        reiserfs_write_lock(sb);
        if (!list_empty(&journal->j_journal_list)) {
                /* last entry is the youngest, commit it and you get everything */
                entry = journal->j_journal_list.prev;
                jl = JOURNAL_LIST_ENTRY(entry);
                flush_commit_list(sb, jl, 1);
        }
-        unlock_kernel();
+        reiserfs_write_unlock(sb);
 }
 /*
@@ -4041,7 +4103,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * the new transaction is fully setup, and we've already flushed the
         * ordered bh list
         */
-        mutex_lock(&jl->j_commit_mutex);
+        reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
        /* save the transaction id in case we need to commit it later */
        commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4218,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
                next = cn->next;
                free_cnode(sb, cn);
                cn = next;
+                reiserfs_write_unlock(sb);
                cond_resched();
+                reiserfs_write_lock(sb);
        }
        /* we are done  with both the c_bh and d_bh, but
@@ -4203,10 +4267,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * is lost.
         */
        if (!list_empty(&jl->j_tail_bh_list)) {
-                unlock_kernel();
+                reiserfs_write_unlock(sb);
                write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                      journal, jl, &jl->j_tail_bh_list);
-                lock_kernel();
+                reiserfs_write_lock(sb);
        }
        BUG_ON(!list_empty(&jl->j_tail_bh_list));
        mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644
index 000000000000..ee2cfc0fd8a7
--- /dev/null
+++ b/fs/reiserfs/lock.c
@@ -0,0 +1,88 @@
+#include <linux/reiserfs_fs.h>
+#include <linux/mutex.h>
+/*
+ * The previous reiserfs locking scheme was heavily based on
+ * the tricky properties of the BKL:
+ *
+ * - it could be acquired recursively by the same task
+ * - performance relied on the release-while-schedule() property
+ *
+ * Now that we replace it by a mutex, we still want to keep the same
+ * recursive property to avoid big changes in the code structure.
+ * We use our own lock_owner here because the owner field on a mutex
+ * is only available in SMP or mutex debugging, also we only need this field
+ * for this mutex, no need for a system wide mutex facility.
+ *
+ * Also this lock is often released before a call that could block because
+ * reiserfs performance was partially based on the release-while-schedule()
+ * property of the BKL.
+ */
+/*
+ * Take the per-superblock reiserfs write lock.
+ *
+ * The mutex is only acquired when the current task is not already the
+ * recorded owner; in every case lock_depth is incremented, so each call
+ * has to be balanced by a reiserfs_write_unlock().
+ */
+void reiserfs_write_lock(struct super_block *s)
+{
+        struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+        if (sb_i->lock_owner != current) {
+                mutex_lock(&sb_i->lock);
+                sb_i->lock_owner = current;
+        }
+        /* No need to protect it, only the current task touches it */
+        sb_i->lock_depth++;
+}
+/*
+ * Drop one level of the reiserfs write lock.
+ *
+ * lock_depth is decremented on every call; the mutex itself is released
+ * only when the depth falls back to -1.
+ */
+void reiserfs_write_unlock(struct super_block *s)
+{
+        struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+        /*
+         * Are we unlocking without even holding the lock?
+         * Such a situation must raise a BUG() if we don't want
+         * to corrupt the data.
+         */
+        BUG_ON(sb_i->lock_owner != current);
+        /* The owner is cleared while the mutex is still held */
+        if (--sb_i->lock_depth == -1) {
+                sb_i->lock_owner = NULL;
+                mutex_unlock(&sb_i->lock);
+        }
+}
+/*
+ * If we already own the lock, just exit and don't increase the depth.
+ * Useful when we don't want to lock more than once.
+ *
+ * We always return the lock_depth we had before calling
+ * this function. The caller is expected to hand that value to
+ * reiserfs_write_unlock_once(), which only unlocks when it is -1.
+ */
+int reiserfs_write_lock_once(struct super_block *s)
+{
+        struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+        if (sb_i->lock_owner != current) {
+                mutex_lock(&sb_i->lock);
+                sb_i->lock_owner = current;
+                /* post-increment: report the depth from before we locked */
+                return sb_i->lock_depth++;
+        }
+        /* already the owner: depth is left untouched */
+        return sb_i->lock_depth;
+}
+/*
+ * Counterpart of reiserfs_write_lock_once(): lock_depth is the value that
+ * call returned. Only a value of -1 triggers reiserfs_write_unlock(); any
+ * other value leaves the lock untouched.
+ */
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
+{
+        if (lock_depth == -1)
+                reiserfs_write_unlock(s);
+}
+/*
+ * Utility function to force a BUG if it is called without the superblock
+ * write lock held.  caller is the string printed just before calling BUG();
+ * the current lock_depth is printed after it.
+ *
+ * reiserfs_panic() takes an id string before the format string, so both
+ * must be passed for caller to be formatted correctly.
+ */
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
+{
+        struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+        if (sb_i->lock_depth < 0)
+                reiserfs_panic(sb, "journal-1", "%s called without kernel "
+                               "lock held %d", caller, sb_i->lock_depth);
+}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 271579128634..e296ff72a6cc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                                      struct nameidata *nd)
 {
        int retval;
+        int lock_depth;
        struct inode *inode = NULL;
        struct reiserfs_dir_entry de;
        INITIALIZE_PATH(path_to_entry);
@@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
        if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
                return ERR_PTR(-ENAMETOOLONG);
-        reiserfs_write_lock(dir->i_sb);
+        /*
+         * Might be called with or without the write lock, must be careful
+         * to not recursively hold it in case we want to release the lock
+         * before rescheduling.
+         */
+        lock_depth = reiserfs_write_lock_once(dir->i_sb);
        de.de_gen_number_bit_string = NULL;
        retval =
            reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                inode = reiserfs_iget(dir->i_sb,
                                      (struct cpu_key *)&(de.de_dir_id));
                if (!inode || IS_ERR(inode)) {
-                        reiserfs_write_unlock(dir->i_sb);
+                        reiserfs_write_unlock_once(dir->i_sb, lock_depth);
                        return ERR_PTR(-EACCES);
                }
@@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                if (IS_PRIVATE(dir))
                        inode->i_flags |= S_PRIVATE;
        }
-        reiserfs_write_unlock(dir->i_sb);
+        reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        if (retval == IO_ERROR) {
                return ERR_PTR(-EIO);
        }
@@ -725,6 +732,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct inode *inode;
        struct reiserfs_transaction_handle th;
        struct reiserfs_security_handle security;
+        int lock_depth;
        /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 3 +
@@ -748,7 +756,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                return retval;
        }
        jbegin_count += retval;
-        reiserfs_write_lock(dir->i_sb);
+        lock_depth = reiserfs_write_lock_once(dir->i_sb);
        retval = journal_begin(&th, dir->i_sb, jbegin_count);
        if (retval) {
@@ -798,8 +806,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
-      out_failed:
+out_failed:
-        reiserfs_write_unlock(dir->i_sb);
+        reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        return retval;
 }
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 536eacaeb710..adbc6f538515 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
   .  */
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 void __reiserfs_panic(struct super_block *sb, const char *id,
                      const char *function, const char *fmt, ...)
 {
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 18b315d3d104..b3a94d20f0fc 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
+                        reiserfs_write_unlock(s);
                        sync_dirty_buffer(bh);
+                        reiserfs_write_lock(s);
                        // update bitmap_info stuff
                        bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
                        brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d036ee5b1c81..5fa7118f04e1 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key,	/* Key to search for. */
        return ITEM_NOT_FOUND;
 }
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 /* Minimal possible key. It is never in the tree. */
 const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
 #define SEARCH_BY_KEY_READA 16
-/* The function is NOT SCHEDULE-SAFE! */
+/*
-static void search_by_key_reada(struct super_block *s,
+ * The function is NOT SCHEDULE-SAFE!
+ * It might unlock the write lock if we needed to wait for a block
+ * to be read. Note that in this case it won't reacquire the lock, to avoid
+ * high contention resulting from too many lock requests, especially
+ * since the caller (search_by_key) will perform other schedule-unsafe
+ * operations just after calling this function.
+ *
+ * @return true if we have unlocked
+ */
+static bool search_by_key_reada(struct super_block *s,
                                struct buffer_head **bh,
                                b_blocknr_t *b, int num)
 {
        int i, j;
+        bool unlocked = false;
        for (i = 0; i < num; i++) {
                bh[i] = sb_getblk(s, b[i]);
        }
+        /*
+         * We are going to read some blocks on which we
+         * have a reference. It's safe, though we might be
+         * reading blocks concurrently changed if we release
+         * the lock. But it's still fine because we check later
+         * if the tree changed
+         */
        for (j = 0; j < i; j++) {
                /*
                 * note, this needs attention if we are getting rid of the BKL
                 * you have to make sure the prepared bit isn't set on this buffer
                 */
-                if (!buffer_uptodate(bh[j]))
+                if (!buffer_uptodate(bh[j])) {
+                        if (!unlocked) {
+                                reiserfs_write_unlock(s);
+                                unlocked = true;
+                        }
                        ll_rw_block(READA, 1, bh + j);
+                }
                brelse(bh[j]);
        }
+        return unlocked;
 }
 /**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,	/* Key to s
                   have a pointer to it. */
                if ((bh = last_element->pe_buffer =
                     sb_getblk(sb, block_number))) {
+                        bool unlocked = false;
                        if (!buffer_uptodate(bh) && reada_count > 1)
-                                search_by_key_reada(sb, reada_bh,
+                                /* may unlock the write lock */
+                                unlocked = search_by_key_reada(sb, reada_bh,
                                                    reada_blocks, reada_count);
+                        /*
+                         * If we haven't already unlocked the write lock,
+                         * then we need to do that here before reading
+                         * the current block
+                         */
+                        if (!buffer_uptodate(bh) && !unlocked) {
+                                reiserfs_write_unlock(sb);
+                                unlocked = true;
+                        }
                        ll_rw_block(READ, 1, &bh);
                        wait_on_buffer(bh);
+                        if (unlocked)
+                                reiserfs_write_lock(sb);
                        if (!buffer_uptodate(bh))
                                goto io_error;
                } else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,	/* Key to s
                       !key_in_buffer(search_path, key, sb),
                       "PAP-5130: key is not in the buffer");
 #ifdef CONFIG_REISERFS_CHECK
-                if (cur_tb) {
+                if (REISERFS_SB(sb)->cur_tb) {
                        print_cur_tb("5140");
                        reiserfs_panic(sb, "PAP-5140",
                                       "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
                        reiserfs_free_block(th, inode, block, 1);
                    }
+                    reiserfs_write_unlock(sb);
                    cond_resched();
+                    reiserfs_write_lock(sb);
                    if (item_moved (&s_ih, path))  {
                        need_re_search = 1;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f0ad05f38022..339b0baf2af6 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s)
        struct reiserfs_transaction_handle th;
        th.t_trans_id = 0;
-        lock_kernel();
+        reiserfs_write_lock(s);
        if (s->s_dirt)
                reiserfs_write_super(s);
@@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s)
        reiserfs_proc_info_done(s);
+        reiserfs_write_unlock(s);
+        mutex_destroy(&REISERFS_SB(s)->lock);
        kfree(s->s_fs_info);
        s->s_fs_info = NULL;
-        unlock_kernel();
 }
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +554,28 @@ static void reiserfs_dirty_inode(struct inode *inode)
        struct reiserfs_transaction_handle th;
        int err = 0;
+        int lock_depth;
        if (inode->i_sb->s_flags & MS_RDONLY) {
                reiserfs_warning(inode->i_sb, "clm-6006",
                                 "writing inode %lu on readonly FS",
                                 inode->i_ino);
                return;
        }
-        reiserfs_write_lock(inode->i_sb);
+        lock_depth = reiserfs_write_lock_once(inode->i_sb);
        /* this is really only used for atime updates, so they don't have
         ** to be included in O_SYNC or fsync
         */
        err = journal_begin(&th, inode->i_sb, 1);
-        if (err) {
+        if (err)
-                reiserfs_write_unlock(inode->i_sb);
+                goto out;
-                return;
-        }
        reiserfs_update_sd(&th, inode);
        journal_end(&th, inode->i_sb, 1);
-        reiserfs_write_unlock(inode->i_sb);
+out:
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 #ifdef CONFIG_QUOTA
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
        int i;
+#endif
+        reiserfs_write_lock(s);
+#ifdef CONFIG_QUOTA
        memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
-        lock_kernel();
        rs = SB_DISK_SUPER_BLOCK(s);
        if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 out_ok:
        replace_mount_options(s, new_opts);
-        unlock_kernel();
+        reiserfs_write_unlock(s);
        return 0;
 out_err:
        kfree(new_opts);
-        unlock_kernel();
+        reiserfs_write_unlock(s);
        return err;
 }
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
        ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+        reiserfs_write_unlock(s);
        wait_on_buffer(SB_BUFFER_WITH_SB(s));
+        reiserfs_write_lock(s);
        if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
                reiserfs_warning(s, "reiserfs-2504", "error reading the super");
                return 1;
@@ -1613,7 +1621,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
        if (!sbi) {
                errval = -ENOMEM;
-                goto error;
+                goto error_alloc;
        }
        s->s_fs_info = sbi;
        /* Set default values for options: non-aggressive tails, RO on errors */
@@ -1627,6 +1635,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        /* setup default block allocator options */
        reiserfs_init_alloc_options(s);
+        mutex_init(&REISERFS_SB(s)->lock);
+        REISERFS_SB(s)->lock_depth = -1;
+        /*
+         * This function is called with the bkl, which also was the old
+         * locking used here.
+         * do_journal_begin() will soon check if we hold the lock (ie: was the
+         * bkl). This is likely because do_journal_begin() has several other
+         * callers, because at this time it doesn't seem to be necessary to
+         * protect against anything.
+         * Anyway, let's be conservative and lock for now.
+         */
+        reiserfs_write_lock(s);
        jdev_name = NULL;
        if (reiserfs_parse_options
            (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,9 +1874,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        init_waitqueue_head(&(sbi->s_wait));
        spin_lock_init(&sbi->bitmap_lock);
+        reiserfs_write_unlock(s);
        return (0);
 error:
+        reiserfs_write_unlock(s);
+error_alloc:
        if (jinit_done) {       /* kill the commit thread, free journal ram */
                journal_release_error(NULL, s);
        }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6925b835a43b..58aa8e75f7f5 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -975,7 +975,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
        int err = 0;
        /* If we don't have the privroot located yet - go find it */
-        mutex_lock(&s->s_root->d_inode->i_mutex);
+        reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
        dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
@@ -1004,14 +1004,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
                goto error;
        if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
-                mutex_lock(&s->s_root->d_inode->i_mutex);
+                reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
                err = create_privroot(REISERFS_SB(s)->priv_root);
                mutex_unlock(&s->s_root->d_inode->i_mutex);
        }
        if (privroot->d_inode) {
                s->s_xattr = reiserfs_xattr_handlers;
-                mutex_lock(&privroot->d_inode->i_mutex);
+                reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
                if (!REISERFS_SB(s)->xattr_root) {
                        struct dentry *dentry;
                        dentry = lookup_one_len(XAROOT_NAME, privroot,
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e17534..39208663aaf1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
        ret = buf->ops->confirm(pipe, buf);
        if (!ret) {
                more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+                if (file->f_op && file->f_op->sendpage)
-                ret = file->f_op->sendpage(file, buf->page, buf->offset,
+                        ret = file->f_op->sendpage(file, buf->page, buf->offset,
-                                           sd->len, &pos, more);
+                                                   sd->len, &pos, more);
+                else
+                        ret = -EINVAL;
        }
        return ret;
@@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
        if (unlikely(ret < 0))
                return ret;
-        splice_write = out->f_op->splice_write;
+        if (out->f_op && out->f_op->splice_write)
-        if (!splice_write)
+                splice_write = out->f_op->splice_write;
+        else
                splice_write = default_file_splice_write;
        return splice_write(pipe, out, ppos, len, flags);
@@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
        if (unlikely(ret < 0))
                return ret;
-        splice_read = in->f_op->splice_read;
+        if (in->f_op && in->f_op->splice_read)
-        if (!splice_read)
+                splice_read = in->f_op->splice_read;
+        else
                splice_read = default_file_splice_read;
        return splice_read(in, ppos, pipe, len, flags);
@@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                if (off_in)
                        return -ESPIPE;
                if (off_out) {
-                        if (out->f_op->llseek == no_llseek)
+                        if (!out->f_op || !out->f_op->llseek ||
+                            out->f_op->llseek == no_llseek)
                                return -EINVAL;
                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
                                return -EFAULT;
@@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                if (off_out)
                        return -ESPIPE;
                if (off_in) {
-                        if (in->f_op->llseek == no_llseek)
+                        if (!in->f_op || !in->f_op->llseek ||
+                            in->f_op->llseek == no_llseek)
                                return -EINVAL;
                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
                                return -EFAULT;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index dbc093afd946..8a771c59ac3e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2014,7 +2014,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
                inum = key_inum_flash(c, &dent->key);
                fscki1 = read_add_inode(c, priv, inum);
                if (IS_ERR(fscki1)) {
-                        err = PTR_ERR(fscki);
+                        err = PTR_ERR(fscki1);
                        ubifs_err("error %d while processing entry node and "
                                  "trying to find parent inode node %lu",
                                  err, (unsigned long)inum);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1009adc8d602..39849f887e72 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1389,7 +1389,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
                               unsigned long nr_segs, loff_t pos)
 {
        int err;
-        ssize_t ret;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1397,17 +1396,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
        if (err)
                return err;
-        ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+        return generic_file_aio_write(iocb, iov, nr_segs, pos);
-        if (ret < 0)
-                return ret;
-        if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
-                err = ubifs_sync_wbufs_by_inode(c, inode);
-                if (err)
-                        return err;
-        }
-        return ret;
 }
 static int ubifs_set_page_dirty(struct page *page)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index f94ddf7efba0..868a55ee080f 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -23,7 +23,7 @@
 /*
 * This file implements functions needed to recover from unclean un-mounts.
 * When UBIFS is mounted, it checks a flag on the master node to determine if
- * an un-mount was completed sucessfully. If not, the process of mounting
+ * an un-mount was completed successfully. If not, the process of mounting
 * incorparates additional checking and fixing of on-flash data structures.
 * UBIFS always cleans away all remnants of an unclean un-mount, so that
 * errors do not accumulate. However UBIFS defers recovery if it is mounted
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 333e181ee987..943ad5624530 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1842,22 +1842,32 @@ const struct super_operations ubifs_super_operations = {
 * @name: UBI volume name
 * @mode: UBI volume open mode
 *
- * There are several ways to specify UBI volumes when mounting UBIFS:
+ * The primary method of mounting UBIFS is by specifying the UBI volume
- * o ubiX_Y    - UBI device number X, volume Y;
+ * character device node path. However, UBIFS may also be mounted without any
- * o ubiY      - UBI device number 0, volume Y;
+ * character device node using one of the following methods:
+ *
+ * o ubiX_Y    - mount UBI device number X, volume Y;
+ * o ubiY      - mount UBI device number 0, volume Y;
 * o ubiX:NAME - mount UBI device X, volume with name NAME;
 * o ubi:NAME  - mount UBI device 0, volume with name NAME.
 *
 * Alternative '!' separator may be used instead of ':' (because some shells
 * like busybox may interpret ':' as an NFS host name separator). This function
- * returns ubi volume object in case of success and a negative error code in
+ * returns UBI volume description object in case of success and a negative
- * case of failure.
+ * error code in case of failure.
 */
 static struct ubi_volume_desc *open_ubi(const char *name, int mode)
 {
+        struct ubi_volume_desc *ubi;
        int dev, vol;
        char *endptr;
+        /* First, try to open using the device node path method */
+        ubi = ubi_open_volume_path(name, mode);
+        if (!IS_ERR(ubi))
+                return ubi;
+        /* Try the "nodev" method */
        if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
                return ERR_PTR(-EINVAL);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index c6ad7c7e3ee9..05ac0fe9c4d3 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size)
        if (count == 0)
                return NULL;
        
-        acl = posix_acl_alloc(count, GFP_KERNEL);
+        acl = posix_acl_alloc(count, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);
        acl_e = acl->a_entries;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30eea74dc..70f989895d15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -904,16 +904,9 @@ xfs_convert_page(
        if (startio) {
                if (count) {
-                        struct backing_dev_info *bdi;
-                        bdi = inode->i_mapping->backing_dev_info;
                        wbc->nr_to_write--;
-                        if (bdi_write_congested(bdi)) {
+                        if (wbc->nr_to_write <= 0)
-                                wbc->encountered_congestion = 1;
-                                done = 1;
-                        } else if (wbc->nr_to_write <= 0) {
                                done = 1;
-                        }
                }
                xfs_start_page_writeback(page, !page_dirty, count);
        }
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index c5bc67c4e3bb..7bb5092d6ae4 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -55,170 +55,140 @@ xfs_stats_clear_proc_handler(
 static ctl_table xfs_table[] = {
        {
-                .ctl_name       = XFS_SGID_INHERIT,
                .procname       = "irix_sgid_inherit",
                .data           = &xfs_params.sgid_inherit.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.sgid_inherit.min,
                .extra2         = &xfs_params.sgid_inherit.max
        },
        {
-                .ctl_name       = XFS_SYMLINK_MODE,
                .procname       = "irix_symlink_mode",
                .data           = &xfs_params.symlink_mode.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.symlink_mode.min,
                .extra2         = &xfs_params.symlink_mode.max
        },
        {
-                .ctl_name       = XFS_PANIC_MASK,
                .procname       = "panic_mask",
                .data           = &xfs_params.panic_mask.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.panic_mask.min,
                .extra2         = &xfs_params.panic_mask.max
        },
        {
-                .ctl_name       = XFS_ERRLEVEL,
                .procname       = "error_level",
                .data           = &xfs_params.error_level.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.error_level.min,
                .extra2         = &xfs_params.error_level.max
        },
        {
-                .ctl_name       = XFS_SYNCD_TIMER,
                .procname       = "xfssyncd_centisecs",
                .data           = &xfs_params.syncd_timer.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.syncd_timer.min,
                .extra2         = &xfs_params.syncd_timer.max
        },
        {
-                .ctl_name       = XFS_INHERIT_SYNC,
                .procname       = "inherit_sync",
                .data           = &xfs_params.inherit_sync.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_sync.min,
                .extra2         = &xfs_params.inherit_sync.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NODUMP,
                .procname       = "inherit_nodump",
                .data           = &xfs_params.inherit_nodump.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_nodump.min,
                .extra2         = &xfs_params.inherit_nodump.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NOATIME,
                .procname       = "inherit_noatime",
                .data           = &xfs_params.inherit_noatim.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_noatim.min,
                .extra2         = &xfs_params.inherit_noatim.max
        },
        {
-                .ctl_name       = XFS_BUF_TIMER,
                .procname       = "xfsbufd_centisecs",
                .data           = &xfs_params.xfs_buf_timer.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.xfs_buf_timer.min,
                .extra2         = &xfs_params.xfs_buf_timer.max
        },
        {
-                .ctl_name       = XFS_BUF_AGE,
                .procname       = "age_buffer_centisecs",
                .data           = &xfs_params.xfs_buf_age.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.xfs_buf_age.min,
                .extra2         = &xfs_params.xfs_buf_age.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NOSYM,
                .procname       = "inherit_nosymlinks",
                .data           = &xfs_params.inherit_nosym.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_nosym.min,
                .extra2         = &xfs_params.inherit_nosym.max
        },
        {
-                .ctl_name       = XFS_ROTORSTEP,
                .procname       = "rotorstep",
                .data           = &xfs_params.rotorstep.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.rotorstep.min,
                .extra2         = &xfs_params.rotorstep.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NODFRG,
                .procname       = "inherit_nodefrag",
                .data           = &xfs_params.inherit_nodfrg.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_nodfrg.min,
                .extra2         = &xfs_params.inherit_nodfrg.max
        },
        {
-                .ctl_name       = XFS_FILESTREAM_TIMER,
                .procname       = "filestream_centisecs",
                .data           = &xfs_params.fstrm_timer.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.fstrm_timer.min,
                .extra2         = &xfs_params.fstrm_timer.max,
        },
        /* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
        {
-                .ctl_name       = XFS_STATS_CLEAR,
                .procname       = "stats_clear",
                .data           = &xfs_params.stats_clear.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &xfs_stats_clear_proc_handler,
+                .proc_handler   = xfs_stats_clear_proc_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.stats_clear.min,
                .extra2         = &xfs_params.stats_clear.max
        },
@@ -229,7 +199,6 @@ static ctl_table xfs_table[] = {
 static ctl_table xfs_dir_table[] = {
        {
-                .ctl_name       = FS_XFS,
                .procname       = "xfs",
                .mode           = 0555,
                .child          = xfs_table
@@ -239,7 +208,6 @@ static ctl_table xfs_dir_table[] = {
 static ctl_table xfs_root_table[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = xfs_dir_table
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 6533ead9b889..a2c16bcee90b 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -98,7 +98,7 @@ typedef struct xfs_dquot {
 #define dq_flags        q_lists.dqm_flags
 /*
- * Lock hierachy for q_qlock:
+ * Lock hierarchy for q_qlock:
 *      XFS_QLOCK_NORMAL is the implicit default,
 *      XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
 */