182 files changed, 4772 insertions, 3948 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c9340573..c30dfc006108 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 #include <linux/uio.h>
 #define DEBUG 0
@@ -32,6 +33,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
+#define AIO_BATCH_HASH_BITS     3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE     (1 << AIO_BATCH_HASH_BITS)
+/*
+ * One mapping remembered while io_submit() runs; aio_batch_free() later
+ * calls blk_run_address_space() on it and drops the inode reference.
+ */
+struct aio_batch_entry {
+        struct hlist_node list;         /* link in a batch_hash bucket */
+        struct address_space *mapping;  /* mapping to run at batch end */
+};
+/* Pool backing aio_batch_entry allocations; created in aio_setup(). */
+static mempool_t *abe_pool;
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
@@ -73,6 +85,8 @@ static int __init aio_setup(void)
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        aio_wq = create_workqueue("aio");
+        abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+        BUG_ON(!abe_pool);
        pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
@@ -1531,8 +1545,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
        return 1;
 }
+/*
+ * aio_batch_add - remember @mapping so aio_batch_free() can run it once.
+ * @mapping:    address_space of the file the request was submitted against
+ * @batch_hash: caller's table of AIO_BATCH_HASH_SIZE buckets
+ *
+ * Does nothing if @mapping is already in the table.  Otherwise a reference
+ * on mapping->host is taken with igrab() and a new entry is hashed in; both
+ * the reference and the entry are released by aio_batch_free().
+ *
+ * NOTE(review): abe_pool is created with a single reserved element while one
+ * io_submit() call can hold several entries until aio_batch_free() runs --
+ * confirm mempool_alloc(GFP_KERNEL) cannot stall waiting on entries that
+ * only the caller itself would free.
+ */
+static void aio_batch_add(struct address_space *mapping,
+                          struct hlist_head *batch_hash)
+{
+        struct aio_batch_entry *abe;
+        struct hlist_node *pos;
+        struct inode *inode;
+        unsigned bucket;
+        bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+        hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+                if (abe->mapping == mapping)
+                        return;
+        }
+        abe = mempool_alloc(abe_pool, GFP_KERNEL);
+        /* Take the reference outside BUG_ON() so the side effect is explicit. */
+        inode = igrab(mapping->host);
+        BUG_ON(!inode);
+        abe->mapping = mapping;
+        hlist_add_head(&abe->list, &batch_hash[bucket]);
+}
+/*
+ * aio_batch_free - run and release every mapping collected in @batch_hash.
+ * @batch_hash: table of AIO_BATCH_HASH_SIZE buckets filled by aio_batch_add()
+ *
+ * For each entry: call blk_run_address_space() on the mapping, drop the
+ * inode reference taken when the entry was added, unlink the entry and
+ * return it to abe_pool.
+ */
+static void aio_batch_free(struct hlist_head *batch_hash)
+{
+        int bucket;
+        for (bucket = 0; bucket < AIO_BATCH_HASH_SIZE; bucket++) {
+                struct hlist_head *head = &batch_hash[bucket];
+                struct hlist_node *cur, *next;
+                struct aio_batch_entry *entry;
+                hlist_for_each_entry_safe(entry, cur, next, head, list) {
+                        struct address_space *mapping = entry->mapping;
+                        blk_run_address_space(mapping);
+                        iput(mapping->host);
+                        hlist_del(&entry->list);
+                        mempool_free(entry, abe_pool);
+                }
+        }
+}
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-                         struct iocb *iocb)
+                         struct iocb *iocb, struct hlist_head *batch_hash)
 {
        struct kiocb *req;
        struct file *file;
@@ -1608,6 +1658,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                        ;
        }
        spin_unlock_irq(&ctx->ctx_lock);
+        if (req->ki_opcode == IOCB_CMD_PREAD ||
+            req->ki_opcode == IOCB_CMD_PREADV ||
+            req->ki_opcode == IOCB_CMD_PWRITE ||
+            req->ki_opcode == IOCB_CMD_PWRITEV)
+                aio_batch_add(file->f_mapping, batch_hash);
        aio_put_req(req);       /* drop extra ref to req */
        return 0;
@@ -1635,6 +1691,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
        struct kioctx *ctx;
        long ret = 0;
        int i;
+        struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
        if (unlikely(nr < 0))
                return -EINVAL;
@@ -1666,10 +1723,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
                        break;
                }
-                ret = io_submit_one(ctx, user_iocb, &tmp);
+                ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
                if (ret)
                        break;
        }
+        aio_batch_free(batch_hash);
        put_ioctx(ctx);
        return i ? i : ret;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e4..d15ea1790bfb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -767,7 +767,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        
        current->mm->start_stack = bprm->p;
-        /* Now we do a little grungy work by mmaping the ELF image into
+        /* Now we do a little grungy work by mmapping the ELF image into
           the correct location in memory. */
        for(i = 0, elf_ppnt = elf_phdata;
            i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..76e6713abf94 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(bio_init);
 *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
 *   fall back to just using @kmalloc to allocate the required memory.
 *
- *   Note that the caller must set ->bi_destructor on succesful return
+ *   Note that the caller must set ->bi_destructor on successful return
 *   of a bio, to do the appropriate freeing of the bio once the reference
 *   count drops to zero.
 **/
@@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio)
        }
 }
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+/* Call flush_dcache_page() on the page behind every segment of @bi. */
+void bio_flush_dcache_pages(struct bio *bi)
+{
+        struct bio_vec *seg;
+        int idx;
+        bio_for_each_segment(seg, bi, idx) {
+                flush_dcache_page(seg->bv_page);
+        }
+}
+EXPORT_SYMBOL(bio_flush_dcache_pages);
+#endif
 /**
 * bio_endio - end I/O on a bio
 * @bio:        bio
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8bed0557d88c..73d6a735b8f3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 
 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
-        return sync_blockdev(I_BDEV(filp->f_mapping->host));
+        struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+        int error;
+        /* sync_blockdev() must succeed before a flush is attempted. */
+        error = sync_blockdev(bdev);
+        if (error)
+                return error;
+        /*
+         * -EOPNOTSUPP from blkdev_issue_flush() is deliberately not
+         * reported to the caller as an fsync failure.
+         */
+        error = blkdev_issue_flush(bdev, NULL);
+        if (error == -EOPNOTSUPP)
+                error = 0;
+        return error;
 }
 /*
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ccbdcb54ec5d..46bea0f4dc7b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -256,7 +256,7 @@ out:
 * Insert @em into @tree or perform a simple forward/backward merge with
 * existing mappings.  The extent_map struct passed in will be inserted
 * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was sucessfull.
+ * reference dropped if the merge attempt was successful.
 */
 int add_extent_mapping(struct extent_map_tree *tree,
                       struct extent_map *em)
diff --git a/fs/cifs/README b/fs/cifs/README
index 79c1a93400be..a727b7cb075f 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -423,7 +423,7 @@ A partial list of the supported mount options follows:
                source name to use to represent the client netbios machine 
                name when doing the RFC1001 netbios session initialize.
  direct        Do not do inode data caching on files opened on this mount.
-                This precludes mmaping files on this mount. In some cases
+                This precludes mmapping files on this mount. In some cases
                with fast networks and little or no caching benefits on the
                client (e.g. when the application is doing large sequential
                reads bigger than page size without rereading the same data) 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039c..4b35f7ec0583 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -39,7 +39,7 @@
 /*
 * MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurently. It also matches the most common
+ * on one socket concurrently. It also matches the most common
 * value of max multiplex returned by servers.  We may
 * eventually want to use the negotiated value (in case
 * future servers can handle more) when we are more confident that
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 2d07f890a842..3877737f96a6 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1227,7 +1227,7 @@ typedef struct smb_com_setattr_rsp {
 /* empty wct response to setattr */
 /*******************************************************/
-/* NT Transact structure defintions follow             */
+/* NT Transact structure definitions follow            */
 /* Currently only ioctl, acl (get security descriptor) */
 /* and notify are implemented                          */
 /*******************************************************/
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cababd8a52df..cf18ee765590 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -914,8 +914,8 @@ undo_setattr:
 /*
 * If dentry->d_inode is null (usually meaning the cached dentry
 * is a negative dentry) then we would attempt a standard SMB delete, but
- * if that fails we can not attempt the fall back mechanisms on EACESS
+ * if that fails we can not attempt the fall back mechanisms on EACCES
- * but will return the EACESS to the caller.  Note that the VFS does not call
+ * but will return the EACCES to the caller. Note that the VFS does not call
 * unlink on negative dentries currently.
 */
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 224a1f478966..b6b6dcb500bf 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -371,7 +371,7 @@ E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
        smbhash(p24 + 16, c8, p21 + 14, 1);
 }
-#if 0 /* currently unsued */
+#if 0 /* currently unused */
 static void
 D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 43c96ce29614..c6405ce3c50e 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -17,28 +17,25 @@ static struct ctl_table_header *fs_table_header;
 static ctl_table coda_table[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "timeout",
                .data           = &coda_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "hard",
                .data           = &coda_hard,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "fake_statfs",
                .data           = &coda_fake_statfs,
                .maxlen         = sizeof(int),
                .mode           = 0600,
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {}
 };
@@ -46,7 +43,6 @@ static ctl_table coda_table[] = {
 #ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "coda",
                .mode           = 0555,
                .child          = coda_table
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d84e7058c298..2346895b3a77 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -246,428 +246,6 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned
        return err;
 }
-#ifdef CONFIG_NET
-static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct compat_timeval __user *up = compat_ptr(arg);
-        struct timeval ktv;
-        mm_segment_t old_fs = get_fs();
-        int err;
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long)&ktv);
-        set_fs(old_fs);
-        if(!err) {
-                err = put_user(ktv.tv_sec, &up->tv_sec);
-                err |= __put_user(ktv.tv_usec, &up->tv_usec);
-        }
-        return err;
-}
-static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct compat_timespec __user *up = compat_ptr(arg);
-        struct timespec kts;
-        mm_segment_t old_fs = get_fs();
-        int err;
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long)&kts);
-        set_fs(old_fs);
-        if (!err) {
-                err = put_user(kts.tv_sec, &up->tv_sec);
-                err |= __put_user(kts.tv_nsec, &up->tv_nsec);
-        }
-        return err;
-}
-struct ifmap32 {
-        compat_ulong_t mem_start;
-        compat_ulong_t mem_end;
-        unsigned short base_addr;
-        unsigned char irq;
-        unsigned char dma;
-        unsigned char port;
-};
-struct ifreq32 {
-#define IFHWADDRLEN     6
-#define IFNAMSIZ        16
-        union {
-                char    ifrn_name[IFNAMSIZ];            /* if name, e.g. "en0" */
-        } ifr_ifrn;
-        union {
-                struct  sockaddr ifru_addr;
-                struct  sockaddr ifru_dstaddr;
-                struct  sockaddr ifru_broadaddr;
-                struct  sockaddr ifru_netmask;
-                struct  sockaddr ifru_hwaddr;
-                short   ifru_flags;
-                compat_int_t     ifru_ivalue;
-                compat_int_t     ifru_mtu;
-                struct  ifmap32 ifru_map;
-                char    ifru_slave[IFNAMSIZ];   /* Just fits the size */
-                char    ifru_newname[IFNAMSIZ];
-                compat_caddr_t ifru_data;
-            /* XXXX? ifru_settings should be here */
-        } ifr_ifru;
-};
-struct ifconf32 {
-        compat_int_t    ifc_len;                        /* size of buffer       */
-        compat_caddr_t  ifcbuf;
-};
-static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq __user *uifr;
-        int err;
-        uifr = compat_alloc_user_space(sizeof(struct ifreq));
-        if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)))
-                return -EFAULT;
-        err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
-        if (err)
-                return err;
-        if (copy_in_user(compat_ptr(arg), uifr, sizeof(struct ifreq32)))
-                return -EFAULT;
-        return 0;
-}
-static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifconf32 ifc32;
-        struct ifconf ifc;
-        struct ifconf __user *uifc;
-        struct ifreq32 __user *ifr32;
-        struct ifreq __user *ifr;
-        unsigned int i, j;
-        int err;
-        if (copy_from_user(&ifc32, compat_ptr(arg), sizeof(struct ifconf32)))
-                return -EFAULT;
-        if (ifc32.ifcbuf == 0) {
-                ifc32.ifc_len = 0;
-                ifc.ifc_len = 0;
-                ifc.ifc_req = NULL;
-                uifc = compat_alloc_user_space(sizeof(struct ifconf));
-        } else {
-                size_t len =((ifc32.ifc_len / sizeof (struct ifreq32)) + 1) *
-                        sizeof (struct ifreq);
-                uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
-                ifc.ifc_len = len;
-                ifr = ifc.ifc_req = (void __user *)(uifc + 1);
-                ifr32 = compat_ptr(ifc32.ifcbuf);
-                for (i = 0; i < ifc32.ifc_len; i += sizeof (struct ifreq32)) {
-                        if (copy_in_user(ifr, ifr32, sizeof(struct ifreq32)))
-                                return -EFAULT;
-                        ifr++;
-                        ifr32++; 
-                }
-        }
-        if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
-                return -EFAULT;
-        err = sys_ioctl (fd, SIOCGIFCONF, (unsigned long)uifc); 
-        if (err)
-                return err;
-        if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) 
-                return -EFAULT;
-        ifr = ifc.ifc_req;
-        ifr32 = compat_ptr(ifc32.ifcbuf);
-        for (i = 0, j = 0;
-             i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
-             i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
-                if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
-                        return -EFAULT;
-                ifr32++;
-                ifr++;
-        }
-        if (ifc32.ifcbuf == 0) {
-                /* Translate from 64-bit structure multiple to
-                 * a 32-bit one.
-                 */
-                i = ifc.ifc_len;
-                i = ((i / sizeof(struct ifreq)) * sizeof(struct ifreq32));
-                ifc32.ifc_len = i;
-        } else {
-                ifc32.ifc_len = i;
-        }
-        if (copy_to_user(compat_ptr(arg), &ifc32, sizeof(struct ifconf32)))
-                return -EFAULT;
-        return 0;
-}
-static int ethtool_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq __user *ifr;
-        struct ifreq32 __user *ifr32;
-        u32 data;
-        void __user *datap;
-        
-        ifr = compat_alloc_user_space(sizeof(*ifr));
-        ifr32 = compat_ptr(arg);
-        if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
-                return -EFAULT;
-        if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-                return -EFAULT;
-        datap = compat_ptr(data);
-        if (put_user(datap, &ifr->ifr_ifru.ifru_data))
-                return -EFAULT;
-        return sys_ioctl(fd, cmd, (unsigned long) ifr);
-}
-static int bond_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq kifr;
-        struct ifreq __user *uifr;
-        struct ifreq32 __user *ifr32 = compat_ptr(arg);
-        mm_segment_t old_fs;
-        int err;
-        u32 data;
-        void __user *datap;
-        switch (cmd) {
-        case SIOCBONDENSLAVE:
-        case SIOCBONDRELEASE:
-        case SIOCBONDSETHWADDR:
-        case SIOCBONDCHANGEACTIVE:
-                if (copy_from_user(&kifr, ifr32, sizeof(struct ifreq32)))
-                        return -EFAULT;
-                old_fs = get_fs();
-                set_fs (KERNEL_DS);
-                err = sys_ioctl (fd, cmd, (unsigned long)&kifr);
-                set_fs (old_fs);
-                return err;
-        case SIOCBONDSLAVEINFOQUERY:
-        case SIOCBONDINFOQUERY:
-                uifr = compat_alloc_user_space(sizeof(*uifr));
-                if (copy_in_user(&uifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
-                        return -EFAULT;
-                if (get_user(data, &ifr32->ifr_ifru.ifru_data))
-                        return -EFAULT;
-                datap = compat_ptr(data);
-                if (put_user(datap, &uifr->ifr_ifru.ifru_data))
-                        return -EFAULT;
-                return sys_ioctl (fd, cmd, (unsigned long)uifr);
-        default:
-                return -EINVAL;
-        };
-}
-static int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq __user *u_ifreq64;
-        struct ifreq32 __user *u_ifreq32 = compat_ptr(arg);
-        char tmp_buf[IFNAMSIZ];
-        void __user *data64;
-        u32 data32;
-        if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
-                           IFNAMSIZ))
-                return -EFAULT;
-        if (__get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
-                return -EFAULT;
-        data64 = compat_ptr(data32);
-        u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
-        /* Don't check these user accesses, just let that get trapped
-         * in the ioctl handler instead.
-         */
-        if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
-                         IFNAMSIZ))
-                return -EFAULT;
-        if (__put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
-                return -EFAULT;
-        return sys_ioctl(fd, cmd, (unsigned long) u_ifreq64);
-}
-static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct ifreq ifr;
-        struct ifreq32 __user *uifr32;
-        struct ifmap32 __user *uifmap32;
-        mm_segment_t old_fs;
-        int err;
-        
-        uifr32 = compat_ptr(arg);
-        uifmap32 = &uifr32->ifr_ifru.ifru_map;
-        switch (cmd) {
-        case SIOCSIFMAP:
-                err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
-                err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-                err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-                err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-                err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq);
-                err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma);
-                err |= __get_user(ifr.ifr_map.port, &uifmap32->port);
-                if (err)
-                        return -EFAULT;
-                break;
-        case SIOCSHWTSTAMP:
-                if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
-                        return -EFAULT;
-                ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
-                break;
-        default:
-                if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
-                        return -EFAULT;
-                break;
-        }
-        old_fs = get_fs();
-        set_fs (KERNEL_DS);
-        err = sys_ioctl (fd, cmd, (unsigned long)&ifr);
-        set_fs (old_fs);
-        if (!err) {
-                switch (cmd) {
-                /* TUNSETIFF is defined as _IOW, it should be _IORW
-                 * as the data is copied back to user space, but that
-                 * cannot be fixed without breaking all existing apps.
-                 */
-                case TUNSETIFF:
-                case TUNGETIFF:
-                case SIOCGIFFLAGS:
-                case SIOCGIFMETRIC:
-                case SIOCGIFMTU:
-                case SIOCGIFMEM:
-                case SIOCGIFHWADDR:
-                case SIOCGIFINDEX:
-                case SIOCGIFADDR:
-                case SIOCGIFBRDADDR:
-                case SIOCGIFDSTADDR:
-                case SIOCGIFNETMASK:
-                case SIOCGIFTXQLEN:
-                        if (copy_to_user(uifr32, &ifr, sizeof(*uifr32)))
-                                return -EFAULT;
-                        break;
-                case SIOCGIFMAP:
-                        err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
-                        err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
-                        err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
-                        err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
-                        err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq);
-                        err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma);
-                        err |= __put_user(ifr.ifr_map.port, &uifmap32->port);
-                        if (err)
-                                err = -EFAULT;
-                        break;
-                }
-        }
-        return err;
-}
-struct rtentry32 {
-        u32             rt_pad1;
-        struct sockaddr rt_dst;         /* target address               */
-        struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
-        struct sockaddr rt_genmask;     /* target network mask (IP)     */
-        unsigned short  rt_flags;
-        short           rt_pad2;
-        u32             rt_pad3;
-        unsigned char   rt_tos;
-        unsigned char   rt_class;
-        short           rt_pad4;
-        short           rt_metric;      /* +1 for binary compatibility! */
-        /* char * */ u32 rt_dev;        /* forcing the device at add    */
-        u32             rt_mtu;         /* per route MTU/Window         */
-        u32             rt_window;      /* Window clamping              */
-        unsigned short  rt_irtt;        /* Initial RTT                  */
-};
-struct in6_rtmsg32 {
-        struct in6_addr         rtmsg_dst;
-        struct in6_addr         rtmsg_src;
-        struct in6_addr         rtmsg_gateway;
-        u32                     rtmsg_type;
-        u16                     rtmsg_dst_len;
-        u16                     rtmsg_src_len;
-        u32                     rtmsg_metric;
-        u32                     rtmsg_info;
-        u32                     rtmsg_flags;
-        s32                     rtmsg_ifindex;
-};
-static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        int ret;
-        void *r = NULL;
-        struct in6_rtmsg r6;
-        struct rtentry r4;
-        char devname[16];
-        u32 rtdev;
-        mm_segment_t old_fs = get_fs();
-        
-        struct socket *mysock = sockfd_lookup(fd, &ret);
-        if (mysock && mysock->sk && mysock->sk->sk_family == AF_INET6) { /* ipv6 */
-                struct in6_rtmsg32 __user *ur6 = compat_ptr(arg);
-                ret = copy_from_user (&r6.rtmsg_dst, &(ur6->rtmsg_dst),
-                        3 * sizeof(struct in6_addr));
-                ret |= __get_user (r6.rtmsg_type, &(ur6->rtmsg_type));
-                ret |= __get_user (r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
-                ret |= __get_user (r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
-                ret |= __get_user (r6.rtmsg_metric, &(ur6->rtmsg_metric));
-                ret |= __get_user (r6.rtmsg_info, &(ur6->rtmsg_info));
-                ret |= __get_user (r6.rtmsg_flags, &(ur6->rtmsg_flags));
-                ret |= __get_user (r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
-                
-                r = (void *) &r6;
-        } else { /* ipv4 */
-                struct rtentry32 __user *ur4 = compat_ptr(arg);
-                ret = copy_from_user (&r4.rt_dst, &(ur4->rt_dst),
-                                        3 * sizeof(struct sockaddr));
-                ret |= __get_user (r4.rt_flags, &(ur4->rt_flags));
-                ret |= __get_user (r4.rt_metric, &(ur4->rt_metric));
-                ret |= __get_user (r4.rt_mtu, &(ur4->rt_mtu));
-                ret |= __get_user (r4.rt_window, &(ur4->rt_window));
-                ret |= __get_user (r4.rt_irtt, &(ur4->rt_irtt));
-                ret |= __get_user (rtdev, &(ur4->rt_dev));
-                if (rtdev) {
-                        ret |= copy_from_user (devname, compat_ptr(rtdev), 15);
-                        r4.rt_dev = devname; devname[15] = 0;
-                } else
-                        r4.rt_dev = NULL;
-                r = (void *) &r4;
-        }
-        if (ret) {
-                ret = -EFAULT;
-                goto out;
-        }
-        set_fs (KERNEL_DS);
-        ret = sys_ioctl (fd, cmd, (unsigned long) r);
-        set_fs (old_fs);
-out:
-        if (mysock)
-                sockfd_put(mysock);
-        return ret;
-}
-#endif
 #ifdef CONFIG_BLOCK
 typedef struct sg_io_hdr32 {
        compat_int_t interface_id;      /* [i] 'S' for SCSI generic (required) */
@@ -1212,170 +790,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a
        return err;
 }
-struct atmif_sioc32 {
-        compat_int_t    number;
-        compat_int_t    length;
-        compat_caddr_t  arg;
-};
-struct atm_iobuf32 {
-        compat_int_t    length;
-        compat_caddr_t  buffer;
-};
-#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct atmif_sioc32)
-#define ATM_GETNAMES32    _IOW('a', ATMIOC_ITF+3, struct atm_iobuf32)
-#define ATM_GETTYPE32     _IOW('a', ATMIOC_ITF+4, struct atmif_sioc32)
-#define ATM_GETESI32      _IOW('a', ATMIOC_ITF+5, struct atmif_sioc32)
-#define ATM_GETADDR32     _IOW('a', ATMIOC_ITF+6, struct atmif_sioc32)
-#define ATM_RSTADDR32     _IOW('a', ATMIOC_ITF+7, struct atmif_sioc32)
-#define ATM_ADDADDR32     _IOW('a', ATMIOC_ITF+8, struct atmif_sioc32)
-#define ATM_DELADDR32     _IOW('a', ATMIOC_ITF+9, struct atmif_sioc32)
-#define ATM_GETCIRANGE32  _IOW('a', ATMIOC_ITF+10, struct atmif_sioc32)
-#define ATM_SETCIRANGE32  _IOW('a', ATMIOC_ITF+11, struct atmif_sioc32)
-#define ATM_SETESI32      _IOW('a', ATMIOC_ITF+12, struct atmif_sioc32)
-#define ATM_SETESIF32     _IOW('a', ATMIOC_ITF+13, struct atmif_sioc32)
-#define ATM_GETSTAT32     _IOW('a', ATMIOC_SARCOM+0, struct atmif_sioc32)
-#define ATM_GETSTATZ32    _IOW('a', ATMIOC_SARCOM+1, struct atmif_sioc32)
-#define ATM_GETLOOP32     _IOW('a', ATMIOC_SARCOM+2, struct atmif_sioc32)
-#define ATM_SETLOOP32     _IOW('a', ATMIOC_SARCOM+3, struct atmif_sioc32)
-#define ATM_QUERYLOOP32   _IOW('a', ATMIOC_SARCOM+4, struct atmif_sioc32)
-static struct {
-        unsigned int cmd32;
-        unsigned int cmd;
-} atm_ioctl_map[] = {
-        { ATM_GETLINKRATE32, ATM_GETLINKRATE },
-        { ATM_GETNAMES32,    ATM_GETNAMES },
-        { ATM_GETTYPE32,     ATM_GETTYPE },
-        { ATM_GETESI32,      ATM_GETESI },
-        { ATM_GETADDR32,     ATM_GETADDR },
-        { ATM_RSTADDR32,     ATM_RSTADDR },
-        { ATM_ADDADDR32,     ATM_ADDADDR },
-        { ATM_DELADDR32,     ATM_DELADDR },
-        { ATM_GETCIRANGE32,  ATM_GETCIRANGE },
-        { ATM_SETCIRANGE32,  ATM_SETCIRANGE },
-        { ATM_SETESI32,      ATM_SETESI },
-        { ATM_SETESIF32,     ATM_SETESIF },
-        { ATM_GETSTAT32,     ATM_GETSTAT },
-        { ATM_GETSTATZ32,    ATM_GETSTATZ },
-        { ATM_GETLOOP32,     ATM_GETLOOP },
-        { ATM_SETLOOP32,     ATM_SETLOOP },
-        { ATM_QUERYLOOP32,   ATM_QUERYLOOP }
-};
-#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
-static int do_atm_iobuf(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct atm_iobuf   __user *iobuf;
-        struct atm_iobuf32 __user *iobuf32;
-        u32 data;
-        void __user *datap;
-        int len, err;
-        iobuf = compat_alloc_user_space(sizeof(*iobuf));
-        iobuf32 = compat_ptr(arg);
-        if (get_user(len, &iobuf32->length) ||
-            get_user(data, &iobuf32->buffer))
-                return -EFAULT;
-        datap = compat_ptr(data);
-        if (put_user(len, &iobuf->length) ||
-            put_user(datap, &iobuf->buffer))
-                return -EFAULT;
-        err = sys_ioctl(fd, cmd, (unsigned long)iobuf);
-        if (!err) {
-                if (copy_in_user(&iobuf32->length, &iobuf->length,
-                                 sizeof(int)))
-                        err = -EFAULT;
-        }
-        return err;
-}
-static int do_atmif_sioc(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct atmif_sioc   __user *sioc;
-        struct atmif_sioc32 __user *sioc32;
-        u32 data;
-        void __user *datap;
-        int err;
-        
-        sioc = compat_alloc_user_space(sizeof(*sioc));
-        sioc32 = compat_ptr(arg);
-        if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
-            get_user(data, &sioc32->arg))
-                return -EFAULT;
-        datap = compat_ptr(data);
-        if (put_user(datap, &sioc->arg))
-                return -EFAULT;
-        err = sys_ioctl(fd, cmd, (unsigned long) sioc);
-        if (!err) {
-                if (copy_in_user(&sioc32->length, &sioc->length,
-                                 sizeof(int)))
-                        err = -EFAULT;
-        }
-        return err;
-}
-static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg)
-{
-        int i;
-        unsigned int cmd = 0;
-        
-        switch (cmd32) {
-        case SONET_GETSTAT:
-        case SONET_GETSTATZ:
-        case SONET_GETDIAG:
-        case SONET_SETDIAG:
-        case SONET_CLRDIAG:
-        case SONET_SETFRAMING:
-        case SONET_GETFRAMING:
-        case SONET_GETFRSENSE:
-                return do_atmif_sioc(fd, cmd32, arg);
-        }
-        for (i = 0; i < NR_ATM_IOCTL; i++) {
-                if (cmd32 == atm_ioctl_map[i].cmd32) {
-                        cmd = atm_ioctl_map[i].cmd;
-                        break;
-                }
-        }
-        if (i == NR_ATM_IOCTL)
-                return -EINVAL;
-        
-        switch (cmd) {
-        case ATM_GETNAMES:
-                return do_atm_iobuf(fd, cmd, arg);
-            
-        case ATM_GETLINKRATE:
-        case ATM_GETTYPE:
-        case ATM_GETESI:
-        case ATM_GETADDR:
-        case ATM_RSTADDR:
-        case ATM_ADDADDR:
-        case ATM_DELADDR:
-        case ATM_GETCIRANGE:
-        case ATM_SETCIRANGE:
-        case ATM_SETESI:
-        case ATM_SETESIF:
-        case ATM_GETSTAT:
-        case ATM_GETSTATZ:
-        case ATM_GETLOOP:
-        case ATM_SETLOOP:
-        case ATM_QUERYLOOP:
-                return do_atmif_sioc(fd, cmd, arg);
-        }
-        return -EINVAL;
-}
 static __used int
 ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
@@ -1718,21 +1132,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
        return sys_ioctl(fd, cmd, (unsigned long)tdata);
 }
-/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
- * for some operations; this forces use of the newer bridge-utils that
- * use compatible ioctls
- */
-static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        u32 tmp;
-        if (get_user(tmp, (u32 __user *) arg))
-                return -EFAULT;
-        if (tmp == BRCTL_GET_VERSION)
-                return BRCTL_VERSION + 1;
-        return -EINVAL;
-}
 #define RTC_IRQP_READ32         _IOR('p', 0x0b, compat_ulong_t)
 #define RTC_IRQP_SET32          _IOW('p', 0x0c, compat_ulong_t)
 #define RTC_EPOCH_READ32        _IOR('p', 0x0d, compat_ulong_t)
@@ -1979,18 +1378,6 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
 COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
 COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
 #endif
-/* Big T */
-COMPATIBLE_IOCTL(TUNSETNOCSUM)
-COMPATIBLE_IOCTL(TUNSETDEBUG)
-COMPATIBLE_IOCTL(TUNSETPERSIST)
-COMPATIBLE_IOCTL(TUNSETOWNER)
-COMPATIBLE_IOCTL(TUNSETLINK)
-COMPATIBLE_IOCTL(TUNSETGROUP)
-COMPATIBLE_IOCTL(TUNGETFEATURES)
-COMPATIBLE_IOCTL(TUNSETOFFLOAD)
-COMPATIBLE_IOCTL(TUNSETTXFILTER)
-COMPATIBLE_IOCTL(TUNGETSNDBUF)
-COMPATIBLE_IOCTL(TUNSETSNDBUF)
 /* Big V */
 COMPATIBLE_IOCTL(VT_SETMODE)
 COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2032,30 +1419,6 @@ COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
 COMPATIBLE_IOCTL(MTIOCTOP)
 /* Socket level stuff */
 COMPATIBLE_IOCTL(FIOQSIZE)
-COMPATIBLE_IOCTL(FIOSETOWN)
-COMPATIBLE_IOCTL(SIOCSPGRP)
-COMPATIBLE_IOCTL(FIOGETOWN)
-COMPATIBLE_IOCTL(SIOCGPGRP)
-COMPATIBLE_IOCTL(SIOCATMARK)
-COMPATIBLE_IOCTL(SIOCSIFLINK)
-COMPATIBLE_IOCTL(SIOCSIFENCAP)
-COMPATIBLE_IOCTL(SIOCGIFENCAP)
-COMPATIBLE_IOCTL(SIOCSIFNAME)
-COMPATIBLE_IOCTL(SIOCSARP)
-COMPATIBLE_IOCTL(SIOCGARP)
-COMPATIBLE_IOCTL(SIOCDARP)
-COMPATIBLE_IOCTL(SIOCSRARP)
-COMPATIBLE_IOCTL(SIOCGRARP)
-COMPATIBLE_IOCTL(SIOCDRARP)
-COMPATIBLE_IOCTL(SIOCADDDLCI)
-COMPATIBLE_IOCTL(SIOCDELDLCI)
-COMPATIBLE_IOCTL(SIOCGMIIPHY)
-COMPATIBLE_IOCTL(SIOCGMIIREG)
-COMPATIBLE_IOCTL(SIOCSMIIREG)
-COMPATIBLE_IOCTL(SIOCGIFVLAN)
-COMPATIBLE_IOCTL(SIOCSIFVLAN)
-COMPATIBLE_IOCTL(SIOCBRADDBR)
-COMPATIBLE_IOCTL(SIOCBRDELBR)
 #ifdef CONFIG_BLOCK
 /* SG stuff */
 COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
@@ -2311,22 +1674,6 @@ COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
 /* SMB ioctls which do not need any translations */
 COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
-/* Little a */
-COMPATIBLE_IOCTL(ATMSIGD_CTRL)
-COMPATIBLE_IOCTL(ATMARPD_CTRL)
-COMPATIBLE_IOCTL(ATMLEC_CTRL)
-COMPATIBLE_IOCTL(ATMLEC_MCAST)
-COMPATIBLE_IOCTL(ATMLEC_DATA)
-COMPATIBLE_IOCTL(ATM_SETSC)
-COMPATIBLE_IOCTL(SIOCSIFATMTCP)
-COMPATIBLE_IOCTL(SIOCMKCLIP)
-COMPATIBLE_IOCTL(ATMARP_MKIP)
-COMPATIBLE_IOCTL(ATMARP_SETENTRY)
-COMPATIBLE_IOCTL(ATMARP_ENCAP)
-COMPATIBLE_IOCTL(ATMTCP_CREATE)
-COMPATIBLE_IOCTL(ATMTCP_REMOVE)
-COMPATIBLE_IOCTL(ATMMPC_CTRL)
-COMPATIBLE_IOCTL(ATMMPC_DATA)
 /* Watchdog */
 COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
 COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
@@ -2532,63 +1879,6 @@ COMPATIBLE_IOCTL(JSIOCGBUTTONS)
 COMPATIBLE_IOCTL(JSIOCGNAME(0))
 /* now things that need handlers */
-#ifdef CONFIG_NET
-HANDLE_IOCTL(SIOCGIFNAME, dev_ifname32)
-HANDLE_IOCTL(SIOCGIFCONF, dev_ifconf)
-HANDLE_IOCTL(SIOCGIFFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMETRIC, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMETRIC, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMTU, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMTU, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMEM, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMEM, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFHWADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFHWADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCADDMULTI, dev_ifsioc)
-HANDLE_IOCTL(SIOCDELMULTI, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFINDEX, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFMAP, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
-HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
-/* ioctls used by appletalk ddp.c */
-HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCDIFADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSARP, dev_ifsioc)
-HANDLE_IOCTL(SIOCDARP, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFBRDADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFBRDADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFDSTADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFDSTADDR, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFNETMASK, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFNETMASK, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFPFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
-HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
-HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
-HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
-HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
-HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
-HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDSETHWADDR, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDSLAVEINFOQUERY, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDINFOQUERY, bond_ioctl)
-HANDLE_IOCTL(SIOCBONDCHANGEACTIVE, bond_ioctl)
-HANDLE_IOCTL(SIOCADDRT, routing_ioctl)
-HANDLE_IOCTL(SIOCDELRT, routing_ioctl)
-HANDLE_IOCTL(SIOCBRADDIF, dev_ifsioc)
-HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
-/* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
-HANDLE_IOCTL(SIOCRTMSG, ret_einval)
-HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
-HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
-#endif
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
 HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
@@ -2613,31 +1903,6 @@ HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl)
 /* One SMB ioctl needs translations. */
 #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
 HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
-HANDLE_IOCTL(ATM_GETLINKRATE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETNAMES32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETTYPE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETESI32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_RSTADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_ADDADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_DELADDR32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETCIRANGE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETCIRANGE32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETESI32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETESIF32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETSTAT32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETSTATZ32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_GETLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_SETLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(ATM_QUERYLOOP32, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETSTAT, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETSTATZ, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_SETDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_CLRDIAG, do_atm_ioctl)
-HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
-HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
 #ifdef CONFIG_BLOCK
 /* loop */
@@ -2655,7 +1920,7 @@ COMPATIBLE_IOCTL(TIOCSLTC)
 #endif
 #ifdef TIOCSTART
 /*
- * For these two we have defintions in ioctls.h and/or termios.h on
+ * For these two we have definitions in ioctls.h and/or termios.h on
 * some architectures but no actual implemention.  Some applications
 * like bash call them if they are defined in the headers, so we provide
 * entries here to avoid syslog message spew.
@@ -2672,11 +1937,7 @@ COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
 HANDLE_IOCTL(I2C_FUNCS, w_long)
 HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
 HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
-/* bridge */
-HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
-HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
 /* Not implemented in the native kernel */
-IGNORE_IOCTL(SIOCGIFCOUNT)
 HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
 HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
@@ -2831,12 +2092,6 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
                        goto found_handler;
        }
-#ifdef CONFIG_NET
-        if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) &&
-            cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
-                error = siocdevprivate_ioctl(fd, cmd, arg);
-        } else
-#endif
        {
                static int count;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d22438ef7674..b486169f42bf 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -32,7 +32,9 @@ static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
-static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
+                                       void *data, const struct file_operations *fops)
 {
        struct inode *inode = new_inode(sb);
@@ -44,14 +46,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
                        init_special_inode(inode, mode, dev);
                        break;
                case S_IFREG:
-                        inode->i_fop = &debugfs_file_operations;
+                        inode->i_fop = fops ? fops : &debugfs_file_operations;
+                        inode->i_private = data;
                        break;
                case S_IFLNK:
                        inode->i_op = &debugfs_link_operations;
+                        inode->i_fop = fops;
+                        inode->i_private = data;
                        break;
                case S_IFDIR:
                        inode->i_op = &simple_dir_inode_operations;
-                        inode->i_fop = &simple_dir_operations;
+                        inode->i_fop = fops ? fops : &simple_dir_operations;
+                        inode->i_private = data;
                        /* directory inodes start off with i_nlink == 2
                         * (for "." entry) */
@@ -64,7 +70,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 /* SMP-safe */
 static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
-                         int mode, dev_t dev)
+                         int mode, dev_t dev, void *data,
+                         const struct file_operations *fops)
 {
        struct inode *inode;
        int error = -EPERM;
@@ -72,7 +79,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
        if (dentry->d_inode)
                return -EEXIST;
-        inode = debugfs_get_inode(dir->i_sb, mode, dev);
+        inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
        if (inode) {
                d_instantiate(dentry, inode);
                dget(dentry);
@@ -81,12 +88,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
        return error;
 }
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
+                         void *data, const struct file_operations *fops)
 {
        int res;
        mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
-        res = debugfs_mknod(dir, dentry, mode, 0);
+        res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
        if (!res) {
                inc_nlink(dir);
                fsnotify_mkdir(dir, dentry);
@@ -94,18 +102,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return res;
 }
-static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
+                        void *data, const struct file_operations *fops)
 {
        mode = (mode & S_IALLUGO) | S_IFLNK;
-        return debugfs_mknod(dir, dentry, mode, 0);
+        return debugfs_mknod(dir, dentry, mode, 0, data, fops);
 }
-static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode)
+static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
+                          void *data, const struct file_operations *fops)
 {
        int res;
        mode = (mode & S_IALLUGO) | S_IFREG;
-        res = debugfs_mknod(dir, dentry, mode, 0);
+        res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
        if (!res)
                fsnotify_create(dir, dentry);
        return res;
@@ -139,7 +149,9 @@ static struct file_system_type debug_fs_type = {
 static int debugfs_create_by_name(const char *name, mode_t mode,
                                  struct dentry *parent,
-                                  struct dentry **dentry)
+                                  struct dentry **dentry,
+                                  void *data,
+                                  const struct file_operations *fops)
 {
        int error = 0;
@@ -164,13 +176,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
        if (!IS_ERR(*dentry)) {
                switch (mode & S_IFMT) {
                case S_IFDIR:
-                        error = debugfs_mkdir(parent->d_inode, *dentry, mode);
+                        error = debugfs_mkdir(parent->d_inode, *dentry, mode,
+                                              data, fops);
                        break;
                case S_IFLNK:
-                        error = debugfs_link(parent->d_inode, *dentry, mode);
+                        error = debugfs_link(parent->d_inode, *dentry, mode,
+                                             data, fops);
                        break;
                default:
-                        error = debugfs_create(parent->d_inode, *dentry, mode);
+                        error = debugfs_create(parent->d_inode, *dentry, mode,
+                                               data, fops);
                        break;
                }
                dput(*dentry);
@@ -184,7 +199,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 /**
 * debugfs_create_file - create a file in the debugfs filesystem
 * @name: a pointer to a string containing the name of the file to create.
- * @mode: the permission that the file should have
+ * @mode: the permission that the file should have.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *          directory dentry if set.  If this paramater is NULL, then the
 *          file will be created in the root of the debugfs filesystem.
@@ -195,8 +210,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 *        this file.
 *
 * This is the basic "create a file" function for debugfs.  It allows for a
- * wide range of flexibility in createing a file, or a directory (if you
+ * wide range of flexibility in creating a file, or a directory (if you want
- * want to create a directory, the debugfs_create_dir() function is
+ * to create a directory, the debugfs_create_dir() function is
 * recommended to be used instead.)
 *
 * This function will return a pointer to a dentry if it succeeds.  This
@@ -221,19 +236,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
        if (error)
                goto exit;
-        error = debugfs_create_by_name(name, mode, parent, &dentry);
+        error = debugfs_create_by_name(name, mode, parent, &dentry,
+                                       data, fops);
        if (error) {
                dentry = NULL;
                simple_release_fs(&debugfs_mount, &debugfs_mount_count);
                goto exit;
        }
-        if (dentry->d_inode) {
-                if (data)
-                        dentry->d_inode->i_private = data;
-                if (fops)
-                        dentry->d_inode->i_fop = fops;
-        }
 exit:
        return dentry;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..b912270942fa 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
        if (dio->bio)
                dio_bio_submit(dio);
-        /* All IO is now issued, send it on its way */
-        blk_run_address_space(inode->i_mapping);
        /*
         * It is possible that, we return short IO due to end of file.
         * In that case, we need to release all the pages we got hold on.
@@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
            ((rw & READ) || (dio->result == dio->size)))
                ret = -EIOCBQUEUED;
-        if (ret != -EIOCBQUEUED)
+        if (ret != -EIOCBQUEUED) {
+                /* All IO is now issued, send it on its way */
+                blk_run_address_space(inode->i_mapping);
                dio_await_completion(dio);
+        }
        /*
         * Sync will always be dropping the final ref and completing the
@@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        int acquire_i_mutex = 0;
        if (rw & WRITE)
-                rw = WRITE_ODIRECT;
+                rw = WRITE_ODIRECT_PLUG;
        if (bdev)
                bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index fd9859f92fad..0df243850818 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -410,10 +410,10 @@ static struct config_group *make_cluster(struct config_group *g,
        struct dlm_comms *cms = NULL;
        void *gps = NULL;
-        cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
+        cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
-        gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+        gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
-        sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
+        sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
-        cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
+        cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
        if (!cl || !gps || !sps || !cms)
                goto fail;
@@ -482,9 +482,9 @@ static struct config_group *make_space(struct config_group *g, const char *name)
        struct dlm_nodes *nds = NULL;
        void *gps = NULL;
-        sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
+        sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
-        gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+        gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
-        nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
+        nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
        if (!sp || !gps || !nds)
                goto fail;
@@ -536,7 +536,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
 {
        struct dlm_comm *cm;
-        cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
+        cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
        if (!cm)
                return ERR_PTR(-ENOMEM);
@@ -569,7 +569,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
        struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
        struct dlm_node *nd;
-        nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
+        nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
        if (!nd)
                return ERR_PTR(-ENOMEM);
@@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
        if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
                return -ENOSPC;
-        addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+        addr = kzalloc(sizeof(*addr), GFP_NOFS);
        if (!addr)
                return -ENOMEM;
@@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
        ids_count = sp->members_count;
-        ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
+        ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
        if (!ids) {
                rv = -ENOMEM;
                goto out;
@@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
        if (!new_count)
                goto out_ids;
-        new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
+        new = kcalloc(new_count, sizeof(int), GFP_NOFS);
        if (!new) {
                kfree(ids);
                rv = -ENOMEM;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1c8bb8c3a82e..375a2359b3bf 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
        if (bucket >= ls->ls_rsbtbl_size)
                return NULL;
-        ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
+        ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
        if (!ri)
                return NULL;
        if (n == 0)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index c4dfa1dcc86f..7b84c1dbc82e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
        spin_unlock(&ls->ls_recover_list_lock);
        if (!found)
-                de = kzalloc(sizeof(struct dlm_direntry) + len,
+                de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
-                             ls->ls_allocation);
        return de;
 }
@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
        dlm_dir_clear(ls);
-        last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
+        last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
        if (!last_name)
                goto out;
@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
        if (namelen > DLM_RESNAME_MAXLEN)
                return -EINVAL;
-        de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
+        de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
        if (!de)
                return -ENOMEM;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d01ca0a711db..826d3dc6e0ab 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -473,7 +473,6 @@ struct dlm_ls {
        int                     ls_low_nodeid;
        int                     ls_total_weight;
        int                     *ls_node_array;
-        gfp_t                   ls_allocation;
        struct dlm_rsb          ls_stub_rsb;    /* for returning errors */
        struct dlm_lkb          ls_stub_lkb;    /* for returning errors */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index eb507c453c5f..9c0c1db1e105 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2689,7 +2689,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
           pass into lowcomms_commit and a message buffer (mb) that we
           write our data into */
-        mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+        mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
        if (!mh)
                return -ENOBUFS;
@@ -4512,7 +4512,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
        }
        if (flags & DLM_LKF_VALBLK) {
-                ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+                ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
                if (!ua->lksb.sb_lvbptr) {
                        kfree(ua);
                        __put_lkb(ls, lkb);
@@ -4582,7 +4582,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
        ua = lkb->lkb_ua;
        if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
-                ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+                ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
                if (!ua->lksb.sb_lvbptr) {
                        error = -ENOMEM;
                        goto out_put;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d489fcc86713..c010ecfc0d29 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -430,7 +430,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
        error = -ENOMEM;
-        ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+        ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
        if (!ls)
                goto out;
        memcpy(ls->ls_name, name, namelen);
@@ -443,11 +443,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
        if (flags & DLM_LSFL_TIMEWARN)
                set_bit(LSFL_TIMEWARN, &ls->ls_flags);
-        if (flags & DLM_LSFL_FS)
-                ls->ls_allocation = GFP_NOFS;
-        else
-                ls->ls_allocation = GFP_KERNEL;
        /* ls_exflags are forced to match among nodes, and we don't
           need to require all nodes to have some flags set */
        ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
@@ -456,7 +451,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
        size = dlm_config.ci_rsbtbl_size;
        ls->ls_rsbtbl_size = size;
-        ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+        ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
        if (!ls->ls_rsbtbl)
                goto out_lsfree;
        for (i = 0; i < size; i++) {
@@ -468,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
        size = dlm_config.ci_lkbtbl_size;
        ls->ls_lkbtbl_size = size;
-        ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+        ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
        if (!ls->ls_lkbtbl)
                goto out_rsbfree;
        for (i = 0; i < size; i++) {
@@ -480,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
        size = dlm_config.ci_dirtbl_size;
        ls->ls_dirtbl_size = size;
-        ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+        ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
        if (!ls->ls_dirtbl)
                goto out_lkbfree;
        for (i = 0; i < size; i++) {
@@ -527,7 +522,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
        mutex_init(&ls->ls_requestqueue_mutex);
        mutex_init(&ls->ls_clear_proc_locks);
-        ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+        ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
        if (!ls->ls_recover_buf)
                goto out_dirfree;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70736eb4b516..52cab160893c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1060,7 +1060,7 @@ static void init_local(void)
                if (dlm_our_addr(&sas, i))
                        break;
-                addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+                addr = kmalloc(sizeof(*addr), GFP_NOFS);
                if (!addr)
                        break;
                memcpy(addr, &sas, sizeof(*addr));
@@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void)
        struct sockaddr_storage localaddr;
        struct sctp_event_subscribe subscribe;
        int result = -EINVAL, num = 1, i, addr_len;
-        struct connection *con = nodeid2con(0, GFP_KERNEL);
+        struct connection *con = nodeid2con(0, GFP_NOFS);
        int bufsize = NEEDED_RMEM;
        if (!con)
@@ -1171,7 +1171,7 @@ out:
 static int tcp_listen_for_all(void)
 {
        struct socket *sock = NULL;
-        struct connection *con = nodeid2con(0, GFP_KERNEL);
+        struct connection *con = nodeid2con(0, GFP_NOFS);
        int result = -EINVAL;
        if (!con)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b128775913b2..84f70bfb0baf 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
        struct dlm_member *memb;
        int w, error;
-        memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+        memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
        if (!memb)
                return -ENOMEM;
@@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
        ls->ls_total_weight = total;
-        array = kmalloc(sizeof(int) * total, ls->ls_allocation);
+        array = kmalloc(sizeof(int) * total, GFP_NOFS);
        if (!array)
                return;
@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
                        continue;
                log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
-                memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+                memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
                if (!memb)
                        return -ENOMEM;
                memb->nodeid = rv->new[i];
@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
        int *ids = NULL, *new = NULL;
        int error, ids_count = 0, new_count = 0;
-        rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
+        rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
        if (!rv)
                return -ENOMEM;
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1775b84ebab..8e0d00db004f 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
        char *p;
-        p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
+        p = kzalloc(ls->ls_lvblen, GFP_NOFS);
        return p;
 }
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
        DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
-        r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
+        r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
        return r;
 }
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb;
-        lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
+        lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
        return lkb;
 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 55ea369f43a9..052095cd592f 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
        struct sk_buff *skb;
        void *data;
-        skb = genlmsg_new(size, GFP_KERNEL);
+        skb = genlmsg_new(size, GFP_NOFS);
        if (!skb)
                return -ENOMEM;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e26c07..b5f89aef3b29 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
        if (!ls)
                return -EINVAL;
-        xop = kzalloc(sizeof(*xop), GFP_KERNEL);
+        xop = kzalloc(sizeof(*xop), GFP_NOFS);
        if (!xop) {
                rv = -ENOMEM;
                goto out;
@@ -143,7 +143,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(dlm_posix_lock);
-/* Returns failure iff a succesful lock operation should be canceled */
+/* Returns failure iff a successful lock operation should be canceled */
 static int dlm_plock_callback(struct plock_op *op)
 {
        struct file *file;
@@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
        if (!ls)
                return -EINVAL;
-        op = kzalloc(sizeof(*op), GFP_KERNEL);
+        op = kzalloc(sizeof(*op), GFP_NOFS);
        if (!op) {
                rv = -ENOMEM;
                goto out;
@@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
        if (!ls)
                return -EINVAL;
-        op = kzalloc(sizeof(*op), GFP_KERNEL);
+        op = kzalloc(sizeof(*op), GFP_NOFS);
        if (!op) {
                rv = -ENOMEM;
                goto out;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 67522c268c14..3c83a49a48a3 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
        char *mb;
        int mb_len = sizeof(struct dlm_rcom) + len;
-        mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+        mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
        if (!mh) {
                log_print("create_rcom to %d type %d len %d ENOBUFS",
                          to_nodeid, type, len);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7a2307c08911..a44fa22890e1 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
        struct rq_entry *e;
        int length = ms->m_header.h_length - sizeof(struct dlm_message);
-        e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
+        e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
        if (!e) {
                log_print("dlm_add_requestqueue: out of memory len %d", length);
                return;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebce994ab0b7..e73a4bb572aa 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -267,7 +267,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
                goto out;
        }
-        ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+        ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
        if (!ua)
                goto out;
        ua->proc = proc;
@@ -307,7 +307,7 @@ static int device_user_unlock(struct dlm_user_proc *proc,
        if (!ls)
                return -ENOENT;
-        ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+        ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
        if (!ua)
                goto out;
        ua->proc = proc;
@@ -352,7 +352,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
        error = -ENOMEM;
        len = strlen(name) + strlen(name_prefix) + 2;
-        ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+        ls->ls_device.name = kzalloc(len, GFP_NOFS);
        if (!ls->ls_device.name)
                goto fail;
@@ -520,7 +520,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 #endif
                return -EINVAL;
-        kbuf = kzalloc(count + 1, GFP_KERNEL);
+        kbuf = kzalloc(count + 1, GFP_NOFS);
        if (!kbuf)
                return -ENOMEM;
@@ -546,7 +546,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
                /* add 1 after namelen so that the name string is terminated */
                kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
-                               GFP_KERNEL);
+                               GFP_NOFS);
                if (!kbuf) {
                        kfree(k32buf);
                        return -ENOMEM;
@@ -648,7 +648,7 @@ static int device_open(struct inode *inode, struct file *file)
        if (!ls)
                return -ENOENT;
-        proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
+        proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
        if (!proc) {
                dlm_put_lockspace(ls);
                return -ENOMEM;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c063420..366c503f9657 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -251,10 +251,10 @@ ctl_table epoll_table[] = {
                .data           = &max_user_watches,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
        },
-        { .ctl_name = 0 }
+        { }
 };
 #endif /* CONFIG_SYSCTL */
diff --git a/fs/exec.c b/fs/exec.c
index ba112bd4a339..c0c636e34f60 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,6 @@
 #include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
@@ -1209,9 +1208,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
-        retval = ima_bprm_check(bprm);
-        if (retval)
-                return retval;
        /* kernel module loader fixup */
        /* so we don't try to load run modprobe in kernel space. */
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index cc2d22db119c..2d0f757fda3e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
-exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index c6718e4817fe..b1b178e61718 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -49,6 +49,7 @@
 #define EXOFS_MIN_PID   0x10000 /* Smallest partition ID */
 #define EXOFS_OBJ_OFF   0x10000 /* offset for objects */
 #define EXOFS_SUPER_ID  0x10000 /* object ID for on-disk superblock */
+#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
 #define EXOFS_ROOT_ID   0x10002 /* object ID for root directory */
 /* exofs Application specific page/attribute */
@@ -78,17 +79,67 @@ enum {
 #define EXOFS_SUPER_MAGIC       0x5DF5
 /*
- * The file system control block - stored in an object's data (mainly, the one
+ * The file system control block - stored in object EXOFS_SUPER_ID's data.
- * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
+ * This is where the in-memory superblock is stored on disk.
- * on disk.  Right now it just has a magic value, which is basically a sanity
- * check on our ability to communicate with the object store.
 */
+enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
 struct exofs_fscb {
        __le64  s_nextid;       /* Highest object ID used */
-        __le32  s_numfiles;     /* Number of files on fs */
+        __le64  s_numfiles;     /* Number of files on fs */
+        __le32  s_version;      /* == EXOFS_FSCB_VER */
        __le16  s_magic;        /* Magic signature */
        __le16  s_newfs;        /* Non-zero if this is a new fs */
-};
+        /* From here on it's a static part, only written by mkexofs */
+        __le64  s_dev_table_oid;   /* Reserved, not used */
+        __le64  s_dev_table_count; /* == 0 means no dev_table */
+} __packed;
+/*
+ * Describes the raid used in the FS. It is part of the device table.
+ * This here is taken from the pNFS-objects definition. In exofs we
+ * use one raid policy throughout the filesystem. (NOTE: the funny
+ * alignment at the beginning. We take care of it in exofs_device_table.)
+ */
+struct exofs_dt_data_map {
+        __le32  cb_num_comps;
+        __le64  cb_stripe_unit;
+        __le32  cb_group_width;
+        __le32  cb_group_depth;
+        __le32  cb_mirror_cnt;
+        __le32  cb_raid_algorithm;
+} __packed;
+/*
+ * This is an osd device information descriptor. It is a single entry in
+ * the exofs device table. It describes an osd target lun which
+ * contains data belonging to this FS. (Same partition_id on all devices)
+ */
+struct exofs_dt_device_info {
+        __le32  systemid_len;
+        u8      systemid[OSD_SYSTEMID_LEN];
+        __le64  long_name_offset;       /* If !0 then offset-in-file */
+        __le32  osdname_len;            /* */
+        u8      osdname[44];            /* Embedded, usually an ASCII uuid */
+} __packed;
+/*
+ * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
+ * It contains the raid used for this multi-device FS and an array of
+ * participating devices.
+ */
+struct exofs_device_table {
+        __le32                          dt_version;     /* == EXOFS_DT_VER */
+        struct exofs_dt_data_map        dt_data_map;    /* Raid policy to use */
+        /* Reserved space for future use. Total including this:
+         * (8 * sizeof(le64))
+         */
+        __le64                          __Resurved[4];
+        __le64                          dt_num_devices; /* Array size */
+        struct exofs_dt_device_info     dt_dev_table[]; /* Array of devices */
+} __packed;
 /****************************************************************************
 * inode-related things
@@ -155,22 +206,4 @@ enum {
        (((name_len) + offsetof(struct exofs_dir_entry, name)  + \
          EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
-/*************************
- * function declarations *
- *************************/
-/* osd.c                 */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
-                           const struct osd_obj_id *obj);
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
-static inline int exofs_check_ok(struct osd_request *or)
-{
-        return exofs_check_ok_resid(or, NULL, NULL);
-}
-int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
-int exofs_async_op(struct osd_request *or,
-        osd_req_done_fn *async_done, void *caller_context, u8 *cred);
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 5ec72e020b22..c35fd4623986 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -30,13 +30,17 @@
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
 #include <linux/fs.h>
 #include <linux/time.h>
 #include "common.h"
-#ifndef __EXOFS_H__
+/* FIXME: Remove once pnfs hits mainline
-#define __EXOFS_H__
+ * #include <linux/exportfs/pnfs_osd_xdr.h>
+ */
+#include "pnfs.h"
 #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
@@ -55,7 +59,7 @@
 * our extension to the in-memory superblock
 */
 struct exofs_sb_info {
-        struct osd_dev  *s_dev;                 /* returned by get_osd_dev    */
+        struct exofs_fscb s_fscb;               /* Written often, pre-allocate*/
        osd_id          s_pid;                  /* partition ID of file system*/
        int             s_timeout;              /* timeout for OSD operations */
        uint64_t        s_nextid;               /* highest object ID used     */
@@ -63,7 +67,11 @@ struct exofs_sb_info {
        spinlock_t      s_next_gen_lock;        /* spinlock for gen # update  */
        u32             s_next_generation;      /* next gen # to use          */
        atomic_t        s_curr_pending;         /* number of pending commands */
-        uint8_t         s_cred[OSD_CAP_LEN];    /* all-powerful credential    */
+        uint8_t         s_cred[OSD_CAP_LEN];    /* credential for the fscb    */
+        struct pnfs_osd_data_map data_map;      /* Default raid to use        */
+        unsigned        s_numdevs;              /* Num of devices in array    */
+        struct osd_dev  *s_ods[1];              /* Variable length, minimum 1 */
 };
 /*
@@ -79,6 +87,50 @@ struct exofs_i_info {
        struct inode   vfs_inode;          /* normal in-memory inode          */
 };
+static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
+{
+        return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
+}
+struct exofs_io_state;
+typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
+struct exofs_io_state {
+        struct kref             kref;
+        void                    *private;
+        exofs_io_done_fn        done;
+        struct exofs_sb_info    *sbi;
+        struct osd_obj_id       obj;
+        u8                      *cred;
+        /* Global read/write IO*/
+        loff_t                  offset;
+        unsigned long           length;
+        void                    *kern_buff;
+        struct bio              *bio;
+        /* Attributes */
+        unsigned                in_attr_len;
+        struct osd_attr         *in_attr;
+        unsigned                out_attr_len;
+        struct osd_attr         *out_attr;
+        /* Variable array of size numdevs */
+        unsigned numdevs;
+        struct exofs_per_dev_state {
+                struct osd_request *or;
+                struct bio *bio;
+        } per_dev[];
+};
+static inline unsigned exofs_io_state_size(unsigned numdevs)
+{
+        return sizeof(struct exofs_io_state) +
+                sizeof(struct exofs_per_dev_state) * numdevs;
+}
 /*
 * our inode flags
 */
@@ -130,6 +182,42 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 /*************************
 * function declarations *
 *************************/
+/* ios.c */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+                           const struct osd_obj_id *obj);
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+                    u64 offset, void *p, unsigned length);
+int  exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
+void exofs_put_io_state(struct exofs_io_state *ios);
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
+int exofs_sbi_create(struct exofs_io_state *ios);
+int exofs_sbi_remove(struct exofs_io_state *ios);
+int exofs_sbi_write(struct exofs_io_state *ios);
+int exofs_sbi_read(struct exofs_io_state *ios);
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
+static inline int exofs_oi_write(struct exofs_i_info *oi,
+                                 struct exofs_io_state *ios)
+{
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->cred = oi->i_cred;
+        return exofs_sbi_write(ios);
+}
+static inline int exofs_oi_read(struct exofs_i_info *oi,
+                                struct exofs_io_state *ios)
+{
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->cred = oi->i_cred;
+        return exofs_sbi_read(ios);
+}
 /* inode.c               */
 void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
@@ -169,6 +257,7 @@ extern const struct file_operations exofs_file_operations;
 /* inode.c           */
 extern const struct address_space_operations exofs_aops;
+extern const struct osd_attr g_attr_logical_length;
 /* namei.c           */
 extern const struct inode_operations exofs_dir_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6c10f7476699..698a8636d39c 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,18 @@
 #include "exofs.h"
-#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG2(M...) do {} while (0)
-#  define EXOFS_DEBUG_OBJ_ISIZE 1
-#endif
+enum { BIO_MAX_PAGES_KMALLOC =
+                (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
 struct page_collect {
        struct exofs_sb_info *sbi;
        struct request_queue *req_q;
        struct inode *inode;
        unsigned expected_pages;
+        struct exofs_io_state *ios;
        struct bio *bio;
        unsigned nr_pages;
@@ -54,22 +57,23 @@ struct page_collect {
 };
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
-                struct inode *inode)
+                       struct inode *inode)
 {
        struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
        pcol->sbi = sbi;
-        pcol->req_q = osd_request_queue(sbi->s_dev);
+        /* Create master bios on first Q, later on cloning, each clone will be
+         * allocated on its destination Q
+         */
+        pcol->req_q = osd_request_queue(sbi->s_ods[0]);
        pcol->inode = inode;
        pcol->expected_pages = expected_pages;
+        pcol->ios = NULL;
        pcol->bio = NULL;
        pcol->nr_pages = 0;
        pcol->length = 0;
        pcol->pg_first = -1;
-        EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
-                     expected_pages);
 }
 static void _pcol_reset(struct page_collect *pcol)
@@ -80,35 +84,49 @@ static void _pcol_reset(struct page_collect *pcol)
        pcol->nr_pages = 0;
        pcol->length = 0;
        pcol->pg_first = -1;
-        EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
+        pcol->ios = NULL;
-                     pcol->inode->i_ino, pcol->expected_pages);
        /* this is probably the end of the loop but in writes
         * it might not end here. don't be left with nothing
         */
        if (!pcol->expected_pages)
-                pcol->expected_pages = 128;
+                pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
 }
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-        int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+        int pages = min_t(unsigned, pcol->expected_pages,
+                          BIO_MAX_PAGES_KMALLOC);
+        if (!pcol->ios) { /* First time allocate io_state */
+                int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
+                if (ret)
+                        return ret;
+        }
        for (; pages; pages >>= 1) {
-                pcol->bio = bio_alloc(GFP_KERNEL, pages);
+                pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
                if (likely(pcol->bio))
                        return 0;
        }
-        EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
+        EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
                  pcol->expected_pages);
        return -ENOMEM;
 }
 static void pcol_free(struct page_collect *pcol)
 {
-        bio_put(pcol->bio);
+        if (pcol->bio) {
-        pcol->bio = NULL;
+                bio_put(pcol->bio);
+                pcol->bio = NULL;
+        }
+        if (pcol->ios) {
+                exofs_put_io_state(pcol->ios);
+                pcol->ios = NULL;
+        }
 }
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
@@ -161,22 +179,17 @@ static void update_write_page(struct page *page, int ret)
 /* Called at the end of reads, to optionally unlock pages and update their
 * status.
 */
-static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
+static int __readpages_done(struct page_collect *pcol, bool do_unlock)
-                            bool do_unlock)
 {
        struct bio_vec *bvec;
        int i;
        u64 resid;
        u64 good_bytes;
        u64 length = 0;
-        int ret = exofs_check_ok_resid(or, &resid, NULL);
+        int ret = exofs_check_io(pcol->ios, &resid);
-        osd_end_request(or);
        if (likely(!ret))
                good_bytes = pcol->length;
-        else if (!resid)
-                good_bytes = 0;
        else
                good_bytes = pcol->length - resid;
@@ -198,7 +211,7 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
                else
                        page_stat = ret;
-                EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+                EXOFS_DBGMSG2("    readpages_done(0x%lx, 0x%lx) %s\n",
                          inode->i_ino, page->index,
                          page_stat ? "bad_bytes" : "good_bytes");
@@ -214,13 +227,13 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
 }
 /* callback of async reads */
-static void readpages_done(struct osd_request *or, void *p)
+static void readpages_done(struct exofs_io_state *ios, void *p)
 {
        struct page_collect *pcol = p;
-        __readpages_done(or, pcol, true);
+        __readpages_done(pcol, true);
        atomic_dec(&pcol->sbi->s_curr_pending);
-        kfree(p);
+        kfree(pcol);
 }
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
@@ -238,17 +251,13 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
                unlock_page(page);
        }
-        pcol_free(pcol);
 }
 static int read_exec(struct page_collect *pcol, bool is_sync)
 {
        struct exofs_i_info *oi = exofs_i(pcol->inode);
-        struct osd_obj_id obj = {pcol->sbi->s_pid,
+        struct exofs_io_state *ios = pcol->ios;
-                                        pcol->inode->i_ino + EXOFS_OBJ_OFF};
-        struct osd_request *or = NULL;
        struct page_collect *pcol_copy = NULL;
-        loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
        int ret;
        if (!pcol->bio)
@@ -257,17 +266,13 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        /* see comment in _readpage() about sync reads */
        WARN_ON(is_sync && (pcol->nr_pages != 1));
-        or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+        ios->bio = pcol->bio;
-        if (unlikely(!or)) {
+        ios->length = pcol->length;
-                ret = -ENOMEM;
+        ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
-                goto err;
-        }
-        osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
        if (is_sync) {
-                exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
+                exofs_oi_read(oi, pcol->ios);
-                return __readpages_done(or, pcol, false);
+                return __readpages_done(pcol, false);
        }
        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -277,14 +282,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
        }
        *pcol_copy = *pcol;
-        ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+        ios->done = readpages_done;
+        ios->private = pcol_copy;
+        ret = exofs_oi_read(oi, ios);
        if (unlikely(ret))
                goto err;
        atomic_inc(&pcol->sbi->s_curr_pending);
        EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-                  obj.id, _LLU(i_start), pcol->length);
+                  ios->obj.id, _LLU(ios->offset), pcol->length);
        /* pages ownership was passed to pcol_copy */
        _pcol_reset(pcol);
@@ -293,12 +300,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 err:
        if (!is_sync)
                _unlock_pcol_pages(pcol, ret, READ);
-        else /* Pages unlocked by caller in sync mode only free bio */
-                pcol_free(pcol);
+        pcol_free(pcol);
        kfree(pcol_copy);
-        if (or)
-                osd_end_request(or);
        return ret;
 }
@@ -370,12 +375,12 @@ try_again:
        if (len != PAGE_CACHE_SIZE)
                zero_user(page, len, PAGE_CACHE_SIZE - len);
-        EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+        EXOFS_DBGMSG2("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
                     inode->i_ino, page->index, len);
        ret = pcol_add_page(pcol, page, len);
        if (ret) {
-                EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+                EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
                          "this_len=0x%zx nr_pages=%u length=0x%lx\n",
                          page, len, pcol->nr_pages, pcol->length);
@@ -419,9 +424,8 @@ static int _readpage(struct page *page, bool is_sync)
        _pcol_init(&pcol, 1, page->mapping->host);
-        /* readpage_strip might call read_exec(,async) inside at several places
+        /* readpage_strip might call read_exec(,is_sync==false) at several
-         * but this is safe for is_async=0 since read_exec will not do anything
+         * places but not if we have a single page.
-         * when we have a single page.
         */
        ret = readpage_strip(&pcol, page);
        if (ret) {
@@ -440,8 +444,8 @@ static int exofs_readpage(struct file *file, struct page *page)
        return _readpage(page, false);
 }
-/* Callback for osd_write. All writes are asynchronouse */
+/* Callback for osd_write. All writes are asynchronous */
-static void writepages_done(struct osd_request *or, void *p)
+static void writepages_done(struct exofs_io_state *ios, void *p)
 {
        struct page_collect *pcol = p;
        struct bio_vec *bvec;
@@ -449,16 +453,12 @@ static void writepages_done(struct osd_request *or, void *p)
        u64 resid;
        u64  good_bytes;
        u64  length = 0;
+        int ret = exofs_check_io(ios, &resid);
-        int ret = exofs_check_ok_resid(or, NULL, &resid);
-        osd_end_request(or);
        atomic_dec(&pcol->sbi->s_curr_pending);
        if (likely(!ret))
                good_bytes = pcol->length;
-        else if (!resid)
-                good_bytes = 0;
        else
                good_bytes = pcol->length - resid;
@@ -482,7 +482,7 @@ static void writepages_done(struct osd_request *or, void *p)
                update_write_page(page, page_stat);
                unlock_page(page);
-                EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+                EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
                             inode->i_ino, page->index, page_stat);
                length += bvec->bv_len;
@@ -496,23 +496,13 @@ static void writepages_done(struct osd_request *or, void *p)
 static int write_exec(struct page_collect *pcol)
 {
        struct exofs_i_info *oi = exofs_i(pcol->inode);
-        struct osd_obj_id obj = {pcol->sbi->s_pid,
+        struct exofs_io_state *ios = pcol->ios;
-                                        pcol->inode->i_ino + EXOFS_OBJ_OFF};
-        struct osd_request *or = NULL;
        struct page_collect *pcol_copy = NULL;
-        loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
        int ret;
        if (!pcol->bio)
                return 0;
-        or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
-        if (unlikely(!or)) {
-                EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
-                ret = -ENOMEM;
-                goto err;
-        }
        pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
        if (!pcol_copy) {
                EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
@@ -523,16 +513,22 @@ static int write_exec(struct page_collect *pcol)
        *pcol_copy = *pcol;
        pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-        osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
-        ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+        ios->bio = pcol_copy->bio;
+        ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
+        ios->length = pcol_copy->length;
+        ios->done = writepages_done;
+        ios->private = pcol_copy;
+        ret = exofs_oi_write(oi, ios);
        if (unlikely(ret)) {
-                EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
+                EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
                goto err;
        }
        atomic_inc(&pcol->sbi->s_curr_pending);
        EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-                  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+                  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
                  pcol->length);
        /* pages ownership was passed to pcol_copy */
        _pcol_reset(pcol);
@@ -540,9 +536,9 @@ static int write_exec(struct page_collect *pcol)
 err:
        _unlock_pcol_pages(pcol, ret, WRITE);
+        pcol_free(pcol);
        kfree(pcol_copy);
-        if (or)
-                osd_end_request(or);
        return ret;
 }
@@ -586,6 +582,9 @@ static int writepage_strip(struct page *page,
                        if (PageError(page))
                                ClearPageError(page);
                        unlock_page(page);
+                        EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
+                                     "outside the limits\n",
+                                     inode->i_ino, page->index);
                        return 0;
                }
        }
@@ -600,6 +599,9 @@ try_again:
                ret = write_exec(pcol);
                if (unlikely(ret))
                        goto fail;
+                EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
+                             inode->i_ino, page->index);
                goto try_again;
        }
@@ -609,7 +611,7 @@ try_again:
                        goto fail;
        }
-        EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+        EXOFS_DBGMSG2("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
                     inode->i_ino, page->index, len);
        ret = pcol_add_page(pcol, page, len);
@@ -634,6 +636,8 @@ try_again:
        return 0;
 fail:
+        EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
+                     inode->i_ino, page->index, ret);
        set_bit(AS_EIO, &page->mapping->flags);
        unlock_page(page);
        return ret;
@@ -652,14 +656,17 @@ static int exofs_writepages(struct address_space *mapping,
                        wbc->range_end >> PAGE_CACHE_SHIFT;
        if (start || end)
-                expected_pages = min(end - start + 1, 32L);
+                expected_pages = end - start + 1;
        else
                expected_pages = mapping->nrpages;
-        EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
+        if (expected_pages < 32L)
-                     " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+                expected_pages = 32L;
+        EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+                     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
                     mapping->host->i_ino, wbc->range_start, wbc->range_end,
-                     mapping->nrpages, start, end);
+                     mapping->nrpages, start, end, expected_pages);
        _pcol_init(&pcol, expected_pages, mapping->host);
@@ -771,19 +778,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock,
 const struct osd_attr g_attr_logical_length = ATTR_DEF(
        OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+static int _do_truncate(struct inode *inode)
+{
+        struct exofs_i_info *oi = exofs_i(inode);
+        loff_t isize = i_size_read(inode);
+        int ret;
+        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+        ret = exofs_oi_truncate(oi, (u64)isize);
+        EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
+        return ret;
+}
 /*
 * Truncate a file to the specified size - all we have to do is set the size
 * attribute.  We make sure the object exists first.
 */
 void exofs_truncate(struct inode *inode)
 {
-        struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
        struct exofs_i_info *oi = exofs_i(inode);
-        struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
-        struct osd_request *or;
-        struct osd_attr attr;
-        loff_t isize = i_size_read(inode);
-        __be64 newsize;
        int ret;
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
@@ -793,22 +809,6 @@ void exofs_truncate(struct inode *inode)
                return;
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
-        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-        nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-        if (unlikely(!or)) {
-                EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
-                goto fail;
-        }
-        osd_req_set_attributes(or, &obj);
-        newsize = cpu_to_be64((u64)isize);
-        attr = g_attr_logical_length;
-        attr.val_ptr = &newsize;
-        osd_req_add_set_attr_list(or, &attr, 1);
        /* if we are about to truncate an object, and it hasn't been
         * created yet, wait
@@ -816,8 +816,7 @@ void exofs_truncate(struct inode *inode)
        if (unlikely(wait_obj_created(oi)))
                goto fail;
-        ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+        ret = _do_truncate(inode);
-        osd_end_request(or);
        if (ret)
                goto fail;
@@ -847,65 +846,62 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 /*
 * Read an inode from the OSD, and return it as is.  We also return the size
- * attribute in the 'sanity' argument if we got compiled with debugging turned
+ * attribute in the 'obj_size' argument.
- * on.
 */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-                    struct exofs_fcb *inode, uint64_t *sanity)
+                    struct exofs_fcb *inode, uint64_t *obj_size)
 {
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_request *or;
+        struct osd_attr attrs[2];
-        struct osd_attr attr;
+        struct exofs_io_state *ios;
-        struct osd_obj_id obj = {sbi->s_pid,
-                                 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
        int ret;
-        exofs_make_credential(oi->i_cred, &obj);
+        *obj_size = ~0;
+        ret = exofs_get_io_state(sbi, &ios);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        if (unlikely(ret)) {
-        if (unlikely(!or)) {
+                EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
-                EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
+                return ret;
-                return -ENOMEM;
        }
-        osd_req_get_attributes(or, &obj);
-        /* we need the inode attribute */
+        ios->obj.id = exofs_oi_objno(oi);
-        osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+        exofs_make_credential(oi->i_cred, &ios->obj);
+        ios->cred = oi->i_cred;
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
+        attrs[0] = g_attr_inode_data;
-        /* we get the size attributes to do a sanity check */
+        attrs[1] = g_attr_logical_length;
-        osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+        ios->in_attr = attrs;
-#endif
+        ios->in_attr_len = ARRAY_SIZE(attrs);
-        ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+        ret = exofs_sbi_read(ios);
        if (ret)
                goto out;
-        attr = g_attr_inode_data;
+        ret = extract_attr_from_ios(ios, &attrs[0]);
-        ret = extract_attr_from_req(or, &attr);
        if (ret) {
-                EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+                EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
                goto out;
        }
+        WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
+        memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
-        WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
+        ret = extract_attr_from_ios(ios, &attrs[1]);
-        memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
-        attr = g_attr_logical_length;
-        ret = extract_attr_from_req(or, &attr);
        if (ret) {
-                EXOFS_ERR("ERROR: extract attr from or failed\n");
+                EXOFS_ERR("%s: extract_attr of logical_length failed\n",
+                          __func__);
                goto out;
        }
-        *sanity = get_unaligned_be64(attr.val_ptr);
+        *obj_size = get_unaligned_be64(attrs[1].val_ptr);
-#endif
 out:
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        return ret;
 }
+static void __oi_init(struct exofs_i_info *oi)
+{
+        init_waitqueue_head(&oi->i_wq);
+        oi->i_flags = 0;
+}
 /*
 * Fill in an inode read from the OSD and set it up for use
 */
@@ -914,7 +910,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
        struct exofs_i_info *oi;
        struct exofs_fcb fcb;
        struct inode *inode;
-        uint64_t uninitialized_var(sanity);
+        uint64_t obj_size;
        int ret;
        inode = iget_locked(sb, ino);
@@ -923,13 +919,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
        if (!(inode->i_state & I_NEW))
                return inode;
        oi = exofs_i(inode);
+        __oi_init(oi);
        /* read the inode from the osd */
-        ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+        ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
        if (ret)
                goto bad_inode;
-        init_waitqueue_head(&oi->i_wq);
        set_obj_created(oi);
        /* copy stuff from on-disk struct to in-memory struct */
@@ -947,14 +943,12 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
        inode->i_blkbits = EXOFS_BLKSHIFT;
        inode->i_generation = le32_to_cpu(fcb.i_generation);
-#ifdef EXOFS_DEBUG_OBJ_ISIZE
+        if ((inode->i_size != obj_size) &&
-        if ((inode->i_size != sanity) &&
                (!exofs_inode_is_fast_symlink(inode))) {
-                EXOFS_ERR("WARNING: Size of object from inode and "
+                EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
-                          "attributes differ (%lld != %llu)\n",
+                          inode->i_size, _LLU(obj_size));
-                          inode->i_size, _LLU(sanity));
+                /* FIXME: call exofs_inode_recovery() */
        }
-#endif
        oi->i_dir_start_lookup = 0;
@@ -1020,23 +1014,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
 * set the obj_created flag so that other methods know that the object exists on
 * the OSD.
 */
-static void create_done(struct osd_request *or, void *p)
+static void create_done(struct exofs_io_state *ios, void *p)
 {
        struct inode *inode = p;
        struct exofs_i_info *oi = exofs_i(inode);
        struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
        int ret;
-        ret = exofs_check_ok(or);
+        ret = exofs_check_io(ios, NULL);
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        atomic_dec(&sbi->s_curr_pending);
        if (unlikely(ret)) {
                EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-                          _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
+                          _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
-                make_bad_inode(inode);
+                /*TODO: When the FS is corrupted, creation can fail because
-        } else
+                 * the object already exists. Get rid of this asynchronous
-                set_obj_created(oi);
+                 * creation; if the object exists, increment the obj counter
+                 * and try the next object until we succeed. All these dangling
+                 * objects will be made into lost files by chkfs.exofs
+                 */
+        }
+        set_obj_created(oi);
        atomic_dec(&inode->i_count);
        wake_up(&oi->i_wq);
@@ -1051,8 +1052,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        struct inode *inode;
        struct exofs_i_info *oi;
        struct exofs_sb_info *sbi;
-        struct osd_request *or;
+        struct exofs_io_state *ios;
-        struct osd_obj_id obj;
        int ret;
        sb = dir->i_sb;
@@ -1061,8 +1061,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
                return ERR_PTR(-ENOMEM);
        oi = exofs_i(inode);
+        __oi_init(oi);
-        init_waitqueue_head(&oi->i_wq);
        set_obj_2bcreated(oi);
        sbi = sb->s_fs_info;
@@ -1089,28 +1089,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
        mark_inode_dirty(inode);
-        obj.partition = sbi->s_pid;
+        ret = exofs_get_io_state(sbi, &ios);
-        obj.id = inode->i_ino + EXOFS_OBJ_OFF;
+        if (unlikely(ret)) {
-        exofs_make_credential(oi->i_cred, &obj);
+                EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
+                return ERR_PTR(ret);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
-        if (unlikely(!or)) {
-                EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
-                return ERR_PTR(-ENOMEM);
        }
-        osd_req_create_object(or, &obj);
+        ios->obj.id = exofs_oi_objno(oi);
+        exofs_make_credential(oi->i_cred, &ios->obj);
        /* increment the refcount so that the inode will still be around when we
         * reach the callback
         */
        atomic_inc(&inode->i_count);
-        ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+        ios->done = create_done;
+        ios->private = inode;
+        ios->cred = oi->i_cred;
+        ret = exofs_sbi_create(ios);
        if (ret) {
                atomic_dec(&inode->i_count);
-                osd_end_request(or);
+                exofs_put_io_state(ios);
-                return ERR_PTR(-EIO);
+                return ERR_PTR(ret);
        }
        atomic_inc(&sbi->s_curr_pending);
@@ -1128,11 +1128,11 @@ struct updatei_args {
 /*
 * Callback function from exofs_update_inode().
 */
-static void updatei_done(struct osd_request *or, void *p)
+static void updatei_done(struct exofs_io_state *ios, void *p)
 {
        struct updatei_args *args = p;
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        atomic_dec(&args->sbi->s_curr_pending);
@@ -1148,8 +1148,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
        struct exofs_i_info *oi = exofs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+        struct exofs_io_state *ios;
-        struct osd_request *or;
        struct osd_attr attr;
        struct exofs_fcb *fcb;
        struct updatei_args *args;
@@ -1186,18 +1185,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
        } else
                memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ret = exofs_get_io_state(sbi, &ios);
-        if (unlikely(!or)) {
+        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
+                EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
-                ret = -ENOMEM;
                goto free_args;
        }
-        osd_req_set_attributes(or, &obj);
        attr = g_attr_inode_data;
        attr.val_ptr = fcb;
-        osd_req_add_set_attr_list(or, &attr, 1);
+        ios->out_attr_len = 1;
+        ios->out_attr = &attr;
        if (!obj_created(oi)) {
                EXOFS_DBGMSG("!obj_created\n");
@@ -1206,22 +1203,19 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
                EXOFS_DBGMSG("wait_event done\n");
        }
-        if (do_sync) {
+        if (!do_sync) {
-                ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
-                osd_end_request(or);
-                goto free_args;
-        } else {
                args->sbi = sbi;
+                ios->done = updatei_done;
+                ios->private = args;
+        }
-                ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
+        ret = exofs_oi_write(oi, ios);
-                if (ret) {
+        if (!do_sync && !ret) {
-                        osd_end_request(or);
-                        goto free_args;
-                }
                atomic_inc(&sbi->s_curr_pending);
                goto out; /* deallocation in updatei_done */
        }
+        exofs_put_io_state(ios);
 free_args:
        kfree(args);
 out:
@@ -1238,11 +1232,12 @@ int exofs_write_inode(struct inode *inode, int wait)
 * Callback function from exofs_delete_inode() - don't have much cleaning up to
 * do.
 */
-static void delete_done(struct osd_request *or, void *p)
+static void delete_done(struct exofs_io_state *ios, void *p)
 {
-        struct exofs_sb_info *sbi;
+        struct exofs_sb_info *sbi = p;
-        osd_end_request(or);
-        sbi = p;
+        exofs_put_io_state(ios);
        atomic_dec(&sbi->s_curr_pending);
 }
@@ -1256,8 +1251,7 @@ void exofs_delete_inode(struct inode *inode)
        struct exofs_i_info *oi = exofs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+        struct exofs_io_state *ios;
-        struct osd_request *or;
        int ret;
        truncate_inode_pages(&inode->i_data, 0);
@@ -1274,25 +1268,26 @@ void exofs_delete_inode(struct inode *inode)
        clear_inode(inode);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ret = exofs_get_io_state(sbi, &ios);
-        if (unlikely(!or)) {
+        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+                EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
                return;
        }
-        osd_req_remove_object(or, &obj);
        /* if we are deleting an obj that hasn't been created yet, wait */
        if (!obj_created(oi)) {
                BUG_ON(!obj_2bcreated(oi));
                wait_event(oi->i_wq, obj_created(oi));
        }
-        ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->done = delete_done;
+        ios->private = sbi;
+        ios->cred = oi->i_cred;
+        ret = exofs_sbi_remove(ios);
        if (ret) {
-                EXOFS_ERR(
+                EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
-                       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
+                exofs_put_io_state(ios);
-                osd_end_request(or);
                return;
        }
        atomic_inc(&sbi->s_curr_pending);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
new file mode 100644
index 000000000000..5bad01fa1f9f
--- /dev/null
+++ b/fs/exofs/ios.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <scsi/scsi_device.h>
+#include "exofs.h"
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+        osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+                    u64 offset, void *p, unsigned length)
+{
+        struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+/*      struct osd_sense_info osi = {.key = 0};*/
+        int ret;
+        if (unlikely(!or)) {
+                EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
+                return -ENOMEM;
+        }
+        ret = osd_req_read_kern(or, obj, offset, p, length);
+        if (unlikely(ret)) {
+                EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
+                goto out;
+        }
+        ret = osd_finalize_request(or, 0, cred, NULL);
+        if (unlikely(ret)) {
+                EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+                goto out;
+        }
+        ret = osd_execute_request(or);
+        if (unlikely(ret))
+                EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+        /* osd_req_decode_sense(or, ret); */
+out:
+        osd_end_request(or);
+        return ret;
+}
+int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+{
+        struct exofs_io_state *ios;
+        /*TODO: Maybe use kmem_cache per sbi of size
+         * exofs_io_state_size(sbi->s_numdevs)
+         */
+        ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+        if (unlikely(!ios)) {
+                *pios = NULL;
+                return -ENOMEM;
+        }
+        ios->sbi = sbi;
+        ios->obj.partition = sbi->s_pid;
+        *pios = ios;
+        return 0;
+}
+void exofs_put_io_state(struct exofs_io_state *ios)
+{
+        if (ios) {
+                unsigned i;
+                for (i = 0; i < ios->numdevs; i++) {
+                        struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
+                        if (per_dev->or)
+                                osd_end_request(per_dev->or);
+                        if (per_dev->bio)
+                                bio_put(per_dev->bio);
+                }
+                kfree(ios);
+        }
+}
+static void _sync_done(struct exofs_io_state *ios, void *p)
+{
+        struct completion *waiting = p;
+        complete(waiting);
+}
+static void _last_io(struct kref *kref)
+{
+        struct exofs_io_state *ios = container_of(
+                                        kref, struct exofs_io_state, kref);
+        ios->done(ios, ios->private);
+}
+static void _done_io(struct osd_request *or, void *p)
+{
+        struct exofs_io_state *ios = p;
+        kref_put(&ios->kref, _last_io);
+}
+static int exofs_io_execute(struct exofs_io_state *ios)
+{
+        DECLARE_COMPLETION_ONSTACK(wait);
+        bool sync = (ios->done == NULL);
+        int i, ret;
+        if (sync) {
+                ios->done = _sync_done;
+                ios->private = &wait;
+        }
+        for (i = 0; i < ios->numdevs; i++) {
+                struct osd_request *or = ios->per_dev[i].or;
+                if (unlikely(!or))
+                        continue;
+                ret = osd_finalize_request(or, 0, ios->cred, NULL);
+                if (unlikely(ret)) {
+                        EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
+                                     ret);
+                        return ret;
+                }
+        }
+        kref_init(&ios->kref);
+        for (i = 0; i < ios->numdevs; i++) {
+                struct osd_request *or = ios->per_dev[i].or;
+                if (unlikely(!or))
+                        continue;
+                kref_get(&ios->kref);
+                osd_execute_request_async(or, _done_io, ios);
+        }
+        kref_put(&ios->kref, _last_io);
+        ret = 0;
+        if (sync) {
+                wait_for_completion(&wait);
+                ret = exofs_check_io(ios, NULL);
+        }
+        return ret;
+}
+int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
+{
+        enum osd_err_priority acumulated_osd_err = 0;
+        int acumulated_lin_err = 0;
+        int i;
+        for (i = 0; i < ios->numdevs; i++) {
+                struct osd_sense_info osi;
+                int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
+                if (likely(!ret))
+                        continue;
+                if (unlikely(ret == -EFAULT)) {
+                        EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
+                        /*FIXME: All the pages in this device range should:
+                         *      clear_highpage(page);
+                         */
+                }
+                if (osi.osd_err_pri >= acumulated_osd_err) {
+                        acumulated_osd_err = osi.osd_err_pri;
+                        acumulated_lin_err = ret;
+                }
+        }
+        /* TODO: raid specific residual calculations */
+        if (resid) {
+                if (likely(!acumulated_lin_err))
+                        *resid = 0;
+                else
+                        *resid = ios->length;
+        }
+        return acumulated_lin_err;
+}
+int exofs_sbi_create(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < ios->sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                osd_req_create_object(or, &ios->obj);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int exofs_sbi_remove(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < ios->sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                osd_req_remove_object(or, &ios->obj);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int exofs_sbi_write(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < ios->sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                if (ios->bio) {
+                        struct bio *bio;
+                        if (i != 0) {
+                                bio = bio_kmalloc(GFP_KERNEL,
+                                                  ios->bio->bi_max_vecs);
+                                if (unlikely(!bio)) {
+                                        ret = -ENOMEM;
+                                        goto out;
+                                }
+                                __bio_clone(bio, ios->bio);
+                                bio->bi_bdev = NULL;
+                                bio->bi_next = NULL;
+                                ios->per_dev[i].bio =  bio;
+                        } else {
+                                bio = ios->bio;
+                        }
+                        osd_req_write(or, &ios->obj, ios->offset, bio,
+                                      ios->length);
+/*                      EXOFS_DBGMSG("write sync=%d\n", sync);*/
+                } else if (ios->kern_buff) {
+                        osd_req_write_kern(or, &ios->obj, ios->offset,
+                                           ios->kern_buff, ios->length);
+/*                      EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
+                } else {
+                        osd_req_set_attributes(or, &ios->obj);
+/*                      EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
+                }
+                if (ios->out_attr)
+                        osd_req_add_set_attr_list(or, ios->out_attr,
+                                                  ios->out_attr_len);
+                if (ios->in_attr)
+                        osd_req_add_get_attr_list(or, ios->in_attr,
+                                                  ios->in_attr_len);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+        int i, ret;
+        for (i = 0; i < 1; i++) {
+                struct osd_request *or;
+                unsigned first_dev = (unsigned)ios->obj.id;
+                first_dev %= ios->sbi->s_numdevs;
+                or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                if (ios->bio) {
+                        osd_req_read(or, &ios->obj, ios->offset, ios->bio,
+                                     ios->length);
+/*                      EXOFS_DBGMSG("read sync=%d\n", sync);*/
+                } else if (ios->kern_buff) {
+                        osd_req_read_kern(or, &ios->obj, ios->offset,
+                                           ios->kern_buff, ios->length);
+/*                      EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
+                } else {
+                        osd_req_get_attributes(or, &ios->obj);
+/*                      EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
+                }
+                if (ios->out_attr)
+                        osd_req_add_set_attr_list(or, ios->out_attr,
+                                                  ios->out_attr_len);
+                if (ios->in_attr)
+                        osd_req_add_get_attr_list(or, ios->in_attr,
+                                                  ios->in_attr_len);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        return ret;
+}
+int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
+{
+        struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+        void *iter = NULL;
+        int nelem;
+        do {
+                nelem = 1;
+                osd_req_decode_get_attr_list(ios->per_dev[0].or,
+                                             &cur_attr, &nelem, &iter);
+                if ((cur_attr.attr_page == attr->attr_page) &&
+                    (cur_attr.attr_id == attr->attr_id)) {
+                        attr->len = cur_attr.len;
+                        attr->val_ptr = cur_attr.val_ptr;
+                        return 0;
+                }
+        } while (iter);
+        return -EIO;
+}
+int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
+{
+        struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
+        struct exofs_io_state *ios;
+        struct osd_attr attr;
+        __be64 newsize;
+        int i, ret;
+        if (exofs_get_io_state(sbi, &ios))
+                return -ENOMEM;
+        ios->obj.id = exofs_oi_objno(oi);
+        ios->cred = oi->i_cred;
+        newsize = cpu_to_be64(size);
+        attr = g_attr_logical_length;
+        attr.val_ptr = &newsize;
+        for (i = 0; i < sbi->s_numdevs; i++) {
+                struct osd_request *or;
+                or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
+                if (unlikely(!or)) {
+                        EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ios->per_dev[i].or = or;
+                ios->numdevs++;
+                osd_req_set_attributes(or, &ios->obj);
+                osd_req_add_set_attr_list(or, &attr, 1);
+        }
+        ret = exofs_io_execute(ios);
+out:
+        exofs_put_io_state(ios);
+        return ret;
+}
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
deleted file mode 100644
index 4372542df284..000000000000
--- a/fs/exofs/osd.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <scsi/scsi_device.h>
-#include <scsi/osd_sense.h>
-#include "exofs.h"
-int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
-{
-        struct osd_sense_info osi;
-        int ret = osd_req_decode_sense(or, &osi);
-        if (ret) { /* translate to Linux codes */
-                if (osi.additional_code == scsi_invalid_field_in_cdb) {
-                        if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
-                                ret = -EFAULT;
-                        if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
-                                ret = -ENOENT;
-                        else
-                                ret = -EINVAL;
-                } else if (osi.additional_code == osd_quota_error)
-                        ret = -ENOSPC;
-                else
-                        ret = -EIO;
-        }
-        /* FIXME: should be include in osd_sense_info */
-        if (in_resid)
-                *in_resid = or->in.req ? or->in.req->resid_len : 0;
-        if (out_resid)
-                *out_resid = or->out.req ? or->out.req->resid_len : 0;
-        return ret;
-}
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
-        osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-/*
- * Perform a synchronous OSD operation.
- */
-int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
-{
-        int ret;
-        or->timeout = timeout;
-        ret = osd_finalize_request(or, 0, credential, NULL);
-        if (ret) {
-                EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-                return ret;
-        }
-        ret = osd_execute_request(or);
-        if (ret)
-                EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
-        /* osd_req_decode_sense(or, ret); */
-        return ret;
-}
-/*
- * Perform an asynchronous OSD operation.
- */
-int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
-                   void *caller_context, u8 *cred)
-{
-        int ret;
-        ret = osd_finalize_request(or, 0, cred, NULL);
-        if (ret) {
-                EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
-                return ret;
-        }
-        ret = osd_execute_request_async(or, async_done, caller_context);
-        if (ret)
-                EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
-        return ret;
-}
-int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
-{
-        struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
-        void *iter = NULL;
-        int nelem;
-        do {
-                nelem = 1;
-                osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
-                if ((cur_attr.attr_page == attr->attr_page) &&
-                    (cur_attr.attr_id == attr->attr_id)) {
-                        attr->len = cur_attr.len;
-                        attr->val_ptr = cur_attr.val_ptr;
-                        return 0;
-                }
-        } while (iter);
-        return -EIO;
-}
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
new file mode 100644
index 000000000000..423033addd1f
--- /dev/null
+++ b/fs/exofs/pnfs.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License  version 2 as published by the Free
+ * Software Foundation.
+ *
+ */
+/* FIXME: Remove this file once pnfs hits mainline */
+#ifndef __EXOFS_PNFS_H__
+#define __EXOFS_PNFS_H__
+#if defined(CONFIG_PNFS)
+/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
+#include "../nfs/objlayout/pnfs_osd_xdr.h"
+#else /* defined(CONFIG_PNFS) */
+enum pnfs_iomode {
+        IOMODE_READ = 1,
+        IOMODE_RW = 2,
+        IOMODE_ANY = 3,
+};
+/* Layout Structure */
+enum pnfs_osd_raid_algorithm4 {
+        PNFS_OSD_RAID_0         = 1,
+        PNFS_OSD_RAID_4         = 2,
+        PNFS_OSD_RAID_5         = 3,
+        PNFS_OSD_RAID_PQ        = 4     /* Reed-Solomon P+Q */
+};
+struct pnfs_osd_data_map {
+        u32     odm_num_comps;
+        u64     odm_stripe_unit;
+        u32     odm_group_width;
+        u32     odm_group_depth;
+        u32     odm_mirror_cnt;
+        u32     odm_raid_algorithm;
+};
+#endif /* else defined(CONFIG_PNFS) */
+#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f500dec3b59..a1d1e77b12eb 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -203,49 +203,45 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 {
        struct exofs_sb_info *sbi;
        struct exofs_fscb *fscb;
-        struct osd_request *or;
+        struct exofs_io_state *ios;
-        struct osd_obj_id obj;
        int ret = -ENOMEM;
-        fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
-        if (!fscb) {
-                EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-                return -ENOMEM;
-        }
        lock_super(sb);
        sbi = sb->s_fs_info;
+        fscb = &sbi->s_fscb;
+        ret = exofs_get_io_state(sbi, &ios);
+        if (ret)
+                goto out;
+        /* Note: We only write the changing part of the fscb, i.e. up to
+         *       the fscb->s_dev_table_oid member. There is no read-modify-write
+         *       here.
+         */
+        ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
+        memset(fscb, 0, ios->length);
        fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
        fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
        fscb->s_magic = cpu_to_le16(sb->s_magic);
        fscb->s_newfs = 0;
+        fscb->s_version = EXOFS_FSCB_VER;
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ios->obj.id = EXOFS_SUPER_ID;
-        if (unlikely(!or)) {
+        ios->offset = 0;
-                EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
+        ios->kern_buff = fscb;
-                goto out;
+        ios->cred = sbi->s_cred;
-        }
-        obj.partition = sbi->s_pid;
+        ret = exofs_sbi_write(ios);
-        obj.id = EXOFS_SUPER_ID;
-        ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
+                EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
-                goto out;
-        }
-        ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-        if (unlikely(ret)) {
-                EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
                goto out;
        }
        sb->s_dirt = 0;
 out:
-        if (or)
+        EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
-                osd_end_request(or);
+        exofs_put_io_state(ios);
        unlock_super(sb);
-        kfree(fscb);
        return ret;
 }
@@ -257,6 +253,29 @@ static void exofs_write_super(struct super_block *sb)
                sb->s_dirt = 0;
 }
+static void _exofs_print_device(const char *msg, const char *dev_path,
+                                struct osd_dev *od, u64 pid)
+{
+        const struct osd_dev_info *odi = osduld_device_info(od);
+        printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
+                msg, dev_path ?: "", odi->osdname, _LLU(pid));
+}
+void exofs_free_sbi(struct exofs_sb_info *sbi)
+{
+        while (sbi->s_numdevs) {
+                int i = --sbi->s_numdevs;
+                struct osd_dev *od = sbi->s_ods[i];
+                if (od) {
+                        sbi->s_ods[i] = NULL;
+                        osduld_put_device(od);
+                }
+        }
+        kfree(sbi);
+}
 /*
 * This function is called when the vfs is freeing the superblock.  We just
 * need to free our own part.
@@ -279,11 +298,182 @@ static void exofs_put_super(struct super_block *sb)
                                  msecs_to_jiffies(100));
        }
-        osduld_put_device(sbi->s_dev);
+        _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
-        kfree(sb->s_fs_info);
+        exofs_free_sbi(sbi);
        sb->s_fs_info = NULL;
 }
+static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
+                                    struct exofs_device_table *dt)
+{
+        sbi->data_map.odm_num_comps   =
+                                le32_to_cpu(dt->dt_data_map.cb_num_comps);
+        sbi->data_map.odm_stripe_unit =
+                                le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
+        sbi->data_map.odm_group_width =
+                                le32_to_cpu(dt->dt_data_map.cb_group_width);
+        sbi->data_map.odm_group_depth =
+                                le32_to_cpu(dt->dt_data_map.cb_group_depth);
+        sbi->data_map.odm_mirror_cnt  =
+                                le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
+        sbi->data_map.odm_raid_algorithm  =
+                                le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
+/* FIXME: Only a hard-coded mirror layout is supported for now; otherwise do not mount */
+        if ((sbi->data_map.odm_num_comps != numdevs) ||
+            (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
+            (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
+            (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
+                return -EINVAL;
+        else
+                return 0;
+}
+/* @odi is valid only as long as @dt_dev is valid (odi->osdname points into it) */
+static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
+                             struct osd_dev_info *odi)
+{
+        odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
+        memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+        odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
+        odi->osdname = dt_dev->osdname;
+        /* FIXME support long names. Will need a _put function */
+        if (dt_dev->long_name_offset)
+                return -EINVAL;
+        /* Make sure osdname is printable!
+         * mkexofs should give us space for a null-terminator else the
+         * device-table is invalid.
+         */
+        if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
+                odi->osdname_len = sizeof(dt_dev->osdname) - 1;
+        dt_dev->osdname[odi->osdname_len] = 0;
+        /* If it's all zeros something is bad we read past end-of-obj */
+        return !(odi->systemid_len || odi->osdname_len);
+}
+static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
+                                       unsigned table_count)
+{
+        struct exofs_sb_info *sbi = *psbi;
+        struct osd_dev *fscb_od;
+        struct osd_obj_id obj = {.partition = sbi->s_pid,
+                                 .id = EXOFS_DEVTABLE_ID};
+        struct exofs_device_table *dt;
+        unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
+                                             sizeof(*dt);
+        unsigned numdevs, i;
+        int ret;
+        dt = kmalloc(table_bytes, GFP_KERNEL);
+        if (unlikely(!dt)) {
+                EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
+                          table_bytes);
+                return -ENOMEM;
+        }
+        fscb_od = sbi->s_ods[0];
+        sbi->s_ods[0] = NULL;
+        sbi->s_numdevs = 0;
+        ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
+        if (unlikely(ret)) {
+                EXOFS_ERR("ERROR: reading device table\n");
+                goto out;
+        }
+        numdevs = le64_to_cpu(dt->dt_num_devices);
+        if (unlikely(!numdevs)) {
+                ret = -EINVAL;
+                goto out;
+        }
+        WARN_ON(table_count != numdevs);
+        ret = _read_and_match_data_map(sbi, numdevs, dt);
+        if (unlikely(ret))
+                goto out;
+        if (likely(numdevs > 1)) {
+                unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+                sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
+                if (unlikely(!sbi)) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+                *psbi = sbi;
+        }
+        for (i = 0; i < numdevs; i++) {
+                struct exofs_fscb fscb;
+                struct osd_dev_info odi;
+                struct osd_dev *od;
+                if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
+                        EXOFS_ERR("ERROR: Read all-zeros device entry\n");
+                        ret = -EINVAL;
+                        goto out;
+                }
+                printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
+                       i, odi.osdname);
+                /* On all devices the device table is identical. The user can
+                 * specify any one of the participating devices on the command
+                 * line. We always keep them in device-table order.
+                 */
+                if (fscb_od && osduld_device_same(fscb_od, &odi)) {
+                        sbi->s_ods[i] = fscb_od;
+                        ++sbi->s_numdevs;
+                        fscb_od = NULL;
+                        continue;
+                }
+                od = osduld_info_lookup(&odi);
+                if (unlikely(IS_ERR(od))) {
+                        ret = PTR_ERR(od);
+                        EXOFS_ERR("ERROR: device requested is not found "
+                                  "osd_name-%s =>%d\n", odi.osdname, ret);
+                        goto out;
+                }
+                sbi->s_ods[i] = od;
+                ++sbi->s_numdevs;
+                /* Read the fscb of the other devices to make sure the FS
+                 * partition is there.
+                 */
+                ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
+                                      sizeof(fscb));
+                if (unlikely(ret)) {
+                        EXOFS_ERR("ERROR: Malformed participating device "
+                                  "error reading fscb osd_name-%s\n",
+                                  odi.osdname);
+                        goto out;
+                }
+                /* TODO: verify other information is correct and FS-uuid
+                 *       matches. Benny what did you say about device table
+                 *       generation and old devices?
+                 */
+        }
+out:
+        kfree(dt);
+        if (unlikely(!ret && fscb_od)) {
+                EXOFS_ERR(
+                      "ERROR: Bad device-table container device not present\n");
+                osduld_put_device(fscb_od);
+                ret = -EINVAL;
+        }
+        return ret;
+}
 /*
 * Read the superblock from the OSD and fill in the fields
 */
@@ -292,24 +482,25 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
        struct inode *root;
        struct exofs_mountopt *opts = data;
        struct exofs_sb_info *sbi;      /*extended info                  */
+        struct osd_dev *od;             /* Master device                 */
        struct exofs_fscb fscb;         /*on-disk superblock info        */
-        struct osd_request *or = NULL;
        struct osd_obj_id obj;
+        unsigned table_count;
        int ret;
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
-        sb->s_fs_info = sbi;
        /* use mount options to fill superblock */
-        sbi->s_dev = osduld_path_lookup(opts->dev_name);
+        od = osduld_path_lookup(opts->dev_name);
-        if (IS_ERR(sbi->s_dev)) {
+        if (IS_ERR(od)) {
-                ret = PTR_ERR(sbi->s_dev);
+                ret = PTR_ERR(od);
-                sbi->s_dev = NULL;
                goto free_sbi;
        }
+        sbi->s_ods[0] = od;
+        sbi->s_numdevs = 1;
        sbi->s_pid = opts->pid;
        sbi->s_timeout = opts->timeout;
@@ -323,35 +514,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_bdev = NULL;
        sb->s_dev = 0;
-        /* read data from on-disk superblock object */
        obj.partition = sbi->s_pid;
        obj.id = EXOFS_SUPER_ID;
        exofs_make_credential(sbi->s_cred, &obj);
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+        ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
-        if (unlikely(!or)) {
+        if (unlikely(ret))
-                if (!silent)
-                        EXOFS_ERR(
-                               "exofs_fill_super: osd_start_request failed.\n");
-                ret = -ENOMEM;
-                goto free_sbi;
-        }
-        ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
-        if (unlikely(ret)) {
-                if (!silent)
-                        EXOFS_ERR(
-                               "exofs_fill_super: osd_req_read_kern failed.\n");
-                ret = -ENOMEM;
-                goto free_sbi;
-        }
-        ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
-        if (unlikely(ret)) {
-                if (!silent)
-                        EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
-                ret = -EIO;
                goto free_sbi;
-        }
        sb->s_magic = le16_to_cpu(fscb.s_magic);
        sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
@@ -364,12 +533,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
                ret = -EINVAL;
                goto free_sbi;
        }
+        if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+                EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
+                          EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
+                ret = -EINVAL;
+                goto free_sbi;
+        }
        /* start generation numbers from a random point */
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
+        table_count = le64_to_cpu(fscb.s_dev_table_count);
+        if (table_count) {
+                ret = exofs_read_lookup_dev_table(&sbi, table_count);
+                if (unlikely(ret))
+                        goto free_sbi;
+        }
        /* set up operation vectors */
+        sb->s_fs_info = sbi;
        sb->s_op = &exofs_sops;
        sb->s_export_op = &exofs_export_ops;
        root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
@@ -395,16 +578,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
                goto free_sbi;
        }
-        ret = 0;
+        _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
-out:
+                            sbi->s_pid);
-        if (or)
+        return 0;
-                osd_end_request(or);
-        return ret;
 free_sbi:
-        osduld_put_device(sbi->s_dev); /* NULL safe */
+        EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
-        kfree(sbi);
+                  opts->dev_name, sbi->s_pid, ret);
-        goto out;
+        exofs_free_sbi(sbi);
+        return ret;
 }
 /*
@@ -433,7 +615,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct exofs_sb_info *sbi = sb->s_fs_info;
-        struct osd_obj_id obj = {sbi->s_pid, 0};
+        struct exofs_io_state *ios;
        struct osd_attr attrs[] = {
                ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
                        OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -442,32 +624,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
        };
        uint64_t capacity = ULLONG_MAX;
        uint64_t used = ULLONG_MAX;
-        struct osd_request *or;
        uint8_t cred_a[OSD_CAP_LEN];
        int ret;
-        /* get used/capacity attributes */
+        ret = exofs_get_io_state(sbi, &ios);
-        exofs_make_credential(cred_a, &obj);
+        if (ret) {
+                EXOFS_DBGMSG("exofs_get_io_state failed.\n");
-        or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+                return ret;
-        if (unlikely(!or)) {
-                EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
-                return -ENOMEM;
        }
-        osd_req_get_attributes(or, &obj);
+        exofs_make_credential(cred_a, &ios->obj);
-        osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
+        ios->cred = sbi->s_cred;
-        ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+        ios->in_attr = attrs;
+        ios->in_attr_len = ARRAY_SIZE(attrs);
+        ret = exofs_sbi_read(ios);
        if (unlikely(ret))
                goto out;
-        ret = extract_attr_from_req(or, &attrs[0]);
+        ret = extract_attr_from_ios(ios, &attrs[0]);
-        if (likely(!ret))
+        if (likely(!ret)) {
                capacity = get_unaligned_be64(attrs[0].val_ptr);
-        else
+                if (unlikely(!capacity))
+                        capacity = ULLONG_MAX;
+        } else
                EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
-        ret = extract_attr_from_req(or, &attrs[1]);
+        ret = extract_attr_from_ios(ios, &attrs[1]);
        if (likely(!ret))
                used = get_unaligned_be64(attrs[1].val_ptr);
        else
@@ -476,15 +659,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
        /* fill in the stats buffer */
        buf->f_type = EXOFS_SUPER_MAGIC;
        buf->f_bsize = EXOFS_BLKSIZE;
-        buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
+        buf->f_blocks = capacity >> 9;
-        buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+        buf->f_bfree = (capacity - used) >> 9;
        buf->f_bavail = buf->f_bfree;
        buf->f_files = sbi->s_numfiles;
        buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
        buf->f_namelen = EXOFS_NAME_LEN;
 out:
-        osd_end_request(or);
+        exofs_put_io_state(ios);
        return ret;
 }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b47b30..2db957778903 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2033,7 +2033,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth,
        int k, err;
        *top = 0;
-        /* Make k index the deepest non-null offest + 1 */
+        /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext3_get_branch(inode, k, offsets, chain, &err);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9f2d45d75b1a..9acf7e808139 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,6 +26,16 @@ config EXT4_FS
          If unsure, say N.
+config EXT4_USE_FOR_EXT23
+        bool "Use ext4 for ext2/ext3 file systems"
+        depends on EXT3_FS=n || EXT2_FS=n
+        default y
+        help
+          Allow the ext4 file system driver code to be used for ext2 or
+          ext3 file system mounts.  This allows users to reduce their
+          compiled kernel size by using one file system driver for
+          ext2, ext3, and ext4 file systems.
 config EXT4_FS_XATTR
        bool "Ext4 extended attributes"
        depends on EXT4_FS
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8d..22bc7435d913 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -499,44 +499,6 @@ error_return:
 }
 /**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle:             handle for this transaction
- * @inode:              inode
- * @block:              start physical block to free
- * @count:              number of blocks to count
- * @metadata:           Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t block, unsigned long count,
-                        int metadata)
-{
-        struct super_block *sb;
-        unsigned long dquot_freed_blocks;
-        /* this isn't the right place to decide whether block is metadata
-         * inode.c/extents.c knows better, but for safety ... */
-        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                metadata = 1;
-        /* We need to make sure we don't reuse
-         * block released untill the transaction commit.
-         * writeback mode have weak data consistency so
-         * don't force data as metadata when freeing block
-         * for writeback mode.
-         */
-        if (metadata == 0 && !ext4_should_writeback_data(inode))
-                metadata = 1;
-        sb = inode->i_sb;
-        ext4_mb_free_blocks(handle, inode, block, count,
-                            metadata, &dquot_freed_blocks);
-        if (dquot_freed_blocks)
-                vfs_dq_free_block(inode, dquot_freed_blocks);
-        return;
-}
-/**
 * ext4_has_free_blocks()
 * @sbi:        in-core super block structure.
 * @nblocks:    number of needed blocks
@@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
                                        ext4_group_t group)
 {
-        return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+        if (!ext4_bg_has_super(sb, group))
+                return 0;
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+                return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+        else
+                return EXT4_SB(sb)->s_gdb_count;
 }
 /**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..4df8621ec31c 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
                if (ext4_bg_has_super(sb, i) &&
                    ((i < 5) || ((i % flex_size) == 0)))
                        add_system_zone(sbi, ext4_group_first_block_no(sb, i),
-                                        sbi->s_gdb_count + 1);
+                                        ext4_bg_num_gdb(sb, i) + 1);
                gdp = ext4_get_group_desc(sb, i, NULL);
                ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
                if (ret)
@@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
        struct rb_node *n = sbi->system_blks.rb_node;
        if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+            (start_blk + count < start_blk) ||
            (start_blk + count > ext4_blocks_count(sbi->s_es)))
                return 0;
        while (n) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8825515eeddd..ab31e65d46d0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -376,6 +376,12 @@ struct ext4_new_group_data {
                                         EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 /*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA       0x0001
+#define EXT4_FREE_BLOCKS_FORGET         0x0002
+/*
 * ioctl commands
 */
 #define EXT4_IOC_GETFLAGS               FS_IOC_GETFLAGS
@@ -703,6 +709,13 @@ struct ext4_inode_info {
        struct list_head i_aio_dio_complete_list;
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
+        /*
+         * Transactions that contain inode's metadata needed to complete
+         * fsync and fdatasync, respectively.
+         */
+        tid_t i_sync_tid;
+        tid_t i_datasync_tid;
 };
 /*
@@ -750,6 +763,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DELALLOC             0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD              0x40000000 /* Issue DISCARD requests */
 #define clear_opt(o, opt)               o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)                 o |= EXT4_MOUNT_##opt
@@ -1324,8 +1338,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1384,16 +1396,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
 extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
-extern void ext4_mb_free_blocks(handle_t *, struct inode *,
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                ext4_fsblk_t, unsigned long, int, unsigned long *);
+                             struct buffer_head *bh, ext4_fsblk_t block,
+                             unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
                                                ext4_group_t, int);
 /* inode.c */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920dee..b57e5c711b6d 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
 #include "ext4_jbd2.h"
+#include <trace/events/ext4.h>
 int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
        return err;
 }
-int __ext4_journal_forget(const char *where, handle_t *handle,
+/*
-                                struct buffer_head *bh)
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled.  Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call bforget() to put the buffer head.
+ */
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+                  struct inode *inode, struct buffer_head *bh,
+                  ext4_fsblk_t blocknr)
 {
-        int err = 0;
+        int err;
-        if (ext4_handle_valid(handle)) {
+        might_sleep();
-                err = jbd2_journal_forget(handle, bh);
-                if (err)
+        trace_ext4_forget(inode, is_metadata, blocknr);
-                        ext4_journal_abort_handle(where, __func__, bh,
+        BUFFER_TRACE(bh, "enter");
-                                                  handle, err);
-        }
+        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-        else
+                  "data mode %x\n",
+                  bh, is_metadata, inode->i_mode,
+                  test_opt(inode->i_sb, DATA_FLAGS));
+        /* In the no journal case, we can just do a bforget and return */
+        if (!ext4_handle_valid(handle)) {
                bforget(bh);
-        return err;
+                return 0;
-}
+        }
-int __ext4_journal_revoke(const char *where, handle_t *handle,
+        /* Never use the revoke function if we are doing full data
-                                ext4_fsblk_t blocknr, struct buffer_head *bh)
+         * journaling: there is no need to, and a V1 superblock won't
-{
+         * support it.  Otherwise, only skip the revoke on un-journaled
-        int err = 0;
+         * data blocks. */
-        if (ext4_handle_valid(handle)) {
+        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-                err = jbd2_journal_revoke(handle, blocknr, bh);
+            (!is_metadata && !ext4_should_journal_data(inode))) {
-                if (err)
+                if (bh) {
-                        ext4_journal_abort_handle(where, __func__, bh,
+                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                                                  handle, err);
+                        err = jbd2_journal_forget(handle, bh);
+                        if (err)
+                                ext4_journal_abort_handle(where, __func__, bh,
+                                                          handle, err);
+                        return err;
+                }
+                return 0;
        }
-        else
-                bforget(bh);
+        /*
+         * data!=journal && (is_metadata || should_journal_data(inode))
+         */
+        BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+        err = jbd2_journal_revoke(handle, blocknr, bh);
+        if (err) {
+                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+                ext4_abort(inode->i_sb, __func__,
+                           "error %d when attempting revoke", err);
+        }
+        BUFFER_TRACE(bh, "exit");
        return err;
 }
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a2865980342f..05eca817d704 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
 #define EXT4_DATA_TRANS_BLOCKS(sb)      (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
-                                         2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+                                         EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
 /*
 * Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
 * This include super block, inode block, quota blocks and xattr blocks
 */
 #define EXT4_META_TRANS_BLOCKS(sb)      (EXT4_XATTR_TRANS_BLOCKS + \
-                                        2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+                                        EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
 /* Delete operations potentially hit one directory's namespace plus an
 * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
@@ -92,6 +92,7 @@
 * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
                (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
                (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
 #else
@@ -99,6 +100,9 @@
 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
 #endif
+/*
+ * Per-quota-type block estimates scaled by MAXQUOTAS, i.e. covering every
+ * quota type at once.  Used instead of a hard-coded "2 *" multiplier.
+ */
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 int
 ext4_mark_iloc_dirty(handle_t *handle,
@@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
 /*
- * Wrapper functions with which ext4 calls into JBD.  The intent here is
+ * Wrapper functions with which ext4 calls into JBD.
- * to allow these to be turned into appropriate stubs so ext4 can control
- * ext2 filesystems, so ext2+ext4 systems only nee one fs.  This work hasn't
- * been done yet.
 */
 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
                struct buffer_head *bh, handle_t *handle, int err);
@@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
                                struct buffer_head *bh);
-/* When called with an invalid handle, this will still do a put on the BH */
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
-int __ext4_journal_forget(const char *where, handle_t *handle,
+                  struct inode *inode, struct buffer_head *bh,
-                                struct buffer_head *bh);
+                  ext4_fsblk_t blocknr);
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_revoke(const char *where, handle_t *handle,
-                                ext4_fsblk_t blocknr, struct buffer_head *bh);
 int __ext4_journal_get_create_access(const char *where,
                                handle_t *handle, struct buffer_head *bh);
@@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
        __ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
        __ext4_journal_get_write_access(__func__, (handle), (bh))
-#define ext4_journal_revoke(handle, blocknr, bh) \
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
-        __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
+        __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
+                      (block_nr))
 #define ext4_journal_get_create_access(handle, bh) \
        __ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_forget(handle, bh) \
-        __ext4_journal_forget(__func__, (handle), (bh))
 #define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
@@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
        return 0;
 }
+/*
+ * Remember which transaction has to commit before an f[data]sync of
+ * @inode can complete.  The tid of the transaction @handle belongs to is
+ * stored in i_sync_tid, and additionally in i_datasync_tid when @datasync
+ * is non-zero.  ext4_sync_file() later picks one of the two tids and
+ * starts/waits for the commit of that transaction.
+ *
+ * Nothing is recorded when ext4_handle_valid() reports the handle as not
+ * valid.
+ */
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+                                                 struct inode *inode,
+                                                 int datasync)
+{
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        if (ext4_handle_valid(handle)) {
+                ei->i_sync_tid = handle->h_transaction->t_tid;
+                if (datasync)
+                        ei->i_datasync_tid = handle->h_transaction->t_tid;
+        }
+}
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b4bae4..3a7928f825e4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1007,7 +1007,8 @@ cleanup:
                for (i = 0; i < depth; i++) {
                        if (!ablocks[i])
                                continue;
-                        ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
+                        ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+                                         EXT4_FREE_BLOCKS_METADATA);
                }
        }
        kfree(ablocks);
@@ -1761,7 +1762,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
        while (block < last && block != EXT_MAX_BLOCK) {
                num = last - block;
                /* find extent for this block */
+                down_read(&EXT4_I(inode)->i_data_sem);
                path = ext4_ext_find_extent(inode, block, path);
+                up_read(&EXT4_I(inode)->i_data_sem);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        path = NULL;
@@ -1957,7 +1960,6 @@ errout:
 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path)
 {
-        struct buffer_head *bh;
        int err;
        ext4_fsblk_t leaf;
@@ -1973,9 +1975,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        if (err)
                return err;
        ext_debug("index is empty, remove it, free block %llu\n", leaf);
-        bh = sb_find_get_block(inode->i_sb, leaf);
+        ext4_free_blocks(handle, inode, 0, leaf, 1,
-        ext4_forget(handle, 1, inode, bh, leaf);
+                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
-        ext4_free_blocks(handle, inode, leaf, 1, 1);
        return err;
 }
@@ -2042,12 +2043,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_extent *ex,
                                ext4_lblk_t from, ext4_lblk_t to)
 {
-        struct buffer_head *bh;
        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
-        int i, metadata = 0;
+        int flags = EXT4_FREE_BLOCKS_FORGET;
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                metadata = 1;
+                flags |= EXT4_FREE_BLOCKS_METADATA;
 #ifdef EXTENTS_STATS
        {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2072,11 +2072,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
                start = ext_pblock(ex) + ee_len - num;
                ext_debug("free last %u blocks starting %llu\n", num, start);
-                for (i = 0; i < num; i++) {
+                ext4_free_blocks(handle, inode, 0, start, num, flags);
-                        bh = sb_find_get_block(inode->i_sb, start + i);
-                        ext4_forget(handle, 0, inode, bh, start + i);
-                }
-                ext4_free_blocks(handle, inode, start, num, metadata);
        } else if (from == le32_to_cpu(ex->ee_block)
                   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
                printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2167,7 +2163,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        correct_index = 1;
                        credits += (ext_depth(inode)) + 1;
                }
-                credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
                err = ext4_ext_truncate_extend_restart(handle, inode, credits);
                if (err)
@@ -3064,6 +3060,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
                ret = ext4_convert_unwritten_extents_dio(handle, inode,
                                                        path);
+                if (ret >= 0)
+                        ext4_update_inode_fsync_trans(handle, inode, 1);
                goto out2;
        }
        /* buffered IO case */
@@ -3091,6 +3089,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        ret = ext4_ext_convert_to_initialized(handle, inode,
                                                path, iblock,
                                                max_blocks);
+        if (ret >= 0)
+                ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
        if (ret <= 0) {
                err = ret;
@@ -3319,8 +3319,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                /* not a good idea to call discard here directly,
                 * but otherwise we'd need to call it every free() */
                ext4_discard_preallocations(inode);
-                ext4_free_blocks(handle, inode, ext_pblock(&newex),
+                ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
-                                        ext4_ext_get_actual_len(&newex), 0);
+                                 ext4_ext_get_actual_len(&newex), 0);
                goto out2;
        }
@@ -3329,10 +3329,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        allocated = ext4_ext_get_actual_len(&newex);
        set_buffer_new(bh_result);
-        /* Cache only when it is _not_ an uninitialized extent */
+        /*
-        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+         * Cache the extent and update transaction to commit on fdatasync only
+         * when it is _not_ an uninitialized extent.
+         */
+        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
                ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
                                                EXT4_EXT_CACHE_EXTENT);
+                ext4_update_inode_fsync_trans(handle, inode, 1);
+        } else
+                ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
        if (allocated > max_blocks)
                allocated = max_blocks;
@@ -3720,10 +3726,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 * Walk the extent tree gathering extent information.
                 * ext4_ext_fiemap_cb will push extents back to user.
                 */
-                down_read(&EXT4_I(inode)->i_data_sem);
                error = ext4_ext_walk_space(inode, start_blk, len_blks,
                                          ext4_ext_fiemap_cb, fieinfo);
-                up_read(&EXT4_I(inode)->i_data_sem);
        }
        return error;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b1531266ee2..0b22497d92e1 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
        struct inode *inode = dentry->d_inode;
+        struct ext4_inode_info *ei = EXT4_I(inode);
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-        int err, ret = 0;
+        int ret;
+        tid_t commit_tid;
        J_ASSERT(ext4_journal_current_handle() == NULL);
        trace_ext4_sync_file(file, dentry, datasync);
+        if (inode->i_sb->s_flags & MS_RDONLY)
+                return 0;
        ret = flush_aio_dio_completed_IO(inode);
        if (ret < 0)
-                goto out;
+                return ret;
+        
+        if (!journal)
+                return simple_fsync(file, dentry, datasync);
        /*
-         * data=writeback:
+         * data=writeback,ordered:
         *  The caller's filemap_fdatawrite()/wait will sync the data.
-         *  sync_inode() will sync the metadata
+         *  Metadata is in the journal, we wait for proper transaction to
-         *
+         *  commit here.
-         * data=ordered:
-         *  The caller's filemap_fdatawrite() will write the data and
-         *  sync_inode() will write the inode if it is dirty.  Then the caller's
-         *  filemap_fdatawait() will wait on the pages.
         *
         * data=journal:
         *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -79,32 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
         *  (they were dirtied by commit).  But that's OK - the blocks are
         *  safe in-journal, which is all fsync() needs to ensure.
         */
-        if (ext4_should_journal_data(inode)) {
+        if (ext4_should_journal_data(inode))
-                ret = ext4_force_commit(inode->i_sb);
+                return ext4_force_commit(inode->i_sb);
-                goto out;
-        }
-        if (!journal)
+        commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-                ret = sync_mapping_buffers(inode->i_mapping);
+        if (jbd2_log_start_commit(journal, commit_tid))
+                jbd2_log_wait_commit(journal, commit_tid);
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+        else if (journal->j_flags & JBD2_BARRIER)
-                goto out;
-        /*
-         * The VFS has written the file data.  If the inode is unaltered
-         * then we need not start a commit.
-         */
-        if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-                struct writeback_control wbc = {
-                        .sync_mode = WB_SYNC_ALL,
-                        .nr_to_write = 0, /* sys_fsync did this */
-                };
-                err = sync_inode(inode, &wbc);
-                if (ret == 0)
-                        ret = err;
-        }
-out:
-        if (journal && (journal->j_flags & JBD2_BARRIER))
                blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
        return ret;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51addb..5352db1a3086 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -71,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 }
 /*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled.  Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
-        int err;
-        might_sleep();
-        BUFFER_TRACE(bh, "enter");
-        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-                  "data mode %x\n",
-                  bh, is_metadata, inode->i_mode,
-                  test_opt(inode->i_sb, DATA_FLAGS));
-        /* Never use the revoke function if we are doing full data
-         * journaling: there is no need to, and a V1 superblock won't
-         * support it.  Otherwise, only skip the revoke on un-journaled
-         * data blocks. */
-        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-            (!is_metadata && !ext4_should_journal_data(inode))) {
-                if (bh) {
-                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                        return ext4_journal_forget(handle, bh);
-                }
-                return 0;
-        }
-        /*
-         * data!=journal && (is_metadata || should_journal_data(inode))
-         */
-        BUFFER_TRACE(bh, "call ext4_journal_revoke");
-        err = ext4_journal_revoke(handle, blocknr, bh);
-        if (err)
-                ext4_abort(inode->i_sb, __func__,
-                           "error %d when attempting revoke", err);
-        BUFFER_TRACE(bh, "exit");
-        return err;
-}
-/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
@@ -721,7 +669,7 @@ allocated:
        return ret;
 failed_out:
        for (i = 0; i < index; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
        return ret;
 }
@@ -817,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
        return err;
 failed:
        /* Allocation failed, free what we already allocated */
+        ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
        for (i = 1; i <= n ; i++) {
-                BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
+                /* 
-                ext4_journal_forget(handle, branch[i].bh);
+                 * branch[i].bh is newly allocated, so there is no
+                 * need to revoke the block, which is why we don't
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
-        for (i = 0; i < indirect_blks; i++)
+        for (i = n+1; i < indirect_blks; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
-        ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+        ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
        return err;
 }
@@ -903,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 err_out:
        for (i = 1; i <= num; i++) {
-                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
+                /* 
-                ext4_journal_forget(handle, where[i].bh);
+                 * where[i].bh is newly allocated, so there is no
-                ext4_free_blocks(handle, inode,
+                 * need to revoke the block, which is why we don't
-                                        le32_to_cpu(where[i-1].key), 1, 0);
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
-        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+                         blks, 0);
        return err;
 }
@@ -1021,10 +979,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                         partial, indirect_blks, count);
-        else
+        if (err)
                goto cleanup;
        set_buffer_new(bh_result);
+        ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
@@ -1052,7 +1012,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
                EXT4_I(inode)->i_reserved_meta_blocks;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        return total;
+        return (total << inode->i_blkbits);
 }
 /*
 * Calculate the number of metadata blocks need to reserve
@@ -1534,6 +1494,16 @@ static int do_journal_get_write_access(handle_t *handle,
        return ext4_journal_get_write_access(handle, bh);
 }
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ *
+ * The page cache past i_size is dropped first and only then is
+ * ext4_truncate() called on the inode.  The write_begin/write_end paths
+ * call this instead of a bare ext4_truncate() when
+ * pos + len > inode->i_size.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+        truncate_inode_pages(inode->i_mapping, inode->i_size);
+        ext4_truncate(inode);
+}
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
@@ -1599,7 +1569,7 @@ retry:
                ext4_journal_stop(handle);
                if (pos + len > inode->i_size) {
-                        ext4_truncate(inode);
+                        ext4_truncate_failed_write(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
@@ -1709,7 +1679,7 @@ static int ext4_ordered_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@ -1751,7 +1721,7 @@ static int ext4_writeback_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@ -1814,7 +1784,7 @@ static int ext4_journalled_write_end(struct file *file,
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size) {
-                ext4_truncate(inode);
+                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@ -2600,7 +2570,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 }
 static int __ext4_journalled_writepage(struct page *page,
-                                       struct writeback_control *wbc,
                                       unsigned int len)
 {
        struct address_space *mapping = page->mapping;
@@ -2758,7 +2727,7 @@ static int ext4_writepage(struct page *page,
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-                return __ext4_journalled_writepage(page, wbc, len);
+                return __ext4_journalled_writepage(page, len);
        }
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2788,7 +2757,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
         * number of contiguous block. So we will limit
         * number of contiguous block to a sane value
         */
-        if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
            (max_blocks > EXT4_MAX_TRANS_DATA))
                max_blocks = EXT4_MAX_TRANS_DATA;
@@ -2933,7 +2902,7 @@ retry:
                ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
                                        &mpd);
                /*
-                 * If we have a contigous extent of pages and we
+                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
@@ -3091,7 +3060,7 @@ retry:
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
-                        ext4_truncate(inode);
+                        ext4_truncate_failed_write(inode);
        }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -4064,7 +4033,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
        int k, err;
        *top = 0;
-        /* Make k index the deepest non-null offest + 1 */
+        /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4120,6 +4089,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
                              __le32 *last)
 {
        __le32 *p;
+        int     flags = EXT4_FREE_BLOCKS_FORGET;
+        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+                flags |= EXT4_FREE_BLOCKS_METADATA;
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4134,27 +4108,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
                }
        }
-        /*
+        for (p = first; p < last; p++)
-         * Any buffers which are on the journal will be in memory. We
+                *p = 0;
-         * find them on the hash table so jbd2_journal_revoke() will
-         * run jbd2_journal_forget() on them.  We've already detached
-         * each block from the file, so bforget() in
-         * jbd2_journal_forget() should be safe.
-         *
-         * AKPM: turn on bforget in jbd2_journal_forget()!!!
-         */
-        for (p = first; p < last; p++) {
-                u32 nr = le32_to_cpu(*p);
-                if (nr) {
-                        struct buffer_head *tbh;
-                        *p = 0;
-                        tbh = sb_find_get_block(inode->i_sb, nr);
-                        ext4_forget(handle, 0, inode, tbh, nr);
-                }
-        }
-        ext4_free_blocks(handle, inode, block_to_free, count, 0);
+        ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
 }
 /**
@@ -4342,7 +4299,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                            blocks_for_truncate(inode));
                        }
-                        ext4_free_blocks(handle, inode, nr, 1, 1);
+                        ext4_free_blocks(handle, inode, 0, nr, 1,
+                                         EXT4_FREE_BLOCKS_METADATA);
                        if (parent_bh) {
                                /*
@@ -4781,8 +4739,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
-        struct buffer_head *bh;
        struct inode *inode;
+        journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        int block;
@@ -4793,11 +4751,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                return inode;
        ei = EXT4_I(inode);
+        iloc.bh = 0;
        ret = __ext4_get_inode_loc(inode, &iloc, 0);
        if (ret < 0)
                goto bad_inode;
-        bh = iloc.bh;
        raw_inode = ext4_raw_inode(&iloc);
        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4820,7 +4778,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                if (inode->i_mode == 0 ||
                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
                        /* this inode is deleted */
-                        brelse(bh);
                        ret = -ESTALE;
                        goto bad_inode;
                }
@@ -4848,11 +4805,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
+        /*
+         * Set transaction id's of transactions that have to be committed
+         * to finish f[data]sync. We set them to currently running transaction
+         * as we cannot be sure that the inode or some of its metadata isn't
+         * part of the transaction - the inode could have been reclaimed and
+         * now it is reread from disk.
+         */
+        if (journal) {
+                transaction_t *transaction;
+                tid_t tid;
+                spin_lock(&journal->j_state_lock);
+                if (journal->j_running_transaction)
+                        transaction = journal->j_running_transaction;
+                else
+                        transaction = journal->j_committing_transaction;
+                if (transaction)
+                        tid = transaction->t_tid;
+                else
+                        tid = journal->j_commit_sequence;
+                spin_unlock(&journal->j_state_lock);
+                ei->i_sync_tid = tid;
+                ei->i_datasync_tid = tid;
+        }
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                    EXT4_INODE_SIZE(inode->i_sb)) {
-                        brelse(bh);
                        ret = -EIO;
                        goto bad_inode;
                }
@@ -4884,10 +4865,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ret = 0;
        if (ei->i_file_acl &&
-            ((ei->i_file_acl <
+            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-              (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-               EXT4_SB(sb)->s_gdb_count)) ||
-             (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
                ext4_error(sb, __func__,
                           "bad extended attribute block %llu in inode #%lu",
                           ei->i_file_acl, inode->i_ino);
@@ -4905,10 +4883,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                /* Validate block references which are part of inode */
                ret = ext4_check_inode_blockref(inode);
        }
-        if (ret) {
+        if (ret)
-                brelse(bh);
                goto bad_inode;
-        }
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
@@ -4936,7 +4912,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else {
-                brelse(bh);
                ret = -EIO;
                ext4_error(inode->i_sb, __func__,
                           "bogus i_mode (%o) for inode=%lu",
@@ -4949,6 +4924,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        return inode;
 bad_inode:
+        brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
 }
@@ -5108,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
                err = rc;
        ei->i_state &= ~EXT4_STATE_NEW;
+        ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
        brelse(bh);
        ext4_std_error(inode->i_sb, err);
@@ -5227,8 +5204,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
-                handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
+                handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-                                        EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+                                        EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
@@ -5376,7 +5353,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 * worse case, the indexs blocks spread over different block groups
 *
 * If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
 * they could still across block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks
@@ -5452,7 +5429,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * journal buffers for data blocks are not included here, as DIO
 * and fallocate do no need to journal data buffers.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e725..b63d193126db 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -221,31 +221,38 @@ setversion_out:
                struct file *donor_filp;
                int err;
+                if (!(filp->f_mode & FMODE_READ) ||
+                    !(filp->f_mode & FMODE_WRITE))
+                        return -EBADF;
                if (copy_from_user(&me,
                        (struct move_extent __user *)arg, sizeof(me)))
                        return -EFAULT;
+                me.moved_len = 0;
                donor_filp = fget(me.donor_fd);
                if (!donor_filp)
                        return -EBADF;
-                if (!capable(CAP_DAC_OVERRIDE)) {
+                if (!(donor_filp->f_mode & FMODE_WRITE)) {
-                        if ((current->real_cred->fsuid != inode->i_uid) ||
+                        err = -EBADF;
-                                !(inode->i_mode & S_IRUSR) ||
+                        goto mext_out;
-                                !(donor_filp->f_dentry->d_inode->i_mode &
-                                S_IRUSR)) {
-                                fput(donor_filp);
-                                return -EACCES;
-                        }
                }
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
+                        goto mext_out;
                err = ext4_move_extents(filp, donor_filp, me.orig_start,
                                        me.donor_start, me.len, &me.moved_len);
-                fput(donor_filp);
+                mnt_drop_write(filp->f_path.mnt);
+                if (me.moved_len > 0)
+                        file_remove_suid(donor_filp);
                if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
-                        return -EFAULT;
+                        err = -EFAULT;
+mext_out:
+                fput(donor_filp);
                return err;
        }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824defa..c1e19d5b5985 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -142,7 +142,7 @@
 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
- * stripe size (sbi->s_stripe), we try to search for contigous block in
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
 * stripe size. This should result in better allocation on RAID setups. If
 * not, we search in the specific group using bitmap for best extents. The
 * tunable min_to_scan and max_to_scan control the behaviour here.
@@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        struct ext4_group_info *db;
        int err, count = 0, count2 = 0;
        struct ext4_free_data *entry;
-        ext4_fsblk_t discard_block;
        struct list_head *l, *ltmp;
        list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                        page_cache_release(e4b.bd_bitmap_page);
                }
                ext4_unlock_group(sb, entry->group);
-                discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+                if (test_opt(sb, DISCARD)) {
-                        + entry->start_blk
+                        ext4_fsblk_t discard_block;
-                        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+                        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-                trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
-                                          entry->count);
+                        discard_block = (ext4_fsblk_t)entry->group *
-                sb_issue_discard(sb, discard_block, entry->count);
+                                                EXT4_BLOCKS_PER_GROUP(sb)
+                                        + entry->start_blk
+                                        + le32_to_cpu(es->s_first_data_block);
+                        trace_ext4_discard_blocks(sb,
+                                        (unsigned long long)discard_block,
+                                        entry->count);
+                        sb_issue_discard(sb, discard_block, entry->count);
+                }
                kmem_cache_free(ext4_free_ext_cachep, entry);
                ext4_mb_release_desc(&e4b);
        }
@@ -3006,6 +3011,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 }
 /*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context.  We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+        struct ext4_prealloc_space *pa = ac->ac_pa; /* checked for NULL below */
+        int len;
+        if (pa && pa->pa_type == MB_INODE_PA) { /* only inode PAs are adjusted here */
+                len = ac->ac_b_ex.fe_len; /* must be read before the caller zeroes fe_len */
+                pa->pa_free += len; /* add that length back to the PA's free count */
+        }
+}
+/*
 * use blocks preallocated to inode
 */
 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@ -4290,6 +4313,7 @@ repeat:
                        ac->ac_status = AC_STATUS_CONTINUE;
                        goto repeat;
                } else if (*errp) {
+                        ext4_discard_allocated_blocks(ac);
                        ac->ac_b_ex.fe_len = 0;
                        ar->len = 0;
                        ext4_mb_show_ac(ac);
@@ -4422,18 +4446,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        return 0;
 }
-/*
+/**
- * Main entry point into mballoc to free blocks
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle:             handle for this transaction
+ * @inode:              inode
+ * @bh:                 optional buffer head for the block to free
+ * @block:              start physical block to free
+ * @count:              number of blocks to free
+ * @flags:              EXT4_FREE_BLOCKS_* flags (METADATA, FORGET)
 */
-void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t block, unsigned long count,
+                      struct buffer_head *bh, ext4_fsblk_t block,
-                        int metadata, unsigned long *freed)
+                      unsigned long count, int flags)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
+        unsigned long freed = 0;
        unsigned int overflow;
        ext4_grpblk_t bit;
        struct buffer_head *gd_bh;
@@ -4443,13 +4473,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        int err = 0;
        int ret;
-        *freed = 0;
+        if (bh) {
+                if (block)
+                        BUG_ON(block != bh->b_blocknr);
+                else
+                        block = bh->b_blocknr;
+        }
        sbi = EXT4_SB(sb);
        es = EXT4_SB(sb)->s_es;
-        if (block < le32_to_cpu(es->s_first_data_block) ||
+        if (!ext4_data_block_valid(sbi, block, count)) {
-            block + count < block ||
-            block + count > ext4_blocks_count(es)) {
                ext4_error(sb, __func__,
                            "Freeing blocks not in datazone - "
                            "block = %llu, count = %lu", block, count);
@@ -4457,7 +4490,32 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        }
        ext4_debug("freeing block %llu\n", block);
-        trace_ext4_free_blocks(inode, block, count, metadata);
+        trace_ext4_free_blocks(inode, block, count, flags);
+        if (flags & EXT4_FREE_BLOCKS_FORGET) {
+                struct buffer_head *tbh = bh;
+                int i;
+                BUG_ON(bh && (count > 1));
+                for (i = 0; i < count; i++) {
+                        if (!bh)
+                                tbh = sb_find_get_block(inode->i_sb,
+                                                        block + i);
+                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+                                    inode, tbh, block + i);
+                }
+        }
+        /* 
+         * We need to make sure we don't reuse the freed block until
+         * after the transaction is committed, which we can do by
+         * treating the block as metadata, below.  We make an
+         * exception if the inode is to be written in writeback mode
+         * since writeback mode has weak data consistency guarantees.
+         */
+        if (!ext4_should_writeback_data(inode))
+                flags |= EXT4_FREE_BLOCKS_METADATA;
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (ac) {
@@ -4533,7 +4591,8 @@ do_more:
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
                goto error_return;
-        if (metadata && ext4_handle_valid(handle)) {
+        if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
                struct ext4_free_data *new_entry;
                /*
                 * blocks being freed are metadata. these blocks shouldn't
@@ -4572,7 +4631,7 @@ do_more:
        ext4_mb_release_desc(&e4b);
-        *freed += count;
+        freed += count;
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4592,6 +4651,8 @@ do_more:
        }
        sb->s_dirt = 1;
 error_return:
+        if (freed)
+                vfs_dq_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
        if (ac)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b80f3e2..81415814b00b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
         * So allocate a credit of 3. We may update
         * quota (user and group).
         */
-        needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+        needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
        if (ext4_journal_extend(handle, needed) != 0)
                retval = ext4_journal_restart(handle, needed);
@@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle,
        for (i = 0; i < max_entries; i++) {
                if (tmp_idata[i]) {
                        extend_credit_for_blkdel(handle, inode);
-                        ext4_free_blocks(handle, inode,
+                        ext4_free_blocks(handle, inode, 0,
-                                        le32_to_cpu(tmp_idata[i]), 1, 1);
+                                         le32_to_cpu(tmp_idata[i]), 1,
+                                         EXT4_FREE_BLOCKS_METADATA |
+                                         EXT4_FREE_BLOCKS_FORGET);
                }
        }
        put_bh(bh);
        extend_credit_for_blkdel(handle, inode);
-        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+                         EXT4_FREE_BLOCKS_METADATA |
+                         EXT4_FREE_BLOCKS_FORGET);
        return 0;
 }
@@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle,
        }
        put_bh(bh);
        extend_credit_for_blkdel(handle, inode);
-        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+                         EXT4_FREE_BLOCKS_METADATA |
+                         EXT4_FREE_BLOCKS_FORGET);
        return 0;
 }
@@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
        /* ei->i_data[EXT4_IND_BLOCK] */
        if (i_data[0]) {
                extend_credit_for_blkdel(handle, inode);
-                ext4_free_blocks(handle, inode,
+                ext4_free_blocks(handle, inode, 0,
-                                le32_to_cpu(i_data[0]), 1, 1);
+                                le32_to_cpu(i_data[0]), 1,
+                                 EXT4_FREE_BLOCKS_METADATA |
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
        /* ei->i_data[EXT4_DIND_BLOCK] */
@@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
        }
        put_bh(bh);
        extend_credit_for_blkdel(handle, inode);
-        ext4_free_blocks(handle, inode, block, 1, 1);
+        ext4_free_blocks(handle, inode, 0, block, 1,
+                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
        return retval;
 }
@@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode)
        handle = ext4_journal_start(inode,
                                        EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
                                        + 1);
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 25b6b1457360..82c415be87a4 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                      struct ext4_extent **extent)
 {
+        struct ext4_extent_header *eh;
        int ppos, leaf_ppos = path->p_depth;
        ppos = leaf_ppos;
        if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
                /* leaf block */
                *extent = ++path[ppos].p_ext;
+                path[ppos].p_block = ext_pblock(path[ppos].p_ext);
                return 0;
        }
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                                        ext_block_hdr(path[cur_ppos+1].p_bh);
                        }
+                        path[leaf_ppos].p_ext = *extent = NULL;
+                        eh = path[leaf_ppos].p_hdr;
+                        if (le16_to_cpu(eh->eh_entries) == 0)
+                                /* empty leaf is found */
+                                return -ENODATA;
                        /* leaf block */
                        path[leaf_ppos].p_ext = *extent =
                                EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+                        path[leaf_ppos].p_block =
+                                        ext_pblock(path[leaf_ppos].p_ext);
                        return 0;
                }
        }
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
 }
 /**
- * mext_double_down_read - Acquire two inodes' read semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
- *
- * @orig_inode:         original inode structure
- * @donor_inode:        donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-        struct inode *first = orig_inode, *second = donor_inode;
-        /*
-         * Use the inode number to provide the stable locking order instead
-         * of its address, because the C language doesn't guarantee you can
-         * compare pointers that don't come from the same array.
-         */
-        if (donor_inode->i_ino < orig_inode->i_ino) {
-                first = donor_inode;
-                second = orig_inode;
-        }
-        down_read(&EXT4_I(first)->i_data_sem);
-        down_read(&EXT4_I(second)->i_data_sem);
-}
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
 *
 * @orig_inode:         original inode structure
 * @donor_inode:        donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
 */
 static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
        struct inode *first = orig_inode, *second = donor_inode;
@@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
        }
        down_write(&EXT4_I(first)->i_data_sem);
-        down_write(&EXT4_I(second)->i_data_sem);
+        down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
 }
 /**
- * mext_double_up_read - Release two inodes' read semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
 *
 * @orig_inode:         original inode structure to be released its lock first
 * @donor_inode:        donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
 */
 static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
-{
-        up_read(&EXT4_I(orig_inode)->i_data_sem);
-        up_read(&EXT4_I(donor_inode)->i_data_sem);
-}
-/**
- * mext_double_up_write - Release two inodes' write semaphore
- *
- * @orig_inode:         original inode structure to be released its lock first
- * @donor_inode:        donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
 {
        up_write(&EXT4_I(orig_inode)->i_data_sem);
        up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -596,7 +568,7 @@ out:
 * @tmp_oext:           the extent that will belong to the donor inode
 * @orig_off:           block offset of original inode
 * @donor_off:          block offset of donor inode
- * @max_count:          the maximun length of extents
+ * @max_count:          the maximum length of extents
 *
 * Return 0 on success, or a negative error value on failure.
 */
@@ -661,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 * @donor_inode:        donor inode
 * @from:               block offset of orig_inode
 * @count:              block count to be replaced
+ * @err:                pointer to save return value
 *
 * Replace original inode extents and donor inode extents page by page.
 * We implement this replacement in the following three steps:
@@ -671,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 * 3. Change the block information of donor inode to point at the saved
 *    original inode blocks in the dummy extents.
 *
- * Return 0 on success, or a negative error value on failure.
+ * Return replaced block count.
 */
 static int
 mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                           struct inode *donor_inode, ext4_lblk_t from,
-                           ext4_lblk_t count)
+                           ext4_lblk_t count, int *err)
 {
        struct ext4_ext_path *orig_path = NULL;
        struct ext4_ext_path *donor_path = NULL;
        struct ext4_extent *oext, *dext;
        struct ext4_extent tmp_dext, tmp_oext;
        ext4_lblk_t orig_off = from, donor_off = from;
-        int err = 0;
        int depth;
        int replaced_count = 0;
        int dext_alen;
-        mext_double_down_write(orig_inode, donor_inode);
+        /* Protect extent trees against block allocations via delalloc */
+        double_down_write_data_sem(orig_inode, donor_inode);
        /* Get the original extent for the block "orig_off" */
-        err = get_ext_path(orig_inode, orig_off, &orig_path);
+        *err = get_ext_path(orig_inode, orig_off, &orig_path);
-        if (err)
+        if (*err)
                goto out;
        /* Get the donor extent for the head */
-        err = get_ext_path(donor_inode, donor_off, &donor_path);
+        *err = get_ext_path(donor_inode, donor_off, &donor_path);
-        if (err)
+        if (*err)
                goto out;
        depth = ext_depth(orig_inode);
        oext = orig_path[depth].p_ext;
@@ -707,9 +680,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
        dext = donor_path[depth].p_ext;
        tmp_dext = *dext;
-        err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+        *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
                                      donor_off, count);
-        if (err)
+        if (*err)
                goto out;
        /* Loop for the donor extents */
@@ -718,7 +691,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                if (!dext) {
                        ext4_error(donor_inode->i_sb, __func__,
                                   "The extent for donor must be found");
-                        err = -EIO;
+                        *err = -EIO;
                        goto out;
                } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
                        ext4_error(donor_inode->i_sb, __func__,
@@ -726,20 +699,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                                "extent(%u) should be equal",
                                donor_off,
                                le32_to_cpu(tmp_dext.ee_block));
-                        err = -EIO;
+                        *err = -EIO;
                        goto out;
                }
                /* Set donor extent to orig extent */
-                err = mext_leaf_block(handle, orig_inode,
+                *err = mext_leaf_block(handle, orig_inode,
                                           orig_path, &tmp_dext, &orig_off);
-                if (err < 0)
+                if (*err)
                        goto out;
                /* Set orig extent to donor extent */
-                err = mext_leaf_block(handle, donor_inode,
+                *err = mext_leaf_block(handle, donor_inode,
                                           donor_path, &tmp_oext, &donor_off);
-                if (err < 0)
+                if (*err)
                        goto out;
                dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +726,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
                if (orig_path)
                        ext4_ext_drop_refs(orig_path);
-                err = get_ext_path(orig_inode, orig_off, &orig_path);
+                *err = get_ext_path(orig_inode, orig_off, &orig_path);
-                if (err)
+                if (*err)
                        goto out;
                depth = ext_depth(orig_inode);
                oext = orig_path[depth].p_ext;
-                if (le32_to_cpu(oext->ee_block) +
-                                ext4_ext_get_actual_len(oext) <= orig_off) {
-                        err = 0;
-                        goto out;
-                }
                tmp_oext = *oext;
                if (donor_path)
                        ext4_ext_drop_refs(donor_path);
-                err = get_ext_path(donor_inode, donor_off, &donor_path);
+                *err = get_ext_path(donor_inode, donor_off, &donor_path);
-                if (err)
+                if (*err)
                        goto out;
                depth = ext_depth(donor_inode);
                dext = donor_path[depth].p_ext;
-                if (le32_to_cpu(dext->ee_block) +
-                                ext4_ext_get_actual_len(dext) <= donor_off) {
-                        err = 0;
-                        goto out;
-                }
                tmp_dext = *dext;
-                err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+                *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
                                           donor_off, count - replaced_count);
-                if (err)
+                if (*err)
                        goto out;
        }
@@ -795,8 +758,12 @@ out:
                kfree(donor_path);
        }
-        mext_double_up_write(orig_inode, donor_inode);
+        ext4_ext_invalidate_cache(orig_inode);
-        return err;
+        ext4_ext_invalidate_cache(donor_inode);
+        double_up_write_data_sem(orig_inode, donor_inode);
+        return replaced_count;
 }
 /**
@@ -808,16 +775,17 @@ out:
 * @data_offset_in_page:        block index where data swapping starts
 * @block_len_in_page:          the number of blocks to be swapped
 * @uninit:                     orig extent is uninitialized or not
+ * @err:                        pointer to save return value
 *
 * Save the data in original inode blocks and replace original inode extents
 * with donor inode extents by calling mext_replace_branches().
- * Finally, write out the saved data in new original inode blocks. Return 0
+ * Finally, write out the saved data in new original inode blocks. Return
- * on success, or a negative error value on failure.
+ * replaced block count.
 */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
                  pgoff_t orig_page_offset, int data_offset_in_page,
-                  int block_len_in_page, int uninit)
+                  int block_len_in_page, int uninit, int *err)
 {
        struct inode *orig_inode = o_filp->f_dentry->d_inode;
        struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +797,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
        unsigned long blocksize = orig_inode->i_sb->s_blocksize;
        unsigned int w_flags = 0;
-        unsigned int tmp_data_len, data_len;
+        unsigned int tmp_data_size, data_size, replaced_size;
        void *fsdata;
-        int ret, i, jblocks;
+        int i, jblocks;
+        int err2 = 0;
+        int replaced_count = 0;
        int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
        /*
@@ -841,8 +811,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
        handle = ext4_journal_start(orig_inode, jblocks);
        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
+                *err = PTR_ERR(handle);
-                return ret;
+                return 0;
        }
        if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +828,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
         * Just swap data blocks between orig and donor.
         */
        if (uninit) {
-                ret = mext_replace_branches(handle, orig_inode,
+                replaced_count = mext_replace_branches(handle, orig_inode,
-                                                 donor_inode, orig_blk_offset,
+                                                donor_inode, orig_blk_offset,
-                                                 block_len_in_page);
+                                                block_len_in_page, err);
-                /* Clear the inode cache not to refer to the old data */
-                ext4_ext_invalidate_cache(orig_inode);
-                ext4_ext_invalidate_cache(donor_inode);
                goto out2;
        }
        offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-        /* Calculate data_len */
+        /* Calculate data_size */
        if ((orig_blk_offset + block_len_in_page - 1) ==
            ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
                /* Replace the last block */
-                tmp_data_len = orig_inode->i_size & (blocksize - 1);
+                tmp_data_size = orig_inode->i_size & (blocksize - 1);
                /*
-                 * If data_len equal zero, it shows data_len is multiples of
+                 * If tmp_data_size equals zero, i_size is a multiple of
                 * blocksize. So we set appropriate value.
                 */
-                if (tmp_data_len == 0)
+                if (tmp_data_size == 0)
-                        tmp_data_len = blocksize;
+                        tmp_data_size = blocksize;
-                data_len = tmp_data_len +
+                data_size = tmp_data_size +
                        ((block_len_in_page - 1) << orig_inode->i_blkbits);
-        } else {
+        } else
-                data_len = block_len_in_page << orig_inode->i_blkbits;
+                data_size = block_len_in_page << orig_inode->i_blkbits;
-        }
+        replaced_size = data_size;
-        ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+        *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
                                 &page, &fsdata);
-        if (unlikely(ret < 0))
+        if (unlikely(*err < 0))
                goto out;
        if (!PageUptodate(page)) {
@@ -911,14 +878,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        /* Release old bh and drop refs */
        try_to_release_page(page, 0);
-        ret = mext_replace_branches(handle, orig_inode, donor_inode,
+        replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-                                         orig_blk_offset, block_len_in_page);
+                                        orig_blk_offset, block_len_in_page,
-        if (ret < 0)
+                                        &err2);
-                goto out;
+        if (err2) {
+                if (replaced_count) {
-        /* Clear the inode cache not to refer to the old data */
+                        block_len_in_page = replaced_count;
-        ext4_ext_invalidate_cache(orig_inode);
+                        replaced_size =
-        ext4_ext_invalidate_cache(donor_inode);
+                                block_len_in_page << orig_inode->i_blkbits;
+                } else
+                        goto out;
+        }
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +898,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
                bh = bh->b_this_page;
        for (i = 0; i < block_len_in_page; i++) {
-                ret = ext4_get_block(orig_inode,
+                *err = ext4_get_block(orig_inode,
                                (sector_t)(orig_blk_offset + i), bh, 0);
-                if (ret < 0)
+                if (*err < 0)
                        goto out;
                if (bh->b_this_page != NULL)
                        bh = bh->b_this_page;
        }
-        ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+        *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
                               page, fsdata);
        page = NULL;
@@ -951,7 +921,10 @@ out:
 out2:
        ext4_journal_stop(handle);
-        return ret < 0 ? ret : 0;
+        if (err2)
+                *err = err2;
+        return replaced_count;
 }
 /**
@@ -962,7 +935,6 @@ out2:
 * @orig_start:         logical start offset in block for orig
 * @donor_start:        logical start offset in block for donor
 * @len:                the number of blocks to be moved
- * @moved_len:          moved block length
 *
 * Check the arguments of ext4_move_extents() whether the files can be
 * exchanged with each other.
@@ -970,8 +942,8 @@ out2:
 */
 static int
 mext_check_arguments(struct inode *orig_inode,
-                          struct inode *donor_inode, __u64 orig_start,
+                     struct inode *donor_inode, __u64 orig_start,
-                          __u64 donor_start, __u64 *len, __u64 moved_len)
+                     __u64 donor_start, __u64 *len)
 {
        ext4_lblk_t orig_blocks, donor_blocks;
        unsigned int blkbits = orig_inode->i_blkbits;
@@ -985,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
                return -EINVAL;
        }
+        if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+                ext4_debug("ext4 move extent: suid or sgid is set"
+                           " to donor file [ino:orig %lu, donor %lu]\n",
+                           orig_inode->i_ino, donor_inode->i_ino);
+                return -EINVAL;
+        }
        /* Ext4 move extent does not support swapfile */
        if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
                ext4_debug("ext4 move extent: The argument files should "
@@ -1025,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
                return -EINVAL;
        }
-        if (moved_len) {
-                ext4_debug("ext4 move extent: moved_len should be 0 "
-                        "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
-                        donor_inode->i_ino);
-                return -EINVAL;
-        }
        if ((orig_start > EXT_MAX_BLOCK) ||
            (donor_start > EXT_MAX_BLOCK) ||
            (*len > EXT_MAX_BLOCK) ||
@@ -1088,7 +1060,7 @@ mext_check_arguments(struct inode *orig_inode,
        }
        if (!*len) {
-                ext4_debug("ext4 move extent: len shoudld not be 0 "
+                ext4_debug("ext4 move extent: len should not be 0 "
                        "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
                        donor_inode->i_ino);
                return -EINVAL;
@@ -1232,16 +1204,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                return -EINVAL;
        }
-        /* protect orig and donor against a truncate */
+        /* Protect orig and donor inodes against a truncate */
        ret1 = mext_inode_double_lock(orig_inode, donor_inode);
        if (ret1 < 0)
                return ret1;
-        mext_double_down_read(orig_inode, donor_inode);
+        /* Protect extent tree against block allocations via delalloc */
+        double_down_write_data_sem(orig_inode, donor_inode);
        /* Check the filesystem environment whether move_extent can be done */
        ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
-                                        donor_start, &len, *moved_len);
+                                    donor_start, &len);
-        mext_double_up_read(orig_inode, donor_inode);
        if (ret1)
                goto out;
@@ -1355,36 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                seq_start = le32_to_cpu(ext_cur->ee_block);
                rest_blocks = seq_blocks;
-                /* Discard preallocations of two inodes */
+                /*
-                down_write(&EXT4_I(orig_inode)->i_data_sem);
+                 * Up semaphore to avoid following problems:
-                ext4_discard_preallocations(orig_inode);
+                 * a. transaction deadlock among ext4_journal_start,
-                up_write(&EXT4_I(orig_inode)->i_data_sem);
+                 *    ->write_begin via pagefault, and jbd2_journal_commit
+                 * b. racing with ->readpage, ->write_begin, and ext4_get_block
-                down_write(&EXT4_I(donor_inode)->i_data_sem);
+                 *    in move_extent_per_page
-                ext4_discard_preallocations(donor_inode);
+                 */
-                up_write(&EXT4_I(donor_inode)->i_data_sem);
+                double_up_write_data_sem(orig_inode, donor_inode);
                while (orig_page_offset <= seq_end_page) {
                        /* Swap original branches with new branches */
-                        ret1 = move_extent_per_page(o_filp, donor_inode,
+                        block_len_in_page = move_extent_per_page(
+                                                o_filp, donor_inode,
                                                orig_page_offset,
                                                data_offset_in_page,
-                                                block_len_in_page, uninit);
+                                                block_len_in_page, uninit,
-                        if (ret1 < 0)
+                                                &ret1);
-                                goto out;
-                        orig_page_offset++;
                        /* Count how many blocks we have exchanged */
                        *moved_len += block_len_in_page;
+                        if (ret1 < 0)
+                                break;
                        if (*moved_len > len) {
                                ext4_error(orig_inode->i_sb, __func__,
                                        "We replaced blocks too much! "
                                        "sum of replaced: %llu requested: %llu",
                                        *moved_len, len);
                                ret1 = -EIO;
-                                goto out;
+                                break;
                        }
+                        orig_page_offset++;
                        data_offset_in_page = 0;
                        rest_blocks -= block_len_in_page;
                        if (rest_blocks > blocks_per_page)
@@ -1393,6 +1368,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                                block_len_in_page = rest_blocks;
                }
+                double_down_write_data_sem(orig_inode, donor_inode);
+                if (ret1 < 0)
+                        break;
                /* Decrease buffer counter */
                if (holecheck_path)
                        ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
        }
 out:
+        if (*moved_len) {
+                ext4_discard_preallocations(orig_inode);
+                ext4_discard_preallocations(donor_inode);
+        }
        if (orig_path) {
                ext4_ext_drop_refs(orig_path);
                kfree(orig_path);
@@ -1422,7 +1406,7 @@ out:
                ext4_ext_drop_refs(holecheck_path);
                kfree(holecheck_path);
        }
+        double_up_write_data_sem(orig_inode, donor_inode);
        ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
        if (ret1)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d2c1b897fc7..17a17e10dd60 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1292,9 +1292,6 @@ errout:
 * add_dirent_to_buf will attempt search the directory block for
 * space.  It will return -ENOSPC if no space is available, and -EIO
 * and -EEXIST if directory entry already exists.
- *
- * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
- * all other cases bh is released.
 */
 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                             struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
                        if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
-                                                  bh, offset)) {
+                                                  bh, offset))
-                                brelse(bh);
                                return -EIO;
-                        }
+                        if (ext4_match(namelen, name, de))
-                        if (ext4_match(namelen, name, de)) {
-                                brelse(bh);
                                return -EEXIST;
-                        }
                        nlen = EXT4_DIR_REC_LEN(de->name_len);
                        rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
                        if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        err = ext4_journal_get_write_access(handle, bh);
        if (err) {
                ext4_std_error(dir->i_sb, err);
-                brelse(bh);
                return err;
        }
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        err = ext4_handle_dirty_metadata(handle, dir, bh);
        if (err)
                ext4_std_error(dir->i_sb, err);
-        brelse(bh);
        return 0;
 }
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        if (!(de))
                return retval;
-        return add_dirent_to_buf(handle, dentry, inode, de, bh);
+        retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+        brelse(bh);
+        return retval;
 }
 /*
@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                if(!bh)
                        return retval;
                retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-                if (retval != -ENOSPC)
+                if (retval != -ENOSPC) {
+                        brelse(bh);
                        return retval;
+                }
                if (blocks == 1 && !dx_fallback &&
                    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
        de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
-        return add_dirent_to_buf(handle, dentry, inode, de, bh);
+        retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+        brelse(bh);
+        return retval;
 }
 /*
@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                goto journal_error;
        err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
-        if (err != -ENOSPC) {
+        if (err != -ENOSPC)
-                bh = NULL;
                goto cleanup;
-        }
        /* Block full, should compress but for now just split */
        dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
        if (!de)
                goto cleanup;
        err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-        bh = NULL;
        goto cleanup;
 journal_error:
@@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -2259,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-                                        2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b5..3b2c5541d8a6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                        goto exit_bh;
                if (IS_ERR(gdb = bclean(handle, sb, block))) {
-                        err = PTR_ERR(bh);
+                        err = PTR_ERR(gdb);
                        goto exit_bh;
                }
                ext4_handle_dirty_metadata(handle, NULL, gdb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4ca92aab514..8b58a144c31b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb)
        if (sb->s_dirt)
                ext4_commit_super(sb, 1);
-        ext4_release_system_zone(sb);
-        ext4_mb_release(sb);
-        ext4_ext_release(sb);
-        ext4_xattr_put_super(sb);
        if (sbi->s_journal) {
                err = jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
@@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb)
                        ext4_abort(sb, __func__,
                                   "Couldn't clean up the journal");
        }
+        ext4_release_system_zone(sb);
+        ext4_mb_release(sb);
+        ext4_ext_release(sb);
+        ext4_xattr_put_super(sb);
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -704,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        spin_lock_init(&(ei->i_block_reservation_lock));
        INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
        ei->cur_aio_dio = NULL;
+        ei->i_sync_tid = 0;
+        ei->i_datasync_tid = 0;
        return &ei->vfs_inode;
 }
@@ -899,6 +903,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, NO_AUTO_DA_ALLOC))
                seq_puts(seq, ",noauto_da_alloc");
+        if (test_opt(sb, DISCARD))
+                seq_puts(seq, ",discard");
+        if (test_opt(sb, NOLOAD))
+                seq_puts(seq, ",norecovery");
        ext4_show_quota_options(seq, sb);
        return 0;
@@ -1079,7 +1089,8 @@ enum {
        Opt_usrquota, Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_block_validity, Opt_noblock_validity,
-        Opt_inode_readahead_blks, Opt_journal_ioprio
+        Opt_inode_readahead_blks, Opt_journal_ioprio,
+        Opt_discard, Opt_nodiscard,
 };
 static const match_table_t tokens = {
@@ -1104,6 +1115,7 @@ static const match_table_t tokens = {
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
        {Opt_noload, "noload"},
+        {Opt_noload, "norecovery"},
        {Opt_nobh, "nobh"},
        {Opt_bh, "bh"},
        {Opt_commit, "commit=%u"},
@@ -1144,6 +1156,8 @@ static const match_table_t tokens = {
        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc"},
        {Opt_noauto_da_alloc, "noauto_da_alloc"},
+        {Opt_discard, "discard"},
+        {Opt_nodiscard, "nodiscard"},
        {Opt_err, NULL},
 };
@@ -1565,6 +1579,12 @@ set_qf_format:
                        else
                                set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
                        break;
+                case Opt_discard:
+                        set_opt(sbi->s_mount_opt, DISCARD);
+                        break;
+                case Opt_nodiscard:
+                        clear_opt(sbi->s_mount_opt, DISCARD);
+                        break;
                default:
                        ext4_msg(sb, KERN_ERR,
                               "Unrecognized mount option \"%s\" "
@@ -1673,14 +1693,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
        size_t size;
        int i;
-        if (!sbi->s_es->s_log_groups_per_flex) {
+        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+        groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+        if (groups_per_flex < 2) {
                sbi->s_log_groups_per_flex = 0;
                return 1;
        }
-        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
-        groups_per_flex = 1 << sbi->s_log_groups_per_flex;
        /* We allocate both existing and potentially added groups */
        flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
                        ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -2721,26 +2741,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
            EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
                if (ext4_load_journal(sb, es, journal_devnum))
                        goto failed_mount3;
-                if (!(sb->s_flags & MS_RDONLY) &&
-                    EXT4_SB(sb)->s_journal->j_failed_commit) {
-                        ext4_msg(sb, KERN_CRIT, "error: "
-                               "ext4_fill_super: Journal transaction "
-                               "%u is corrupt",
-                               EXT4_SB(sb)->s_journal->j_failed_commit);
-                        if (test_opt(sb, ERRORS_RO)) {
-                                ext4_msg(sb, KERN_CRIT,
-                                       "Mounting filesystem read-only");
-                                sb->s_flags |= MS_RDONLY;
-                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                        }
-                        if (test_opt(sb, ERRORS_PANIC)) {
-                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                                ext4_commit_super(sb, 1);
-                                goto failed_mount4;
-                        }
-                }
        } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
              EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -3668,13 +3668,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
        buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
                       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
-        ext4_free_blocks_count_set(es, buf->f_bfree);
        buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
        if (buf->f_bfree < ext4_r_blocks_count(es))
                buf->f_bavail = 0;
        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
-        es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
        buf->f_namelen = EXT4_NAME_LEN;
        fsid = le64_to_cpup((void *)es->s_uuid) ^
               le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3966,6 +3964,58 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
+/*
+ * Let the ext4 driver also answer to the "ext2" filesystem name, but only
+ * when no native ext2 driver is configured (neither built in nor as a
+ * module) and CONFIG_EXT4_USE_FOR_EXT23 is enabled.  Otherwise the helpers
+ * below compile to empty stubs.
+ */
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "ext2",
+        .get_sb         = ext4_get_sb,
+        .kill_sb        = kill_block_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+/* Register the "ext2" alias; a failure is only logged, never propagated. */
+static inline void register_as_ext2(void)
+{
+        int err = register_filesystem(&ext2_fs_type);
+        if (err)
+                printk(KERN_WARNING
+                       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
+}
+/* Drop the "ext2" alias again. */
+static inline void unregister_as_ext2(void)
+{
+        unregister_filesystem(&ext2_fs_type);
+}
+#else
+static inline void register_as_ext2(void) { }
+static inline void unregister_as_ext2(void) { }
+#endif
+/*
+ * Let the ext4 driver also answer to the "ext3" filesystem name, but only
+ * when no native ext3 driver is configured (neither built in nor as a
+ * module) and CONFIG_EXT4_USE_FOR_EXT23 is enabled.  Otherwise the helpers
+ * below compile to empty stubs.
+ */
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "ext3",
+        .get_sb         = ext4_get_sb,
+        .kill_sb        = kill_block_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+/* Register the "ext3" alias; a failure is only logged, never propagated. */
+static inline void register_as_ext3(void)
+{
+        int err = register_filesystem(&ext3_fs_type);
+        if (err)
+                printk(KERN_WARNING
+                       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
+}
+/* Drop the "ext3" alias again. */
+static inline void unregister_as_ext3(void)
+{
+        unregister_filesystem(&ext3_fs_type);
+}
+#else
+static inline void register_as_ext3(void) { }
+static inline void unregister_as_ext3(void) { }
+#endif
 static struct file_system_type ext4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext4",
@@ -3995,11 +4045,15 @@ static int __init init_ext4_fs(void)
        err = init_inodecache();
        if (err)
                goto out1;
+        register_as_ext2();
+        register_as_ext3();
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;
        return 0;
 out:
+        unregister_as_ext2();
+        unregister_as_ext3();
        destroy_inodecache();
 out1:
        exit_ext4_xattr();
@@ -4015,6 +4069,8 @@ out4:
 static void __exit exit_ext4_fs(void)
 {
+        unregister_as_ext2();
+        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
        exit_ext4_xattr();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8d..910bf9a59cb3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                ea_bdebug(bh, "refcount now=0; freeing");
                if (ce)
                        mb_cache_entry_free(ce);
-                ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
                get_bh(bh);
-                ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+                ext4_free_blocks(handle, inode, bh, 0, 1,
+                                 EXT4_FREE_BLOCKS_METADATA |
+                                 EXT4_FREE_BLOCKS_FORGET);
        } else {
                le32_add_cpu(&BHDR(bh)->h_refcount, -1);
                error = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -832,7 +833,8 @@ inserted:
                        new_bh = sb_getblk(sb, block);
                        if (!new_bh) {
 getblk_failed:
-                                ext4_free_blocks(handle, inode, block, 1, 1);
+                                ext4_free_blocks(handle, inode, 0, block, 1,
+                                                 EXT4_FREE_BLOCKS_METADATA);
                                error = -EIO;
                                goto cleanup;
                        }
@@ -988,6 +990,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
        if (error)
                goto cleanup;
+        error = ext4_journal_get_write_access(handle, is.iloc.bh);
+        if (error)
+                goto cleanup;
        if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
                struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -1013,9 +1019,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                if (flags & XATTR_CREATE)
                        goto cleanup;
        }
-        error = ext4_journal_get_write_access(handle, is.iloc.bh);
-        if (error)
-                goto cleanup;
        if (!value) {
                if (!is.s.not_found)
                        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/file_table.c b/fs/file_table.c
index 8eb44042e009..4bef4c01ec6f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/eventpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
@@ -280,7 +279,6 @@ void __fput(struct file *file)
        if (file->f_op && file->f_op->release)
                file->f_op->release(inode, file);
        security_file_free(file);
-        ima_file_free(file);
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
                cdev_put(inode->i_cdev);
        fops_put(file->f_op);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9d5360c4c2af..49bc1b8e8f19 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                                struct writeback_control *wbc)
 {
        struct super_block *sb = wbc->sb, *pin_sb = NULL;
-        const int is_blkdev_sb = sb_is_blkdev_sb(sb);
        const unsigned long start = jiffies;    /* livelock avoidance */
        spin_lock(&inode_lock);
@@ -635,36 +634,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                        continue;
                }
-                if (!bdi_cap_writeback_dirty(wb->bdi)) {
-                        redirty_tail(inode);
-                        if (is_blkdev_sb) {
-                                /*
-                                 * Dirty memory-backed blockdev: the ramdisk
-                                 * driver does this.  Skip just this inode
-                                 */
-                                continue;
-                        }
-                        /*
-                         * Dirty memory-backed inode against a filesystem other
-                         * than the kernel-internal bdev filesystem.  Skip the
-                         * entire superblock.
-                         */
-                        break;
-                }
                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
-                if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
-                        wbc->encountered_congestion = 1;
-                        if (!is_blkdev_sb)
-                                break;          /* Skip a congested fs */
-                        requeue_io(inode);
-                        continue;               /* Skip a congested blockdev */
-                }
                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
@@ -756,6 +730,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                .sync_mode              = args->sync_mode,
                .older_than_this        = NULL,
                .for_kupdate            = args->for_kupdate,
+                .for_background         = args->for_background,
                .range_cyclic           = args->range_cyclic,
        };
        unsigned long oldest_jif;
@@ -787,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb,
                        break;
                wbc.more_io = 0;
-                wbc.encountered_congestion = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                wbc.pages_skipped = 0;
                writeback_inodes_wb(wb, &wbc);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5971359d2090..4dcddf83326f 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,6 +8,8 @@ config GFS2_FS
        select FS_POSIX_ACL
        select CRC32
        select SLOW_WORK
+        select QUOTA
+        select QUOTACTL
        help
          A cluster filesystem.
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3fc4e3ac7d84..3eb1ea846173 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
+#include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfs2_ondisk.h>
@@ -26,108 +27,44 @@
 #include "trans.h"
 #include "util.h"
-#define ACL_ACCESS 1
+static const char *gfs2_acl_name(int type)
-#define ACL_DEFAULT 0
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-                          struct gfs2_ea_request *er, int *remove, mode_t *mode)
 {
-        struct posix_acl *acl;
+        switch (type) {
-        int error;
+        case ACL_TYPE_ACCESS:
+                return GFS2_POSIX_ACL_ACCESS;
-        error = gfs2_acl_validate_remove(ip, access);
+        case ACL_TYPE_DEFAULT:
-        if (error)
+                return GFS2_POSIX_ACL_DEFAULT;
-                return error;
-        if (!er->er_data)
-                return -EINVAL;
-        acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
-        if (IS_ERR(acl))
-                return PTR_ERR(acl);
-        if (!acl) {
-                *remove = 1;
-                return 0;
-        }
-        error = posix_acl_valid(acl);
-        if (error)
-                goto out;
-        if (access) {
-                error = posix_acl_equiv_mode(acl, mode);
-                if (!error)
-                        *remove = 1;
-                else if (error > 0)
-                        error = 0;
        }
+        return NULL;
-out:
-        posix_acl_release(acl);
-        return error;
-}
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
-{
-        if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
-                return -EOPNOTSUPP;
-        if (!is_owner_or_cap(&ip->i_inode))
-                return -EPERM;
-        if (S_ISLNK(ip->i_inode.i_mode))
-                return -EOPNOTSUPP;
-        if (!access && !S_ISDIR(ip->i_inode.i_mode))
-                return -EACCES;
-        return 0;
 }
-static int acl_get(struct gfs2_inode *ip, const char *name,
+static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
-                   struct posix_acl **acl, struct gfs2_ea_location *el,
-                   char **datap, unsigned int *lenp)
 {
+        struct posix_acl *acl;
+        const char *name;
        char *data;
-        unsigned int len;
+        int len;
-        int error;
-        el->el_bh = NULL;
        if (!ip->i_eattr)
-                return 0;
+                return NULL;
-        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
-        if (error)
-                return error;
-        if (!el->el_ea)
-                return 0;
-        if (!GFS2_EA_DATA_LEN(el->el_ea))
-                goto out;
-        len = GFS2_EA_DATA_LEN(el->el_ea);
+        acl = get_cached_acl(&ip->i_inode, type);
-        data = kmalloc(len, GFP_NOFS);
+        if (acl != ACL_NOT_CACHED)
-        error = -ENOMEM;
+                return acl;
-        if (!data)
-                goto out;
-        error = gfs2_ea_get_copy(ip, el, data, len);
+        name = gfs2_acl_name(type);
-        if (error < 0)
+        if (name == NULL)
-                goto out_kfree;
+                return ERR_PTR(-EINVAL);
-        error = 0;
-        if (acl) {
+        len = gfs2_xattr_acl_get(ip, name, &data);
-                *acl = posix_acl_from_xattr(data, len);
+        if (len < 0)
-                if (IS_ERR(*acl))
+                return ERR_PTR(len);
-                        error = PTR_ERR(*acl);
+        if (len == 0)
-        }
+                return NULL;
-out_kfree:
+        acl = posix_acl_from_xattr(data, len);
-        if (error || !datap) {
+        kfree(data);
-                kfree(data);
+        return acl;
-        } else {
-                *datap = data;
-                *lenp = len;
-        }
-out:
-        return error;
 }
 /**
@@ -140,14 +77,12 @@ out:
 int gfs2_check_acl(struct inode *inode, int mask)
 {
-        struct gfs2_ea_location el;
+        struct posix_acl *acl;
-        struct posix_acl *acl = NULL;
        int error;
-        error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
+        acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
-        brelse(el.el_bh);
+        if (IS_ERR(acl))
-        if (error)
+                return PTR_ERR(acl);
-                return error;
        if (acl) {
                error = posix_acl_permission(inode, acl, mask);
@@ -158,57 +93,75 @@ int gfs2_check_acl(struct inode *inode, int mask)
        return -EAGAIN;
 }
-static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+static int gfs2_set_mode(struct inode *inode, mode_t mode)
 {
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        int error = 0;
-        struct buffer_head *dibh;
-        int error;
-        error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (mode != inode->i_mode) {
-        if (error)
+                struct iattr iattr;
-                return error;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
+                iattr.ia_valid = ATTR_MODE;
-        if (!error) {
+                iattr.ia_mode = mode;
-                gfs2_assert_withdraw(sdp,
-                                (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT));
+                error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
-                ip->i_inode.i_mode = mode;
-                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                gfs2_dinode_out(ip, dibh->b_data);
-                brelse(dibh);
        }
-        gfs2_trans_end(sdp);
+        return error;
+}
+static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
+{
+        int error;
+        int len;
+        char *data;
+        const char *name = gfs2_acl_name(type);
-        return 0;
+        BUG_ON(name == NULL);
+        len = posix_acl_to_xattr(acl, NULL, 0);
+        if (len == 0)
+                return 0;
+        data = kmalloc(len, GFP_NOFS);
+        if (data == NULL)
+                return -ENOMEM;
+        error = posix_acl_to_xattr(acl, data, len);
+        if (error < 0)
+                goto out;
+        error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0);
+        if (!error)
+                set_cached_acl(inode, type, acl);
+out:
+        kfree(data);
+        return error;
 }
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
 {
-        struct gfs2_ea_location el;
        struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-        struct posix_acl *acl = NULL, *clone;
+        struct posix_acl *acl, *clone;
-        mode_t mode = ip->i_inode.i_mode;
+        mode_t mode = inode->i_mode;
-        char *data = NULL;
+        int error = 0;
-        unsigned int len;
-        int error;
        if (!sdp->sd_args.ar_posix_acl)
                return 0;
-        if (S_ISLNK(ip->i_inode.i_mode))
+        if (S_ISLNK(inode->i_mode))
                return 0;
-        error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
+        acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
-        brelse(el.el_bh);
+        if (IS_ERR(acl))
-        if (error)
+                return PTR_ERR(acl);
-                return error;
        if (!acl) {
                mode &= ~current_umask();
-                if (mode != ip->i_inode.i_mode)
+                if (mode != inode->i_mode)
-                        error = munge_mode(ip, mode);
+                        error = gfs2_set_mode(inode, mode);
                return error;
        }
+        if (S_ISDIR(inode->i_mode)) {
+                error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
+                if (error)
+                        goto out;
+        }
        clone = posix_acl_clone(acl, GFP_NOFS);
        error = -ENOMEM;
        if (!clone)
@@ -216,43 +169,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
        posix_acl_release(acl);
        acl = clone;
-        if (S_ISDIR(ip->i_inode.i_mode)) {
-                error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
-                                       GFS2_POSIX_ACL_DEFAULT, data, len, 0);
-                if (error)
-                        goto out;
-        }
        error = posix_acl_create_masq(acl, &mode);
        if (error < 0)
                goto out;
        if (error == 0)
                goto munge;
-        posix_acl_to_xattr(acl, data, len);
+        error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
-        error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
-                               GFS2_POSIX_ACL_ACCESS, data, len, 0);
        if (error)
                goto out;
 munge:
-        error = munge_mode(ip, mode);
+        error = gfs2_set_mode(inode, mode);
 out:
        posix_acl_release(acl);
-        kfree(data);
        return error;
 }
 int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 {
-        struct posix_acl *acl = NULL, *clone;
+        struct posix_acl *acl, *clone;
-        struct gfs2_ea_location el;
        char *data;
        unsigned int len;
        int error;
-        error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
+        acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
-        if (error)
+        if (IS_ERR(acl))
-                goto out_brelse;
+                return PTR_ERR(acl);
        if (!acl)
                return gfs2_setattr_simple(ip, attr);
@@ -265,15 +207,134 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
        error = posix_acl_chmod_masq(acl, attr->ia_mode);
        if (!error) {
+                len = posix_acl_to_xattr(acl, NULL, 0);
+                data = kmalloc(len, GFP_NOFS);
+                error = -ENOMEM;
+                if (data == NULL)
+                        goto out;
                posix_acl_to_xattr(acl, data, len);
-                error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+                error = gfs2_xattr_acl_chmod(ip, attr, data);
+                kfree(data);
+                set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
        }
 out:
        posix_acl_release(acl);
-        kfree(data);
-out_brelse:
-        brelse(el.el_bh);
        return error;
 }
+static int gfs2_acl_type(const char *name)
+{
+        if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
+                return ACL_TYPE_ACCESS;
+        if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
+                return ACL_TYPE_DEFAULT;
+        return -EINVAL;
+}
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+                                 void *buffer, size_t size)
+{
+        struct posix_acl *acl;
+        int type;
+        int error;
+        type = gfs2_acl_type(name);
+        if (type < 0)
+                return type;
+        acl = gfs2_acl_get(GFS2_I(inode), type);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (acl == NULL)
+                return -ENODATA;
+        error = posix_acl_to_xattr(acl, buffer, size);
+        posix_acl_release(acl);
+        return error;
+}
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+                                 const void *value, size_t size, int flags)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct posix_acl *acl = NULL;
+        int error = 0, type;
+        if (!sdp->sd_args.ar_posix_acl)
+                return -EOPNOTSUPP;
+        type = gfs2_acl_type(name);
+        if (type < 0)
+                return type;
+        if (flags & XATTR_CREATE)
+                return -EINVAL;
+        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+                return value ? -EACCES : 0;
+        if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+                return -EPERM;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (!value)
+                goto set_acl;
+        acl = posix_acl_from_xattr(value, size);
+        if (!acl) {
+                /*
+                 * acl_set_file(3) may request that we set default ACLs with
+                 * zero length -- defend (gracefully) against that here.
+                 */
+                goto out;
+        }
+        if (IS_ERR(acl)) {
+                error = PTR_ERR(acl);
+                goto out;
+        }
+        error = posix_acl_valid(acl);
+        if (error)
+                goto out_release;
+        error = -EINVAL;
+        if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
+                goto out_release;
+        if (type == ACL_TYPE_ACCESS) {
+                mode_t mode = inode->i_mode;
+                error = posix_acl_equiv_mode(acl, &mode);
+                if (error <= 0) {
+                        posix_acl_release(acl);
+                        acl = NULL;
+                        if (error < 0)
+                                return error;
+                }
+                error = gfs2_set_mode(inode, mode);
+                if (error)
+                        goto out_release;
+        }
+set_acl:
+        error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0);
+        if (!error) {
+                if (acl)
+                        set_cached_acl(inode, type, acl);
+                else
+                        forget_cached_acl(inode, type);
+        }
+out_release:
+        posix_acl_release(acl);
+out:
+        return error;
+}
+struct xattr_handler gfs2_xattr_system_handler = {
+        .prefix = XATTR_SYSTEM_PREFIX,
+        .get    = gfs2_xattr_system_get,
+        .set    = gfs2_xattr_system_set,
+};
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb64..9306a2e6620c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
 #include "incore.h"
 #define GFS2_POSIX_ACL_ACCESS           "posix_acl_access"
-#define GFS2_POSIX_ACL_ACCESS_LEN       16
 #define GFS2_POSIX_ACL_DEFAULT          "posix_acl_default"
-#define GFS2_POSIX_ACL_DEFAULT_LEN      17
+#define GFS2_ACL_MAX_ENTRIES            25
-#define GFS2_ACL_IS_ACCESS(name, len) \
+extern int gfs2_check_acl(struct inode *inode, int mask);
-         ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
+extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
-         !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
+extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern struct xattr_handler gfs2_xattr_system_handler;
-#define GFS2_ACL_IS_DEFAULT(name, len) \
-         ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
-         !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
-struct gfs2_ea_request;
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-                          struct gfs2_ea_request *er,
-                          int *remove, mode_t *mode);
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
-int gfs2_check_acl(struct inode *inode, int mask);
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
-int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 694b5d48f036..7b8da9415267 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
        unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
        unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int i;
        int ret;
@@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
                if (ret || (--(wbc->nr_to_write) <= 0))
                        ret = 1;
-                if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                        wbc->encountered_congestion = 1;
-                        ret = 1;
-                }
        }
        gfs2_trans_end(sdp);
        return ret;
@@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 static int gfs2_write_cache_jdata(struct address_space *mapping,
                                  struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
@@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
        int scanned = 0;
        int range_whole = 0;
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                wbc->encountered_congestion = 1;
-                return 0;
-        }
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
@@ -819,8 +807,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
                mark_inode_dirty(inode);
        }
-        if (inode == sdp->sd_rindex)
+        if (inode == sdp->sd_rindex) {
                adjust_fs_space(inode);
+                ip->i_gh.gh_flags |= GL_NOCACHE;
+        }
        brelse(dibh);
        gfs2_trans_end(sdp);
@@ -889,8 +879,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
                mark_inode_dirty(inode);
        }
-        if (inode == sdp->sd_rindex)
+        if (inode == sdp->sd_rindex) {
                adjust_fs_space(inode);
+                ip->i_gh.gh_flags |= GL_NOCACHE;
+        }
        brelse(dibh);
        gfs2_trans_end(sdp);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5cebad..25fddc100f18 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -525,38 +525,6 @@ consist_inode:
        return ERR_PTR(-EIO);
 }
-/**
- * dirent_first - Return the first dirent
- * @dip: the directory
- * @bh: The buffer
- * @dent: Pointer to list of dirents
- *
- * return first dirent whether bh points to leaf or stuffed dinode
- *
- * Returns: IS_LEAF, IS_DINODE, or -errno
- */
-static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
-                        struct gfs2_dirent **dent)
-{
-        struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
-        if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
-                if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
-                        return -EIO;
-                *dent = (struct gfs2_dirent *)(bh->b_data +
-                                               sizeof(struct gfs2_leaf));
-                return IS_LEAF;
-        } else {
-                if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
-                        return -EIO;
-                *dent = (struct gfs2_dirent *)(bh->b_data +
-                                               sizeof(struct gfs2_dinode));
-                return IS_DINODE;
-        }
-}
 static int dirent_check_reclen(struct gfs2_inode *dip,
                               const struct gfs2_dirent *d, const void *end_p)
 {
@@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
        divider = (start + half_len) << (32 - dip->i_depth);
        /*  Copy the entries  */
-        dirent_first(dip, obh, &dent);
+        dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
        do {
                next = dent;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8b674b1f3a55..f455a03a09e2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -241,15 +241,14 @@ int gfs2_glock_put(struct gfs2_glock *gl)
        int rv = 0;
        write_lock(gl_lock_addr(gl->gl_hash));
-        if (atomic_dec_and_test(&gl->gl_ref)) {
+        if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
                hlist_del(&gl->gl_list);
-                write_unlock(gl_lock_addr(gl->gl_hash));
-                spin_lock(&lru_lock);
                if (!list_empty(&gl->gl_lru)) {
                        list_del_init(&gl->gl_lru);
                        atomic_dec(&lru_count);
                }
                spin_unlock(&lru_lock);
+                write_unlock(gl_lock_addr(gl->gl_hash));
                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
                glock_free(gl);
                rv = 1;
@@ -513,7 +512,6 @@ retry:
                        GLOCK_BUG_ON(gl, 1);
                }
                spin_unlock(&gl->gl_spin);
-                gfs2_glock_put(gl);
                return;
        }
@@ -524,8 +522,6 @@ retry:
                if (glops->go_xmote_bh) {
                        spin_unlock(&gl->gl_spin);
                        rv = glops->go_xmote_bh(gl, gh);
-                        if (rv == -EAGAIN)
-                                return;
                        spin_lock(&gl->gl_spin);
                        if (rv) {
                                do_error(gl, rv);
@@ -540,7 +536,6 @@ out:
        clear_bit(GLF_LOCK, &gl->gl_flags);
 out_locked:
        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
 }
 static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
@@ -600,7 +595,6 @@ __acquires(&gl->gl_spin)
        if (!(ret & LM_OUT_ASYNC)) {
                finish_xmote(gl, ret);
-                gfs2_glock_hold(gl);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
        } else {
@@ -672,12 +666,17 @@ out:
        return;
 out_sched:
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        smp_mb__after_clear_bit();
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put_nolock(gl);
+        return;
 out_unlock:
        clear_bit(GLF_LOCK, &gl->gl_flags);
-        goto out;
+        smp_mb__after_clear_bit();
+        return;
 }
 static void delete_work_func(struct work_struct *work)
@@ -707,9 +706,12 @@ static void glock_work_func(struct work_struct *work)
 {
        unsigned long delay = 0;
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+        int drop_ref = 0;
-        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+        if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
                finish_xmote(gl, gl->gl_reply);
+                drop_ref = 1;
+        }
        down_read(&gfs2_umount_flush_sem);
        spin_lock(&gl->gl_spin);
        if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -727,6 +729,8 @@ static void glock_work_func(struct work_struct *work)
        if (!delay ||
            queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
+        if (drop_ref)
+                gfs2_glock_put(gl);
 }
 /**
@@ -1361,10 +1365,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
                list_del_init(&gl->gl_lru);
                atomic_dec(&lru_count);
-                /* Check if glock is about to be freed */
-                if (atomic_read(&gl->gl_ref) == 0)
-                        continue;
                /* Test for being demotable */
                if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
                        gfs2_glock_hold(gl);
@@ -1375,10 +1375,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
                                handle_callback(gl, LM_ST_UNLOCKED, 0);
                                nr--;
                        }
+                        clear_bit(GLF_LOCK, &gl->gl_flags);
+                        smp_mb__after_clear_bit();
                        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                                gfs2_glock_put_nolock(gl);
                        spin_unlock(&gl->gl_spin);
-                        clear_bit(GLF_LOCK, &gl->gl_flags);
                        spin_lock(&lru_lock);
                        continue;
                }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c609894ec0d0..13f0bd228132 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,15 +180,6 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
        return gl->gl_state == LM_ST_SHARED;
 }
-static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
-{
-        int ret;
-        spin_lock(&gl->gl_spin);
-        ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
-        spin_unlock(&gl->gl_spin);
-        return ret;
-}
 int gfs2_glock_get(struct gfs2_sbd *sdp,
                   u64 number, const struct gfs2_glock_operations *glops,
                   int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 6985eef06c39..78554acc0605 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
+#include <linux/posix_acl.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -184,8 +185,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
        if (flags & DIO_METADATA) {
                struct address_space *mapping = gl->gl_aspace->i_mapping;
                truncate_inode_pages(mapping, 0);
-                if (ip)
+                if (ip) {
                        set_bit(GIF_INVALID, &ip->i_flags);
+                        forget_all_cached_acls(&ip->i_inode);
+                }
        }
        if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6edb423f90b3..4792200978c8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -429,7 +429,11 @@ struct gfs2_args {
        unsigned int ar_meta:1;                 /* mount metafs */
        unsigned int ar_discard:1;              /* discard requests */
        unsigned int ar_errors:2;               /* errors=withdraw | panic */
+        unsigned int ar_nobarrier:1;            /* do not send barriers */
        int ar_commit;                          /* Commit interval */
+        int ar_statfs_quantum;                  /* The fast statfs interval */
+        int ar_quota_quantum;                   /* The quota interval */
+        int ar_statfs_percent;                  /* The % change to force sync */
 };
 struct gfs2_tune {
@@ -558,6 +562,7 @@ struct gfs2_sbd {
        spinlock_t sd_statfs_spin;
        struct gfs2_statfs_change_host sd_statfs_master;
        struct gfs2_statfs_change_host sd_statfs_local;
+        int sd_statfs_force_sync;
        /* Resource group stuff */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fb15d3b1f409..26ba2a4c4a2d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -871,7 +871,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
        if (error)
                goto fail_gunlock2;
-        error = gfs2_acl_create(dip, GFS2_I(inode));
+        error = gfs2_acl_create(dip, inode);
        if (error)
                goto fail_gunlock2;
@@ -947,9 +947,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
        str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
        str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
-        str->di_header.__pad0 = 0;
        str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
-        str->di_header.__pad1 = 0;
        str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
        str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
        str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f67..4511b08fc451 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
        memset(lh, 0, sizeof(struct gfs2_log_header));
        lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
        lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+        lh->lh_header.__pad0 = cpu_to_be64(0);
        lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+        lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
        lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
        lh->lh_flags = cpu_to_be32(flags);
        lh->lh_tail = cpu_to_be32(tail);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5b..de97632ba32f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
 static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
        struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+        struct gfs2_meta_header *mh;
        struct gfs2_trans *tr;
        lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
        set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
        gfs2_meta_check(sdp, bd->bd_bh);
        gfs2_pin(sdp, bd->bd_bh);
+        mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+        mh->__pad0 = cpu_to_be64(0);
+        mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
        sdp->sd_log_num_buf++;
        list_add(&le->le_list, &sdp->sd_log_le_buf);
        tr->tr_num_buf_new++;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fb6c048981..edfee24f3636 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/slow-work.h>
+#include <linux/quotaops.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -62,13 +63,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
        gt->gt_quota_warn_period = 10;
        gt->gt_quota_scale_num = 1;
        gt->gt_quota_scale_den = 1;
-        gt->gt_quota_quantum = 60;
        gt->gt_new_files_jdata = 0;
        gt->gt_max_readahead = 1 << 18;
        gt->gt_stall_secs = 600;
        gt->gt_complain_secs = 10;
-        gt->gt_statfs_quantum = 30;
-        gt->gt_statfs_slow = 0;
 }
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
@@ -1114,7 +1112,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp)
 * Returns: errno
 */
-static int fill_super(struct super_block *sb, void *data, int silent)
+static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
 {
        struct gfs2_sbd *sdp;
        struct gfs2_holder mount_gh;
@@ -1125,17 +1123,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
                printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
                return -ENOMEM;
        }
+        sdp->sd_args = *args;
-        sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
-        sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
-        sdp->sd_args.ar_commit = 60;
-        sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
-        error = gfs2_mount_args(sdp, &sdp->sd_args, data);
-        if (error) {
-                printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
-                goto fail;
-        }
        if (sdp->sd_args.ar_spectator) {
                sb->s_flags |= MS_RDONLY;
@@ -1143,11 +1131,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
        }
        if (sdp->sd_args.ar_posix_acl)
                sb->s_flags |= MS_POSIXACL;
+        if (sdp->sd_args.ar_nobarrier)
+                set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
        sb->s_magic = GFS2_MAGIC;
        sb->s_op = &gfs2_super_ops;
        sb->s_export_op = &gfs2_export_ops;
        sb->s_xattr = gfs2_xattr_handlers;
+        sb->s_qcop = &gfs2_quotactl_ops;
+        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
        sb->s_time_gran = 1;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1160,6 +1152,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
        sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
        sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+        sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
+        if (sdp->sd_args.ar_statfs_quantum) {
+                sdp->sd_tune.gt_statfs_slow = 0;
+                sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
+        }
+        else {
+                sdp->sd_tune.gt_statfs_slow = 1;
+                sdp->sd_tune.gt_statfs_quantum = 30;
+        }
        error = init_names(sdp, silent);
        if (error)
@@ -1243,18 +1244,127 @@ fail:
        return error;
 }
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+static int set_gfs2_super(struct super_block *s, void *data)
-                       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+        s->s_bdev = data;
+        s->s_dev = s->s_bdev->bd_dev;
+        /*
+         * We set the bdi here to the queue backing, file systems can
+         * overwrite this in ->fill_super()
+         */
+        s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+        return 0;
 }
-static int test_meta_super(struct super_block *s, void *ptr)
+static int test_gfs2_super(struct super_block *s, void *ptr)
 {
        struct block_device *bdev = ptr;
        return (bdev == s->s_bdev);
 }
+/**
+ * gfs2_get_sb - Get the GFS2 superblock
+ * @fs_type: The GFS2 filesystem type
+ * @flags: Mount flags
+ * @dev_name: The name of the device
+ * @data: The mount arguments
+ * @mnt: The vfsmnt for this mount
+ *
+ * Q. Why not use get_sb_bdev() ?
+ * A. We need to select one of two root directories to mount, independent
+ *    of whether this is the initial, or subsequent, mount of this sb
+ *
+ * Returns: 0 or -ve on error
+ */
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+                       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+        struct block_device *bdev;
+        struct super_block *s;
+        fmode_t mode = FMODE_READ;
+        int error;
+        struct gfs2_args args;
+        struct gfs2_sbd *sdp;
+        if (!(flags & MS_RDONLY))
+                mode |= FMODE_WRITE;
+        bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+        if (IS_ERR(bdev))
+                return PTR_ERR(bdev);
+        /*
+         * once the super is inserted into the list by sget, s_umount
+         * will protect the lockfs code from trying to start a snapshot
+         * while we are mounting
+         */
+        mutex_lock(&bdev->bd_fsfreeze_mutex);
+        if (bdev->bd_fsfreeze_count > 0) {
+                mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                error = -EBUSY;
+                goto error_bdev;
+        }
+        s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev);
+        mutex_unlock(&bdev->bd_fsfreeze_mutex);
+        error = PTR_ERR(s);
+        if (IS_ERR(s))
+                goto error_bdev;
+        memset(&args, 0, sizeof(args));
+        args.ar_quota = GFS2_QUOTA_DEFAULT;
+        args.ar_data = GFS2_DATA_DEFAULT;
+        args.ar_commit = 60;
+        args.ar_statfs_quantum = 30;
+        args.ar_quota_quantum = 60;
+        args.ar_errors = GFS2_ERRORS_DEFAULT;
+        error = gfs2_mount_args(&args, data);
+        if (error) {
+                printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+                if (s->s_root)
+                        goto error_super;
+                deactivate_locked_super(s);
+                return error;
+        }
+        if (s->s_root) {
+                error = -EBUSY;
+                if ((flags ^ s->s_flags) & MS_RDONLY)
+                        goto error_super;
+                close_bdev_exclusive(bdev, mode);
+        } else {
+                char b[BDEVNAME_SIZE];
+                s->s_flags = flags;
+                s->s_mode = mode;
+                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+                sb_set_blocksize(s, block_size(bdev));
+                error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+                if (error) {
+                        deactivate_locked_super(s);
+                        return error;
+                }
+                s->s_flags |= MS_ACTIVE;
+                bdev->bd_super = s;
+        }
+        sdp = s->s_fs_info;
+        mnt->mnt_sb = s;
+        if (args.ar_meta)
+                mnt->mnt_root = dget(sdp->sd_master_dir);
+        else
+                mnt->mnt_root = dget(sdp->sd_root_dir);
+        return 0;
+error_super:
+        deactivate_locked_super(s);
+error_bdev:
+        close_bdev_exclusive(bdev, mode);
+        return error;
+}
 static int set_meta_super(struct super_block *s, void *ptr)
 {
        return -EINVAL;
@@ -1274,13 +1384,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
                       dev_name, error);
                return error;
        }
-        s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
+        s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
                 path.dentry->d_inode->i_sb->s_bdev);
        path_put(&path);
        if (IS_ERR(s)) {
                printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
                return PTR_ERR(s);
        }
+        if ((flags ^ s->s_flags) & MS_RDONLY) {
+                deactivate_locked_super(s);
+                return -EBUSY;
+        }
        sdp = s->s_fs_info;
        mnt->mnt_sb = s;
        mnt->mnt_root = dget(sdp->sd_master_dir);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e9b9326bfc9..e3bf6eab8750 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -15,7 +15,7 @@
 * fuzziness in the current usage value of IDs that are being used on different
 * nodes in the cluster simultaneously.  So, it is possible for a user on
 * multiple nodes to overrun their quota, but that overrun is controlable.
- * Since quota tags are part of transactions, there is no need to a quota check
+ * Since quota tags are part of transactions, there is no need for a quota check
 * program to be run on node crashes or anything like that.
 *
 * There are couple of knobs that let the administrator manage the quota
@@ -47,6 +47,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/quota.h>
+#include <linux/dqblk_xfs.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -65,13 +67,6 @@
 #define QUOTA_USER 1
 #define QUOTA_GROUP 0
-struct gfs2_quota_host {
-        u64 qu_limit;
-        u64 qu_warn;
-        s64 qu_value;
-        u32 qu_ll_next;
-};
 struct gfs2_quota_change_host {
        u64 qc_change;
        u32 qc_flags; /* GFS2_QCF_... */
@@ -164,7 +159,7 @@ fail:
        return error;
 }
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
                  struct gfs2_quota_data **qdp)
 {
        struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -202,7 +197,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
                spin_unlock(&qd_lru_lock);
-                if (qd || !create) {
+                if (qd) {
                        if (new_qd) {
                                gfs2_glock_put(new_qd->qd_gl);
                                kmem_cache_free(gfs2_quotad_cachep, new_qd);
@@ -461,12 +456,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
        qd_put(qd);
 }
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
                    struct gfs2_quota_data **qdp)
 {
        int error;
-        error = qd_get(sdp, user, id, create, qdp);
+        error = qd_get(sdp, user, id, qdp);
        if (error)
                return error;
@@ -508,20 +503,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
                return 0;
-        error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd);
+        error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
        if (error)
                goto out;
        al->al_qd_num++;
        qd++;
-        error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd);
+        error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
        if (error)
                goto out;
        al->al_qd_num++;
        qd++;
        if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
-                error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+                error = qdsb_get(sdp, QUOTA_USER, uid, qd);
                if (error)
                        goto out;
                al->al_qd_num++;
@@ -529,7 +524,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
        }
        if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
-                error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+                error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
                if (error)
                        goto out;
                al->al_qd_num++;
@@ -617,48 +612,36 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
        mutex_unlock(&sdp->sd_quota_mutex);
 }
-static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
-{
-        const struct gfs2_quota *str = buf;
-        qu->qu_limit = be64_to_cpu(str->qu_limit);
-        qu->qu_warn = be64_to_cpu(str->qu_warn);
-        qu->qu_value = be64_to_cpu(str->qu_value);
-        qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
-}
-static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
-{
-        struct gfs2_quota *str = buf;
-        str->qu_limit = cpu_to_be64(qu->qu_limit);
-        str->qu_warn = cpu_to_be64(qu->qu_warn);
-        str->qu_value = cpu_to_be64(qu->qu_value);
-        str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
-        memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
-}
 /**
- * gfs2_adjust_quota
+ * gfs2_adjust_quota - adjust record of current block usage
+ * @ip: The quota inode
+ * @loc: Offset of the entry in the quota file
+ * @change: The amount of usage change to record
+ * @qd: The quota data
+ * @fdq: The updated limits to record
 *
 * This function was mostly borrowed from gfs2_block_truncate_page which was
 * in turn mostly borrowed from ext3
+ *
+ * Returns: 0 or -ve on error
 */
 static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
-                             s64 change, struct gfs2_quota_data *qd)
+                             s64 change, struct gfs2_quota_data *qd,
+                             struct fs_disk_quota *fdq)
 {
        struct inode *inode = &ip->i_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index = loc >> PAGE_CACHE_SHIFT;
        unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
        unsigned blocksize, iblock, pos;
-        struct buffer_head *bh;
+        struct buffer_head *bh, *dibh;
        struct page *page;
        void *kaddr;
-        char *ptr;
+        struct gfs2_quota *qp;
-        struct gfs2_quota_host qp;
        s64 value;
        int err = -EIO;
+        u64 size;
        if (gfs2_is_stuffed(ip))
                gfs2_unstuff_dinode(ip, NULL);
@@ -700,18 +683,38 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        gfs2_trans_add_bh(ip->i_gl, bh, 0);
        kaddr = kmap_atomic(page, KM_USER0);
-        ptr = kaddr + offset;
+        qp = kaddr + offset;
-        gfs2_quota_in(&qp, ptr);
+        value = (s64)be64_to_cpu(qp->qu_value) + change;
-        qp.qu_value += change;
+        qp->qu_value = cpu_to_be64(value);
-        value = qp.qu_value;
+        qd->qd_qb.qb_value = qp->qu_value;
-        gfs2_quota_out(&qp, ptr);
+        if (fdq) {
+                if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+                        qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+                        qd->qd_qb.qb_warn = qp->qu_warn;
+                }
+                if (fdq->d_fieldmask & FS_DQ_BHARD) {
+                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+                        qd->qd_qb.qb_limit = qp->qu_limit;
+                }
+        }
        flush_dcache_page(page);
        kunmap_atomic(kaddr, KM_USER0);
-        err = 0;
-        qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
+        err = gfs2_meta_inode_buffer(ip, &dibh);
-        qd->qd_qb.qb_value = cpu_to_be64(value);
+        if (err)
-        ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC);
+                goto unlock;
-        ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value);
+        size = loc + sizeof(struct gfs2_quota);
+        if (size > inode->i_size) {
+                ip->i_disksize = size;
+                i_size_write(inode, size);
+        }
+        inode->i_mtime = inode->i_atime = CURRENT_TIME;
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        gfs2_dinode_out(ip, dibh->b_data);
+        brelse(dibh);
+        mark_inode_dirty(inode);
 unlock:
        unlock_page(page);
        page_cache_release(page);
@@ -739,9 +742,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                return -ENOMEM;
        sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+        mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
        for (qx = 0; qx < num_qd; qx++) {
-                error = gfs2_glock_nq_init(qda[qx]->qd_gl,
+                error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
-                                           LM_ST_EXCLUSIVE,
                                           GL_NOCACHE, &ghs[qx]);
                if (error)
                        goto out;
@@ -795,9 +798,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
        for (x = 0; x < num_qd; x++) {
                qd = qda[x];
                offset = qd2offset(qd);
-                error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
+                error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
-                                          (struct gfs2_quota_data *)
-                                          qd);
                if (error)
                        goto out_end_trans;
@@ -817,21 +818,44 @@ out_gunlock:
 out:
        while (qx--)
                gfs2_glock_dq_uninit(&ghs[qx]);
+        mutex_unlock(&ip->i_inode.i_mutex);
        kfree(ghs);
        gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
        return error;
 }
+/*
+ * update_qd - refresh a quota entry's LVB and in-core copy from the quota file
+ * @sdp: the filesystem; its sd_quota_inode is the file that is read
+ * @qd: the quota data whose glock LVB and qd_qb are overwritten
+ *
+ * The record at qd2offset(qd) is read into a zeroed buffer.  Its limit, warn
+ * and value fields are copied verbatim (no byte swapping) into the LVB, which
+ * is also stamped with GFS2_MAGIC, and the LVB is then mirrored into
+ * qd->qd_qb.
+ *
+ * Returns: 0 on success, or the negative value from gfs2_internal_read()
+ */
+static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+{
+        struct gfs2_inode *quota_ip = GFS2_I(sdp->sd_quota_inode);
+        struct gfs2_quota_lvb *lvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+        loff_t off = qd2offset(qd);
+        struct gfs2_quota rec;
+        int ret;
+        /* Zero first so that anything the read does not fill stays 0 */
+        memset(&rec, 0, sizeof(rec));
+        ret = gfs2_internal_read(quota_ip, NULL, (char *)&rec, &off,
+                                 sizeof(rec));
+        if (ret < 0)
+                return ret;
+        lvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+        lvb->__pad = 0;
+        lvb->qb_limit = rec.qu_limit;
+        lvb->qb_warn = rec.qu_warn;
+        lvb->qb_value = rec.qu_value;
+        /* Keep the in-core copy identical to what was just put in the LVB */
+        qd->qd_qb = *lvb;
+        return 0;
+}
 static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
                    struct gfs2_holder *q_gh)
 {
        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
        struct gfs2_holder i_gh;
-        struct gfs2_quota_host q;
-        char buf[sizeof(struct gfs2_quota)];
        int error;
-        struct gfs2_quota_lvb *qlvb;
 restart:
        error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
@@ -841,11 +865,9 @@ restart:
        qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
        if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
-                loff_t pos;
                gfs2_glock_dq_uninit(q_gh);
-                error = gfs2_glock_nq_init(qd->qd_gl,
+                error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
-                                           LM_ST_EXCLUSIVE, GL_NOCACHE,
+                                           GL_NOCACHE, q_gh);
-                                           q_gh);
                if (error)
                        return error;
@@ -853,29 +875,14 @@ restart:
                if (error)
                        goto fail;
-                memset(buf, 0, sizeof(struct gfs2_quota));
+                error = update_qd(sdp, qd);
-                pos = qd2offset(qd);
+                if (error)
-                error = gfs2_internal_read(ip, NULL, buf, &pos,
-                                           sizeof(struct gfs2_quota));
-                if (error < 0)
                        goto fail_gunlock;
                gfs2_glock_dq_uninit(&i_gh);
+                gfs2_glock_dq_uninit(q_gh);
-                gfs2_quota_in(&q, buf);
+                force_refresh = 0;
-                qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+                goto restart;
-                qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
-                qlvb->__pad = 0;
-                qlvb->qb_limit = cpu_to_be64(q.qu_limit);
-                qlvb->qb_warn = cpu_to_be64(q.qu_warn);
-                qlvb->qb_value = cpu_to_be64(q.qu_value);
-                qd->qd_qb = *qlvb;
-                if (gfs2_glock_is_blocking(qd->qd_gl)) {
-                        gfs2_glock_dq_uninit(q_gh);
-                        force_refresh = 0;
-                        goto restart;
-                }
        }
        return 0;
@@ -995,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
 {
        struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-        printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+        printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
               sdp->sd_fsname, type,
               (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
               qd->qd_id);
@@ -1032,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
                if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
                        print_message(qd, "exceeded");
+                        quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+                                           USRQUOTA : GRPQUOTA, qd->qd_id,
+                                           sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
                        error = -EDQUOT;
                        break;
                } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
@@ -1039,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
                           time_after_eq(jiffies, qd->qd_last_warn +
                                         gfs2_tune_get(sdp,
                                                gt_quota_warn_period) * HZ)) {
+                        quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+                                           USRQUOTA : GRPQUOTA, qd->qd_id,
+                                           sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
                        error = print_message(qd, "warning");
                        qd->qd_last_warn = jiffies;
                }
@@ -1069,8 +1083,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
        }
 }
-int gfs2_quota_sync(struct gfs2_sbd *sdp)
+int gfs2_quota_sync(struct super_block *sb, int type)
 {
+        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_quota_data **qda;
        unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
        unsigned int num_qd;
@@ -1118,7 +1133,7 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
        struct gfs2_holder q_gh;
        int error;
-        error = qd_get(sdp, user, id, CREATE, &qd);
+        error = qd_get(sdp, user, id, &qd);
        if (error)
                return error;
@@ -1127,7 +1142,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
                gfs2_glock_dq_uninit(&q_gh);
        qd_put(qd);
        return error;
 }
@@ -1298,12 +1312,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
 }
 static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
-                               int (*fxn)(struct gfs2_sbd *sdp),
+                               int (*fxn)(struct super_block *sb, int type),
                               unsigned long t, unsigned long *timeo,
                               unsigned int *new_timeo)
 {
        if (t >= *timeo) {
-                int error = fxn(sdp);
+                int error = fxn(sdp->sd_vfs, 0);
                quotad_error(sdp, msg, error);
                *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
        } else {
@@ -1330,6 +1344,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
        }
 }
+/*
+ * gfs2_wake_up_statfs - request an immediate statfs sync
+ * @sdp: the filesystem
+ *
+ * Sets sd_statfs_force_sync and wakes the waiters on sd_quota_wait.  Does
+ * nothing when the flag is already set.
+ */
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp)
+{
+        if (sdp->sd_statfs_force_sync)
+                return;
+        sdp->sd_statfs_force_sync = 1;
+        wake_up(&sdp->sd_quota_wait);
+}
 /**
 * gfs2_quotad - Write cached quota changes into the quota file
 * @sdp: Pointer to GFS2 superblock
@@ -1349,8 +1371,15 @@ int gfs2_quotad(void *data)
        while (!kthread_should_stop()) {
                /* Update the master statfs file */
-                quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+                if (sdp->sd_statfs_force_sync) {
-                                   &statfs_timeo, &tune->gt_statfs_quantum);
+                        int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+                        quotad_error(sdp, "statfs", error);
+                        statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+                }
+                else
+                        quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+                                           &statfs_timeo,
+                                           &tune->gt_statfs_quantum);
                /* Update quota file */
                quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
@@ -1367,7 +1396,7 @@ int gfs2_quotad(void *data)
                spin_lock(&sdp->sd_trunc_lock);
                empty = list_empty(&sdp->sd_trunc_list);
                spin_unlock(&sdp->sd_trunc_lock);
-                if (empty)
+                if (empty && !sdp->sd_statfs_force_sync)
                        t -= schedule_timeout(t);
                else
                        t = 0;
@@ -1377,3 +1406,181 @@ int gfs2_quotad(void *data)
        return 0;
 }
+/*
+ * gfs2_quota_get_xstate - report quota state in XFS fs_quota_stat form
+ * @sb: the superblock
+ * @fqs: the structure to fill in; it is zeroed first
+ *
+ * The enforcement/accounting flags are derived from the "quota" mount
+ * argument.  The user and group sections are identical because both are
+ * filled from the single quota inode.
+ *
+ * Returns: 0 (always)
+ */
+static int gfs2_quota_get_xstate(struct super_block *sb,
+                                 struct fs_quota_stat *fqs)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct inode *qinode = sdp->sd_quota_inode;
+        memset(fqs, 0, sizeof(*fqs));
+        fqs->qs_version = FS_QSTAT_VERSION;
+        switch (sdp->sd_args.ar_quota) {
+        case GFS2_QUOTA_ON:
+                fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+                break;
+        case GFS2_QUOTA_ACCOUNT:
+                fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+                break;
+        default:
+                /* any other setting: qs_flags stays 0 from the memset */
+                break;
+        }
+        if (qinode) {
+                fqs->qs_uquota.qfs_ino = GFS2_I(qinode)->i_no_addr;
+                fqs->qs_uquota.qfs_nblks = qinode->i_blocks;
+        }
+        fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
+        fqs->qs_gquota = fqs->qs_uquota; /* it's the same inode in both cases */
+        fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+        return 0;
+}
+/*
+ * gfs2_xquota_get - read one user's or group's quota in XFS fs_disk_quota form
+ * @sb: the superblock
+ * @type: USRQUOTA or GRPQUOTA
+ * @id: the user or group id
+ * @fdq: the structure to fill in; it is zeroed first
+ *
+ * Only the block hard limit, block soft limit and block count are reported;
+ * they are taken from the quota glock's LVB after a forced refresh.
+ *
+ * Returns: 0 on success, -ESRCH if quotas are off, -EINVAL for an unknown
+ *          @type, or the error from qd_get()/do_glock()
+ */
+static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
+                           struct fs_disk_quota *fdq)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_quota_data *qd;
+        struct gfs2_quota_lvb *lvb;
+        struct gfs2_holder q_gh;
+        int user;
+        int error;
+        memset(fdq, 0, sizeof(*fdq));
+        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+                return -ESRCH; /* Crazy XFS error code */
+        /* Translate the VFS quota type into the GFS2 user/group selector */
+        switch (type) {
+        case USRQUOTA:
+                user = QUOTA_USER;
+                break;
+        case GRPQUOTA:
+                user = QUOTA_GROUP;
+                break;
+        default:
+                return -EINVAL;
+        }
+        error = qd_get(sdp, user, id, &qd);
+        if (error)
+                return error;
+        error = do_glock(qd, FORCE, &q_gh);
+        if (!error) {
+                lvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+                fdq->d_version = FS_DQUOT_VERSION;
+                fdq->d_flags = (user == QUOTA_USER) ?
+                               XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+                fdq->d_id = id;
+                fdq->d_blk_hardlimit = be64_to_cpu(lvb->qb_limit);
+                fdq->d_blk_softlimit = be64_to_cpu(lvb->qb_warn);
+                fdq->d_bcount = be64_to_cpu(lvb->qb_value);
+                gfs2_glock_dq_uninit(&q_gh);
+        }
+        /* The reference from qd_get() is dropped on success and failure */
+        qd_put(qd);
+        return error;
+}
+/* GFS2 only supports a subset of the XFS fields */
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+/**
+ * gfs2_xquota_set - set the block limits for a user or group
+ * @sb: The superblock
+ * @type: USRQUOTA or GRPQUOTA
+ * @id: The user or group id; must equal @fdq->d_id
+ * @fdq: The requested limits.  Only FS_DQ_BSOFT and FS_DQ_BHARD are accepted
+ *       in d_fieldmask, and d_flags must match @type.  Bits for limits that
+ *       already have the requested value are cleared from d_fieldmask.
+ *
+ * Takes the quota inode's i_mutex, then the quota glock and the quota inode
+ * glock exclusively, refreshes the entry from disk, and writes the changed
+ * limits inside a transaction, reserving blocks first if the write needs
+ * new allocation.
+ *
+ * Returns: 0 on success (including "nothing to change"), -ESRCH if quotas
+ *          are off, -EINVAL for bad arguments, -ENOMEM if the allocation
+ *          context cannot be obtained, or another -ve error
+ */
+static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
+                           struct fs_disk_quota *fdq)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+        struct gfs2_quota_data *qd;
+        struct gfs2_holder q_gh, i_gh;
+        unsigned int data_blocks, ind_blocks;
+        unsigned int blocks = 0;
+        int alloc_required;
+        struct gfs2_alloc *al;
+        loff_t offset;
+        int error;
+        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+                return -ESRCH; /* Crazy XFS error code */
+        switch (type) {
+        case USRQUOTA:
+                type = QUOTA_USER;
+                if (fdq->d_flags != XFS_USER_QUOTA)
+                        return -EINVAL;
+                break;
+        case GRPQUOTA:
+                type = QUOTA_GROUP;
+                if (fdq->d_flags != XFS_GROUP_QUOTA)
+                        return -EINVAL;
+                break;
+        default:
+                return -EINVAL;
+        }
+        if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
+                return -EINVAL;
+        if (fdq->d_id != id)
+                return -EINVAL;
+        error = qd_get(sdp, type, id, &qd);
+        if (error)
+                return error;
+        mutex_lock(&ip->i_inode.i_mutex);
+        error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
+        if (error)
+                goto out_put;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+        if (error)
+                goto out_q;
+        /* Check for existing entry, if none then alloc new blocks */
+        error = update_qd(sdp, qd);
+        if (error)
+                goto out_i;
+        /* If nothing has changed, this is a no-op */
+        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
+            (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+                fdq->d_fieldmask ^= FS_DQ_BSOFT;
+        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
+            (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+                fdq->d_fieldmask ^= FS_DQ_BHARD;
+        if (fdq->d_fieldmask == 0)
+                goto out_i;
+        offset = qd2offset(qd);
+        error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
+                                          &alloc_required);
+        if (error)
+                goto out_i;
+        if (alloc_required) {
+                al = gfs2_alloc_get(ip);
+                if (al == NULL) {
+                        /*
+                         * error is still 0 here; without this the caller
+                         * would be told the limits were set when they
+                         * were not.
+                         */
+                        error = -ENOMEM;
+                        goto out_i;
+                }
+                gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+                                       &data_blocks, &ind_blocks);
+                blocks = al->al_requested = 1 + data_blocks + ind_blocks;
+                error = gfs2_inplace_reserve(ip);
+                if (error)
+                        goto out_alloc;
+        }
+        error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
+        if (error)
+                goto out_release;
+        /* Apply changes */
+        error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+        gfs2_trans_end(sdp);
+out_release:
+        if (alloc_required) {
+                gfs2_inplace_release(ip);
+out_alloc:
+                gfs2_alloc_put(ip);
+        }
+out_i:
+        gfs2_glock_dq_uninit(&i_gh);
+out_q:
+        gfs2_glock_dq_uninit(&q_gh);
+out_put:
+        mutex_unlock(&ip->i_inode.i_mutex);
+        qd_put(qd);
+        return error;
+}
+/*
+ * Quota control operations for GFS2: sync plus the XFS-style state, get and
+ * set handlers defined in this file.
+ */
+const struct quotactl_ops gfs2_quotactl_ops = {
+        .quota_sync     = gfs2_quota_sync,
+        .get_xstate     = gfs2_quota_get_xstate,
+        .get_xquota     = gfs2_xquota_get,
+        .set_xquota     = gfs2_xquota_set,
+};
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0fa5fa63d0e8..e271fa07ad02 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,13 +25,15 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
 extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
                              u32 uid, u32 gid);
-extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_sync(struct super_block *sb, int type);
 extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
 extern int gfs2_quota_init(struct gfs2_sbd *sdp);
 extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
 extern int gfs2_quotad(void *data);
+extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
 static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -50,5 +52,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 }
 extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern const struct quotactl_ops gfs2_quotactl_ops;
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 09fa31965576..4b9bece3d437 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -410,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
        memset(lh, 0, sizeof(struct gfs2_log_header));
        lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
        lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+        lh->lh_header.__pad0 = cpu_to_be64(0);
        lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+        lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
        lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
        lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
        lh->lh_blkno = cpu_to_be32(lblock);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8f1cfb02a6cb..0608f490c295 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1710,11 +1710,16 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 {
        struct gfs2_rgrpd *rgd;
        struct gfs2_holder ri_gh, rgd_gh;
+        struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+        int ri_locked = 0;
        int error;
-        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
-        if (error)
+                error = gfs2_rindex_hold(sdp, &ri_gh);
-                goto fail;
+                if (error)
+                        goto fail;
+                ri_locked = 1;
+        }
        error = -EINVAL;
        rgd = gfs2_blk2rgrpd(sdp, no_addr);
@@ -1730,7 +1735,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
        gfs2_glock_dq_uninit(&rgd_gh);
 fail_rindex:
-        gfs2_glock_dq_uninit(&ri_gh);
+        if (ri_locked)
+                gfs2_glock_dq_uninit(&ri_gh);
 fail:
        return error;
 }
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0ec3ec672de1..c282ad41f3d1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -70,6 +70,11 @@ enum {
        Opt_commit,
        Opt_err_withdraw,
        Opt_err_panic,
+        Opt_statfs_quantum,
+        Opt_statfs_percent,
+        Opt_quota_quantum,
+        Opt_barrier,
+        Opt_nobarrier,
        Opt_error,
 };
@@ -101,18 +106,23 @@ static const match_table_t tokens = {
        {Opt_commit, "commit=%d"},
        {Opt_err_withdraw, "errors=withdraw"},
        {Opt_err_panic, "errors=panic"},
+        {Opt_statfs_quantum, "statfs_quantum=%d"},
+        {Opt_statfs_percent, "statfs_percent=%d"},
+        {Opt_quota_quantum, "quota_quantum=%d"},
+        {Opt_barrier, "barrier"},
+        {Opt_nobarrier, "nobarrier"},
        {Opt_error, NULL}
 };
 /**
 * gfs2_mount_args - Parse mount options
- * @sdp:
+ * @args: The structure into which the parsed options will be written
- * @data:
+ * @options: The options to parse
 *
 * Return: errno
 */
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+int gfs2_mount_args(struct gfs2_args *args, char *options)
 {
        char *o;
        int token;
@@ -157,7 +167,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
                        break;
                case Opt_debug:
                        if (args->ar_errors == GFS2_ERRORS_PANIC) {
-                                fs_info(sdp, "-o debug and -o errors=panic "
+                                printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
                                       "are mutually exclusive.\n");
                                return -EINVAL;
                        }
@@ -210,7 +220,29 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
                case Opt_commit:
                        rv = match_int(&tmp[0], &args->ar_commit);
                        if (rv || args->ar_commit <= 0) {
-                                fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+                                printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
+                                return rv ? rv : -EINVAL;
+                        }
+                        break;
+                case Opt_statfs_quantum:
+                        rv = match_int(&tmp[0], &args->ar_statfs_quantum);
+                        if (rv || args->ar_statfs_quantum < 0) {
+                                printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
+                                return rv ? rv : -EINVAL;
+                        }
+                        break;
+                case Opt_quota_quantum:
+                        rv = match_int(&tmp[0], &args->ar_quota_quantum);
+                        if (rv || args->ar_quota_quantum <= 0) {
+                                printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
+                                return rv ? rv : -EINVAL;
+                        }
+                        break;
+                case Opt_statfs_percent:
+                        rv = match_int(&tmp[0], &args->ar_statfs_percent);
+                        if (rv || args->ar_statfs_percent < 0 ||
+                            args->ar_statfs_percent > 100) {
+                                printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
                                return rv ? rv : -EINVAL;
                        }
                        break;
@@ -219,15 +251,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
                        break;
                case Opt_err_panic:
                        if (args->ar_debug) {
-                                fs_info(sdp, "-o debug and -o errors=panic "
+                                printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
                                        "are mutually exclusive.\n");
                                return -EINVAL;
                        }
                        args->ar_errors = GFS2_ERRORS_PANIC;
                        break;
+                case Opt_barrier:
+                        args->ar_nobarrier = 0;
+                        break;
+                case Opt_nobarrier:
+                        args->ar_nobarrier = 1;
+                        break;
                case Opt_error:
                default:
-                        fs_info(sdp, "invalid mount option: %s\n", o);
+                        printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
                        return -EINVAL;
                }
        }
@@ -442,7 +480,10 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 {
        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
        struct buffer_head *l_bh;
+        s64 x, y;
+        int need_sync = 0;
        int error;
        error = gfs2_meta_inode_buffer(l_ip, &l_bh);
@@ -456,9 +497,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
        l_sc->sc_free += free;
        l_sc->sc_dinodes += dinodes;
        gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+        if (sdp->sd_args.ar_statfs_percent) {
+                x = 100 * l_sc->sc_free;
+                y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
+                if (x >= y || x <= -y)
+                        need_sync = 1;
+        }
        spin_unlock(&sdp->sd_statfs_spin);
        brelse(l_bh);
+        if (need_sync)
+                gfs2_wake_up_statfs(sdp);
 }
 void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
@@ -484,8 +533,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
        gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
 }
-int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+int gfs2_statfs_sync(struct super_block *sb, int type)
 {
+        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -521,6 +571,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
                goto out_bh2;
        update_statfs(sdp, m_bh, l_bh);
+        sdp->sd_statfs_force_sync = 0;
        gfs2_trans_end(sdp);
@@ -712,8 +763,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
        int error;
        flush_workqueue(gfs2_delete_workqueue);
-        gfs2_quota_sync(sdp);
+        gfs2_quota_sync(sdp->sd_vfs, 0);
-        gfs2_statfs_sync(sdp);
+        gfs2_statfs_sync(sdp->sd_vfs, 0);
        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
                                   &t_gh);
@@ -1061,8 +1112,13 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
        spin_lock(&gt->gt_spin);
        args.ar_commit = gt->gt_log_flush_secs;
+        args.ar_quota_quantum = gt->gt_quota_quantum;
+        if (gt->gt_statfs_slow)
+                args.ar_statfs_quantum = 0;
+        else
+                args.ar_statfs_quantum = gt->gt_statfs_quantum;
        spin_unlock(&gt->gt_spin);
-        error = gfs2_mount_args(sdp, &args, data);
+        error = gfs2_mount_args(&args, data);
        if (error)
                return error;
@@ -1097,8 +1153,21 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
                sb->s_flags |= MS_POSIXACL;
        else
                sb->s_flags &= ~MS_POSIXACL;
+        if (sdp->sd_args.ar_nobarrier)
+                set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+        else
+                clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
        spin_lock(&gt->gt_spin);
        gt->gt_log_flush_secs = args.ar_commit;
+        gt->gt_quota_quantum = args.ar_quota_quantum;
+        if (args.ar_statfs_quantum) {
+                gt->gt_statfs_slow = 0;
+                gt->gt_statfs_quantum = args.ar_statfs_quantum;
+        }
+        else {
+                gt->gt_statfs_slow = 1;
+                gt->gt_statfs_quantum = 30;
+        }
        spin_unlock(&gt->gt_spin);
        gfs2_online_uevent(sdp);
@@ -1179,7 +1248,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
        struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
        struct gfs2_args *args = &sdp->sd_args;
-        int lfsecs;
+        int val;
        if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
                seq_printf(s, ",meta");
@@ -1240,9 +1309,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        }
        if (args->ar_discard)
                seq_printf(s, ",discard");
-        lfsecs = sdp->sd_tune.gt_log_flush_secs;
+        val = sdp->sd_tune.gt_log_flush_secs;
-        if (lfsecs != 60)
+        if (val != 60)
-                seq_printf(s, ",commit=%d", lfsecs);
+                seq_printf(s, ",commit=%d", val);
+        val = sdp->sd_tune.gt_statfs_quantum;
+        if (val != 30)
+                seq_printf(s, ",statfs_quantum=%d", val);
+        val = sdp->sd_tune.gt_quota_quantum;
+        if (val != 60)
+                seq_printf(s, ",quota_quantum=%d", val);
+        if (args->ar_statfs_percent)
+                seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
        if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
                const char *state;
@@ -1259,6 +1336,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
                }
                seq_printf(s, ",errors=%s", state);
        }
+        if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+                seq_printf(s, ",nobarrier");
        return 0;
 }
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 235db3682885..3df60f2d84e3 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -27,7 +27,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
-extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
+extern int gfs2_mount_args(struct gfs2_args *args, char *data);
 extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
 extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
@@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
                                  const void *buf);
 extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
                          struct buffer_head *l_bh);
-extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_sync(struct super_block *sb, int type);
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 446329728d52..c5dad1eb7b91 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -158,7 +158,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
        if (simple_strtol(buf, NULL, 0) != 1)
                return -EINVAL;
-        gfs2_statfs_sync(sdp);
+        gfs2_statfs_sync(sdp->sd_vfs, 0);
        return len;
 }
@@ -171,13 +171,14 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
        if (simple_strtol(buf, NULL, 0) != 1)
                return -EINVAL;
-        gfs2_quota_sync(sdp);
+        gfs2_quota_sync(sdp->sd_vfs, 0);
        return len;
 }
 static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
                                        size_t len)
 {
+        int error;
        u32 id;
        if (!capable(CAP_SYS_ADMIN))
@@ -185,13 +186,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
        id = simple_strtoul(buf, NULL, 0);
-        gfs2_quota_refresh(sdp, 1, id);
+        error = gfs2_quota_refresh(sdp, 1, id);
-        return len;
+        return error ? error : len;
 }
 static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
                                         size_t len)
 {
+        int error;
        u32 id;
        if (!capable(CAP_SYS_ADMIN))
@@ -199,8 +201,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
        id = simple_strtoul(buf, NULL, 0);
-        gfs2_quota_refresh(sdp, 0, id);
+        error = gfs2_quota_refresh(sdp, 0, id);
-        return len;
+        return error ? error : len;
 }
 static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a0f8ef6ee27..912f5cbc4740 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -186,8 +186,8 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
        return 0;
 }
-int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-                 struct gfs2_ea_location *el)
+                        struct gfs2_ea_location *el)
 {
        struct ea_find ef;
        int error;
@@ -516,8 +516,8 @@ out:
        return error;
 }
-int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-                     char *data, size_t size)
+                            char *data, size_t size)
 {
        int ret;
        size_t len = GFS2_EA_DATA_LEN(el->el_ea);
@@ -534,6 +534,36 @@ int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
        return len;
 }
+/**
+ * gfs2_xattr_acl_get - Read a system xattr into a freshly allocated buffer
+ * @ip: The inode
+ * @name: The name of the GFS2_EATYPE_SYS attribute to look up
+ * @ppdata: On success with data, set to a kmalloc'ed copy of the value;
+ *          the caller must kfree() it. Left untouched in every other case.
+ *
+ * Returns: the value length (> 0) when *ppdata has been set, 0 when the
+ *          attribute does not exist or has no data, or a negative errno
+ */
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
+{
+        struct gfs2_ea_location el;
+        int error;
+        int len;
+        char *data;
+        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
+        if (error)
+                return error;
+        /* Not found, or found but empty: report 0 and hand nothing out */
+        if (!el.el_ea)
+                goto out;
+        if (!GFS2_EA_DATA_LEN(el.el_ea))
+                goto out;
+        len = GFS2_EA_DATA_LEN(el.el_ea);
+        data = kmalloc(len, GFP_NOFS);
+        error = -ENOMEM;
+        if (data == NULL)
+                goto out;
+        error = gfs2_ea_get_copy(ip, &el, data, len);
+        if (error == 0)
+                error = len;
+        /*
+         * Only publish the buffer when the copy succeeded.  Handing it out
+         * together with a negative return leaks it, because callers that
+         * bail out on error never look at *ppdata.
+         */
+        if (error < 0)
+                kfree(data);
+        else
+                *ppdata = data;
+out:
+        brelse(el.el_bh);
+        return error;
+}
 /**
 * gfs2_xattr_get - Get a GFS2 extended attribute
 * @inode: The inode
@@ -1259,22 +1289,26 @@ fail:
        return error;
 }
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
-                      struct iattr *attr, char *data)
 {
+        struct gfs2_ea_location el;
        struct buffer_head *dibh;
        int error;
-        if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
+        if (error)
+                return error;
+        if (GFS2_EA_IS_STUFFED(el.el_ea)) {
                error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
                if (error)
                        return error;
-                gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+                gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
-                memcpy(GFS2_EA2DATA(el->el_ea), data,
+                memcpy(GFS2_EA2DATA(el.el_ea), data,
-                       GFS2_EA_DATA_LEN(el->el_ea));
+                       GFS2_EA_DATA_LEN(el.el_ea));
        } else
-                error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+                error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
        if (error)
                return error;
@@ -1507,18 +1541,6 @@ static int gfs2_xattr_user_set(struct inode *inode, const char *name,
        return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
 }
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
-                                 void *buffer, size_t size)
-{
-        return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
-}
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
-                                 const void *value, size_t size, int flags)
-{
-        return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
-}
 static int gfs2_xattr_security_get(struct inode *inode, const char *name,
                                   void *buffer, size_t size)
 {
@@ -1543,12 +1565,6 @@ static struct xattr_handler gfs2_xattr_security_handler = {
        .set    = gfs2_xattr_security_set,
 };
-static struct xattr_handler gfs2_xattr_system_handler = {
-        .prefix = XATTR_SYSTEM_PREFIX,
-        .get    = gfs2_xattr_system_get,
-        .set    = gfs2_xattr_system_set,
-};
 struct xattr_handler *gfs2_xattr_handlers[] = {
        &gfs2_xattr_user_handler,
        &gfs2_xattr_security_handler,
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index cbdfd7743733..8d6ae5813c4d 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,11 +62,7 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
 /* Exported to acl.c */
-extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
-                        struct gfs2_ea_location *el);
+extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
-extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-                            char *data, size_t size);
-extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-                             struct iattr *attr, char *data);
 #endif /* __EATTR_DOT_H__ */
diff --git a/fs/inode.c b/fs/inode.c
index 4d8e3be55976..06c1f02de611 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,7 +18,6 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
@@ -157,11 +156,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        if (security_inode_alloc(inode))
                goto out;
-        /* allocate and initialize an i_integrity */
-        if (ima_inode_alloc(inode))
-                goto out_free_security;
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -201,9 +195,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 #endif
        return 0;
-out_free_security:
-        security_inode_free(inode);
 out:
        return -ENOMEM;
 }
@@ -235,7 +226,6 @@ static struct inode *alloc_inode(struct super_block *sb)
 void __destroy_inode(struct inode *inode)
 {
        BUG_ON(inode_has_buffers(inode));
-        ima_inode_free(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
 #ifdef CONFIG_FS_POSIX_ACL
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d2779e..8896c1d4febe 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
+                if (flags < 0) {
+                        jbd2_journal_abort(journal, flags);
+                        continue;
+                }
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fed85388ee86..b7ca3a92a4db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -358,6 +359,10 @@ repeat:
                jbd_unlock_bh_state(bh_in);
                tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+                if (!tmp) {
+                        jbd2_journal_put_journal_head(new_jh);
+                        return -ENOMEM;
+                }
                jbd_lock_bh_state(bh_in);
                if (jh_in->b_frozen_data) {
                        jbd2_free(tmp, bh_in->b_size);
@@ -1248,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal)
        if (jbd2_journal_recover(journal))
                goto recovery_error;
+        if (journal->j_failed_commit) {
+                printk(KERN_ERR "JBD2: journal transaction %u on %s "
+                       "is corrupt.\n", journal->j_failed_commit,
+                       journal->j_devname);
+                return -EIO;
+        }
        /* OK, we've finished with the dynamic journal bits:
         * reinitialise the dynamic contents of the superblock in memory
         * and reset them on disk. */
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f25e70c1b51c..f0294410868d 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -177,7 +177,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                spin_unlock(&jffs2_compressor_list_lock);
                break;
        default:
-                printk(KERN_ERR "JFFS2: unknow compression mode.\n");
+                printk(KERN_ERR "JFFS2: unknown compression mode.\n");
        }
 out:
        if (ret == JFFS2_COMPR_NONE) {
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1a80301004b8..378991cfe40f 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -931,7 +931,7 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
 * Helper function for jffs2_get_inode_nodes().
 * The function detects whether more data should be read and reads it if yes.
 *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
 *          negative error code on failure.
 */
 static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 082e844ab2db..4b107881acd5 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
 *   is used to release xattr name/value pair and detach from c->xattrindex.
 * reclaim_xattr_datum(c)
 *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
- *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 
 *   is hard coded as 32KiB.
 * do_verify_xattr_datum(c, xd)
 *   is used to load the xdatum informations without name/value pair from the medium.
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2bc7d8aa5740..d9b031cf69f5 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -755,7 +755,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
         * allocation group.
         */
        if ((blkno & (bmp->db_agsize - 1)) == 0)
-                /* check if the AG is currenly being written to.
+                /* check if the AG is currently being written to.
                 * if so, call dbNextAG() to find a non-busy
                 * AG with sufficient free space.
                 */
@@ -3337,7 +3337,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
        for (i = 0, n = 0; i < agno; n++) {
                bmp->db_agfree[n] = 0;  /* init collection point */
-                /* coalesce cotiguous k AGs; */
+                /* coalesce contiguous k AGs; */
                for (j = 0; j < k && i < agno; j++, i++) {
                        /* merge AGi to AGn */
                        bmp->db_agfree[n] += bmp->db_agfree[i];
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1a54ae14a192..e50cfa3d9654 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -371,82 +371,74 @@ EXPORT_SYMBOL_GPL(lockd_down);
 static ctl_table nlm_sysctls[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_grace_period",
                .data           = &nlm_grace_period,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
                .extra1         = (unsigned long *) &nlm_grace_period_min,
                .extra2         = (unsigned long *) &nlm_grace_period_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_timeout",
                .data           = &nlm_timeout,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
                .extra1         = (unsigned long *) &nlm_timeout_min,
                .extra2         = (unsigned long *) &nlm_timeout_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_udpport",
                .data           = &nlm_udpport,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = (int *) &nlm_port_min,
                .extra2         = (int *) &nlm_port_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nlm_tcpport",
                .data           = &nlm_tcpport,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = (int *) &nlm_port_min,
                .extra2         = (int *) &nlm_port_max,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nsm_use_hostnames",
                .data           = &nsm_use_hostnames,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nsm_local_state",
                .data           = &nsm_local_state,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nlm_sysctl_dir[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nfs",
                .mode           = 0555,
                .child          = nlm_sysctls,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nlm_sysctl_root[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = nlm_sysctl_dir,
        },
-        { .ctl_name = 0 }
+        { }
 };
 #endif  /* CONFIG_SYSCTL */
diff --git a/fs/namei.c b/fs/namei.c
index d11f404667e9..d3c190c35fcc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1279,28 +1279,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
        return __lookup_hash(&this, base, NULL);
 }
-/**
- * lookup_one_noperm - bad hack for sysfs
- * @name:       pathname component to lookup
- * @base:       base directory to lookup from
- *
- * This is a variant of lookup_one_len that doesn't perform any permission
- * checks.   It's a horrible hack to work around the braindead sysfs
- * architecture and should not be used anywhere else.
- *
- * DON'T USE THIS FUNCTION EVER, thanks.
- */
-struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
-{
-        int err;
-        struct qstr this;
-        err = __lookup_one_len(name, &this, base, strlen(name));
-        if (err)
-                return ERR_PTR(err);
-        return __lookup_hash(&this, base, NULL);
-}
 int user_path_at(int dfd, const char __user *name, unsigned flags,
                 struct path *path)
 {
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4fd222..7d70d63ceb29 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1921,6 +1921,16 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;
+        /* ... and get the mountpoint */
+        retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
+        if (retval)
+                return retval;
+        retval = security_sb_mount(dev_name, &path,
+                                   type_page, flags, data_page);
+        if (retval)
+                goto dput_out;
        /* Default to relatime unless overriden */
        if (!(flags & MS_NOATIME))
                mnt_flags |= MNT_RELATIME;
@@ -1945,16 +1955,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
                   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
                   MS_STRICTATIME);
-        /* ... and get the mountpoint */
-        retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
-        if (retval)
-                return retval;
-        retval = security_sb_mount(dev_name, &path,
-                                   type_page, flags, data_page);
-        if (retval)
-                goto dput_out;
        if (flags & MS_REMOUNT)
                retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
                                    data_page);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0d58caf4a6e1..ec8f45f12e05 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -835,7 +835,7 @@ static int ncp_ioctl_need_write(unsigned int cmd)
        case NCP_IOC_SETROOT:
                return 0;
        default:
-                /* unkown IOCTL command, assume write */
+                /* unknown IOCTL command, assume write */
                return 1;
        }
 }
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index b62481dabae9..70e1fbbaaeab 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -22,63 +22,55 @@ static struct ctl_table_header *nfs_callback_sysctl_table;
 static ctl_table nfs_cb_sysctls[] = {
 #ifdef CONFIG_NFS_V4
        {
-                .ctl_name = CTL_UNNUMBERED,
                .procname = "nfs_callback_tcpport",
                .data = &nfs_callback_set_tcpport,
                .maxlen = sizeof(int),
                .mode = 0644,
-                .proc_handler = &proc_dointvec_minmax,
+                .proc_handler = proc_dointvec_minmax,
                .extra1 = (int *)&nfs_set_port_min,
                .extra2 = (int *)&nfs_set_port_max,
        },
        {
-                .ctl_name = CTL_UNNUMBERED,
                .procname = "idmap_cache_timeout",
                .data = &nfs_idmap_cache_timeout,
                .maxlen = sizeof(int),
                .mode = 0644,
-                .proc_handler = &proc_dointvec_jiffies,
+                .proc_handler = proc_dointvec_jiffies,
-                .strategy = &sysctl_jiffies,
        },
 #endif
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nfs_mountpoint_timeout",
                .data           = &nfs_mountpoint_expiry_timeout,
                .maxlen         = sizeof(nfs_mountpoint_expiry_timeout),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_jiffies,
+                .proc_handler   = proc_dointvec_jiffies,
-                .strategy       = &sysctl_jiffies,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nfs_congestion_kb",
                .data           = &nfs_congestion_kb,
                .maxlen         = sizeof(nfs_congestion_kb),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nfs_cb_sysctl_dir[] = {
        {
-                .ctl_name = CTL_UNNUMBERED,
                .procname = "nfs",
                .mode = 0555,
                .child = nfs_cb_sysctls,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table nfs_cb_sysctl_root[] = {
        {
-                .ctl_name = CTL_FS,
                .procname = "fs",
                .mode = 0555,
                .child = nfs_cb_sysctl_dir,
        },
-        { .ctl_name = 0 }
+        { }
 };
 int nfs_register_sysctl(void)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..c84b5cc1a943 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
 {
        if (wbc->for_reclaim)
                return FLUSH_HIGHPRI | FLUSH_STABLE;
-        if (wbc->for_kupdate)
+        if (wbc->for_kupdate || wbc->for_background)
                return FLUSH_LOWPRI;
        return 0;
 }
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d69e6ae59251..3f959f1879d8 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -142,29 +142,75 @@ static void nilfs_palloc_desc_block_init(struct inode *inode,
        }
 }
+/*
+ * nilfs_palloc_get_block - get a block, consulting a one-entry cache first
+ *
+ * If @prev already holds a buffer head for @blkoff, take a reference on it
+ * and return it through @bhp.  Otherwise fetch the block with
+ * nilfs_mdt_get_block() and, on success, make it the new cached entry
+ * (dropping the reference on the old one).  @lock guards @prev.
+ *
+ * Returns 0 on success or the error from nilfs_mdt_get_block().
+ */
+static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
+                                  int create,
+                                  void (*init_block)(struct inode *,
+                                                     struct buffer_head *,
+                                                     void *),
+                                  struct buffer_head **bhp,
+                                  struct nilfs_bh_assoc *prev,
+                                  spinlock_t *lock)
+{
+        struct buffer_head *cached;
+        int err;
+        /* Fast path: the cached entry is the block being asked for */
+        spin_lock(lock);
+        cached = prev->bh;
+        if (cached && prev->blkoff == blkoff) {
+                get_bh(cached);
+                *bhp = cached;
+                spin_unlock(lock);
+                return 0;
+        }
+        spin_unlock(lock);
+        /* Slow path: the lock is not held across the block lookup */
+        err = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp);
+        if (err)
+                return err;
+        /*
+         * The cache may have changed while the lock was dropped, so
+         * release whatever is cached now before installing the new entry.
+         */
+        spin_lock(lock);
+        brelse(prev->bh);
+        get_bh(*bhp);
+        prev->bh = *bhp;
+        prev->blkoff = blkoff;
+        spin_unlock(lock);
+        return 0;
+}
 static int nilfs_palloc_get_desc_block(struct inode *inode,
                                       unsigned long group,
                                       int create, struct buffer_head **bhp)
 {
-        return nilfs_mdt_get_block(inode,
+        struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
-                                   nilfs_palloc_desc_blkoff(inode, group),
-                                   create, nilfs_palloc_desc_block_init, bhp);
+        return nilfs_palloc_get_block(inode,
+                                      nilfs_palloc_desc_blkoff(inode, group),
+                                      create, nilfs_palloc_desc_block_init,
+                                      bhp, &cache->prev_desc, &cache->lock);
 }
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
                                         unsigned long group,
                                         int create, struct buffer_head **bhp)
 {
-        return nilfs_mdt_get_block(inode,
+        struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
-                                   nilfs_palloc_bitmap_blkoff(inode, group),
-                                   create, NULL, bhp);
+        return nilfs_palloc_get_block(inode,
+                                      nilfs_palloc_bitmap_blkoff(inode, group),
+                                      create, NULL, bhp,
+                                      &cache->prev_bitmap, &cache->lock);
 }
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
                                 int create, struct buffer_head **bhp)
 {
-        return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
+        struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
-                                   create, NULL, bhp);
+        return nilfs_palloc_get_block(inode,
+                                      nilfs_palloc_entry_blkoff(inode, nr),
+                                      create, NULL, bhp,
+                                      &cache->prev_entry, &cache->lock);
 }
 static struct nilfs_palloc_group_desc *
@@ -176,13 +222,6 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
                group % nilfs_palloc_groups_per_desc_block(inode);
 }
-static unsigned char *
-nilfs_palloc_block_get_bitmap(const struct inode *inode,
-                              const struct buffer_head *bh, void *kaddr)
-{
-        return (unsigned char *)(kaddr + bh_offset(bh));
-}
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
                                   const struct buffer_head *bh, void *kaddr)
 {
@@ -289,8 +328,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
                                if (ret < 0)
                                        goto out_desc;
                                bitmap_kaddr = kmap(bitmap_bh->b_page);
-                                bitmap = nilfs_palloc_block_get_bitmap(
+                                bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
-                                        inode, bitmap_bh, bitmap_kaddr);
                                pos = nilfs_palloc_find_available_slot(
                                        inode, group, group_offset, bitmap,
                                        entries_per_group);
@@ -351,8 +389,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
        desc = nilfs_palloc_block_get_group_desc(inode, group,
                                                 req->pr_desc_bh, desc_kaddr);
        bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-        bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
+        bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
-                                               bitmap_kaddr);
        if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
                                    group_offset, bitmap))
@@ -385,8 +422,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
        desc = nilfs_palloc_block_get_group_desc(inode, group,
                                                 req->pr_desc_bh, desc_kaddr);
        bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-        bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
+        bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
-                                               bitmap_kaddr);
        if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
                                    group_offset, bitmap))
                printk(KERN_WARNING "%s: entry numer %llu already freed\n",
@@ -472,8 +508,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
                desc = nilfs_palloc_block_get_group_desc(
                        inode, group, desc_bh, desc_kaddr);
                bitmap_kaddr = kmap(bitmap_bh->b_page);
-                bitmap = nilfs_palloc_block_get_bitmap(
+                bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
-                        inode, bitmap_bh, bitmap_kaddr);
                for (j = i, n = 0;
                     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
                                                              entry_nrs[j]);
@@ -502,3 +537,30 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
        }
        return 0;
 }
+void nilfs_palloc_setup_cache(struct inode *inode,
+                              struct nilfs_palloc_cache *cache)
+{
+        NILFS_MDT(inode)->mi_palloc_cache = cache;
+        spin_lock_init(&cache->lock);
+}
+void nilfs_palloc_clear_cache(struct inode *inode)
+{
+        struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+        spin_lock(&cache->lock);        /* held across all releases and resets below */
+        brelse(cache->prev_desc.bh);    /* drop the cached buffer references */
+        brelse(cache->prev_bitmap.bh);
+        brelse(cache->prev_entry.bh);
+        cache->prev_desc.bh = NULL;     /* forget the buffers just released */
+        cache->prev_bitmap.bh = NULL;
+        cache->prev_entry.bh = NULL;
+        spin_unlock(&cache->lock);
+}
+void nilfs_palloc_destroy_cache(struct inode *inode)
+{
+        nilfs_palloc_clear_cache(inode);
+        NILFS_MDT(inode)->mi_palloc_cache = NULL;
+}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4ace5475c2c7..f4543ac4f560 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -69,4 +69,25 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
 #define nilfs_clear_bit_atomic          ext2_clear_bit_atomic
 #define nilfs_find_next_zero_bit        ext2_find_next_zero_bit
+/*
+ * persistent object allocator cache
+ */
+struct nilfs_bh_assoc {
+        unsigned long blkoff;   /* presumably the block offset @bh belongs to - confirm */
+        struct buffer_head *bh; /* cached buffer head; NULL once the cache is cleared */
+};
+struct nilfs_palloc_cache {
+        spinlock_t lock;        /* taken while the bh pointers below are released/reset */
+        struct nilfs_bh_assoc prev_desc;        /* handed to descriptor block lookups */
+        struct nilfs_bh_assoc prev_bitmap;      /* handed to bitmap block lookups */
+        struct nilfs_bh_assoc prev_entry;       /* handed to entry block lookups */
+};
+void nilfs_palloc_setup_cache(struct inode *inode,
+                              struct nilfs_palloc_cache *cache);
+void nilfs_palloc_clear_cache(struct inode *inode);
+void nilfs_palloc_destroy_cache(struct inode *inode);
 #endif  /* _NILFS_ALLOC_H */
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 08834df6ec68..f4a14ea2ed9c 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -402,19 +402,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
 {
        inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-        if (NILFS_MDT(bmap->b_inode))
-                nilfs_mdt_mark_dirty(bmap->b_inode);
-        else
-                mark_inode_dirty(bmap->b_inode);
 }
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
 {
        inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-        if (NILFS_MDT(bmap->b_inode))
-                nilfs_mdt_mark_dirty(bmap->b_inode);
-        else
-                mark_inode_dirty(bmap->b_inode);
 }
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 84c25382f8e3..471e269536ae 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -68,9 +68,34 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
        truncate_inode_pages(btnc, 0);
 }
+struct buffer_head *
+nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
+{
+        struct inode *inode = NILFS_BTNC_I(btnc);
+        struct buffer_head *bh;
+        bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+        if (unlikely(!bh))
+                return NULL;    /* no buffer obtained; nothing to release */
+        if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+                     buffer_dirty(bh))) {
+                brelse(bh);
+                BUG();          /* a new block must not be mapped, uptodate or dirty */
+        }
+        memset(bh->b_data, 0, 1 << inode->i_blkbits);  /* zero one block of data */
+        bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+        bh->b_blocknr = blocknr;
+        set_buffer_mapped(bh);
+        set_buffer_uptodate(bh);
+        unlock_page(bh->b_page);        /* NOTE(review): pairs with nilfs_grab_buffer()? confirm */
+        page_cache_release(bh->b_page);
+        return bh;      /* zero-filled, mapped and uptodate */
+}
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-                              sector_t pblocknr, struct buffer_head **pbh,
+                              sector_t pblocknr, struct buffer_head **pbh)
-                              int newblk)
 {
        struct buffer_head *bh;
        struct inode *inode = NILFS_BTNC_I(btnc);
@@ -81,19 +106,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
                return -ENOMEM;
        err = -EEXIST; /* internal code */
-        if (newblk) {
-                if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
-                             buffer_dirty(bh))) {
-                        brelse(bh);
-                        BUG();
-                }
-                memset(bh->b_data, 0, 1 << inode->i_blkbits);
-                bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
-                bh->b_blocknr = blocknr;
-                set_buffer_mapped(bh);
-                set_buffer_uptodate(bh);
-                goto found;
-        }
        if (buffer_uptodate(bh) || buffer_dirty(bh))
                goto found;
@@ -135,27 +147,6 @@ out_locked:
        return err;
 }
-int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
-                     sector_t pblocknr, struct buffer_head **pbh, int newblk)
-{
-        struct buffer_head *bh;
-        int err;
-        err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
-        if (err == -EEXIST) /* internal code (cache hit) */
-                return 0;
-        if (unlikely(err))
-                return err;
-        bh = *pbh;
-        wait_on_buffer(bh);
-        if (!buffer_uptodate(bh)) {
-                brelse(bh);
-                return -EIO;
-        }
-        return 0;
-}
 /**
 * nilfs_btnode_delete - delete B-tree node buffer
 * @bh: buffer to be deleted
@@ -244,12 +235,13 @@ retry:
                unlock_page(obh->b_page);
        }
-        err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
+        nbh = nilfs_btnode_create_block(btnc, newkey);
-        if (likely(!err)) {
+        if (!nbh)
-                BUG_ON(nbh == obh);
+                return -ENOMEM;
-                ctxt->newbh = nbh;
-        }
+        BUG_ON(nbh == obh);
-        return err;
+        ctxt->newbh = nbh;
+        return 0;
 failed_unlock:
        unlock_page(obh->b_page);
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 3e2275172ed6..07da83f07712 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -40,10 +40,10 @@ struct nilfs_btnode_chkey_ctxt {
 void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
+struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
+                                              __u64 blocknr);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
-                              struct buffer_head **, int);
+                              struct buffer_head **);
-int nilfs_btnode_get(struct address_space *, __u64, sector_t,
-                     struct buffer_head **, int);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
                                    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index e25b507a474f..7cdd98b8d514 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -114,7 +114,18 @@ static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
 {
        struct address_space *btnc =
                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-        return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+        int err;
+        err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
+        if (err)
+                return err == -EEXIST ? 0 : err;
+        wait_on_buffer(*bhp);
+        if (!buffer_uptodate(*bhp)) {
+                brelse(*bhp);
+                return -EIO;
+        }
+        return 0;
 }
 static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
@@ -122,12 +133,15 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 {
        struct address_space *btnc =
                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-        int ret;
+        struct buffer_head *bh;
-        ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
+        bh = nilfs_btnode_create_block(btnc, ptr);
-        if (!ret)
+        if (!bh)
-                set_buffer_nilfs_volatile(*bhp);
+                return -ENOMEM;
-        return ret;
+        set_buffer_nilfs_volatile(bh);
+        *bhp = bh;
+        return 0;
 }
 static inline int
@@ -444,6 +458,18 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
                nilfs_btree_get_nonroot_node(path, level);
 }
+static inline int
+nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+{
+        if (unlikely(nilfs_btree_node_get_level(node) != level)) {
+                dump_stack();
+                printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
+                       nilfs_btree_node_get_level(node), level);
+                return 1;
+        }
+        return 0;
+}
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
                                 struct nilfs_btree_path *path,
                                 __u64 key, __u64 *ptrp, int minlevel)
@@ -467,7 +493,8 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
                if (ret < 0)
                        return ret;
                node = nilfs_btree_get_nonroot_node(path, level);
-                BUG_ON(level != nilfs_btree_node_get_level(node));
+                if (nilfs_btree_bad_node(node, level))
+                        return -EINVAL;
                if (!found)
                        found = nilfs_btree_node_lookup(node, key, &index);
                else
@@ -512,7 +539,8 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
                if (ret < 0)
                        return ret;
                node = nilfs_btree_get_nonroot_node(path, level);
-                BUG_ON(level != nilfs_btree_node_get_level(node));
+                if (nilfs_btree_bad_node(node, level))
+                        return -EINVAL;
                index = nilfs_btree_node_get_nchildren(node) - 1;
                ptr = nilfs_btree_node_get_ptr(btree, node, index);
                path[level].bp_index = index;
@@ -638,13 +666,11 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 {
        if (level < nilfs_btree_height(btree) - 1) {
                do {
-                        lock_buffer(path[level].bp_bh);
                        nilfs_btree_node_set_key(
                                nilfs_btree_get_nonroot_node(path, level),
                                path[level].bp_index, key);
                        if (!buffer_dirty(path[level].bp_bh))
                                nilfs_btnode_mark_dirty(path[level].bp_bh);
-                        unlock_buffer(path[level].bp_bh);
                } while ((path[level].bp_index == 0) &&
                         (++level < nilfs_btree_height(btree) - 1));
        }
@@ -663,13 +689,11 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
        struct nilfs_btree_node *node;
        if (level < nilfs_btree_height(btree) - 1) {
-                lock_buffer(path[level].bp_bh);
                node = nilfs_btree_get_nonroot_node(path, level);
                nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
                                        path[level].bp_index);
                if (!buffer_dirty(path[level].bp_bh))
                        nilfs_btnode_mark_dirty(path[level].bp_bh);
-                unlock_buffer(path[level].bp_bh);
                if (path[level].bp_index == 0)
                        nilfs_btree_promote_key(btree, path, level + 1,
@@ -689,9 +713,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
        struct nilfs_btree_node *node, *left;
        int nchildren, lnchildren, n, move;
-        lock_buffer(path[level].bp_bh);
-        lock_buffer(path[level].bp_sib_bh);
        node = nilfs_btree_get_nonroot_node(path, level);
        left = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
@@ -712,9 +733,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
-        unlock_buffer(path[level].bp_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        nilfs_btree_promote_key(btree, path, level + 1,
                                nilfs_btree_node_get_key(node, 0));
@@ -740,9 +758,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
        struct nilfs_btree_node *node, *right;
        int nchildren, rnchildren, n, move;
-        lock_buffer(path[level].bp_bh);
-        lock_buffer(path[level].bp_sib_bh);
        node = nilfs_btree_get_nonroot_node(path, level);
        right = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
@@ -763,9 +778,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
-        unlock_buffer(path[level].bp_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        path[level + 1].bp_index++;
        nilfs_btree_promote_key(btree, path, level + 1,
                                nilfs_btree_node_get_key(right, 0));
@@ -794,9 +806,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
        __u64 newptr;
        int nchildren, n, move;
-        lock_buffer(path[level].bp_bh);
-        lock_buffer(path[level].bp_sib_bh);
        node = nilfs_btree_get_nonroot_node(path, level);
        right = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
@@ -815,9 +824,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
-        unlock_buffer(path[level].bp_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        newkey = nilfs_btree_node_get_key(right, 0);
        newptr = path[level].bp_newreq.bpr_ptr;
@@ -852,8 +858,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
        struct nilfs_btree_node *root, *child;
        int n;
-        lock_buffer(path[level].bp_sib_bh);
        root = nilfs_btree_get_root(btree);
        child = nilfs_btree_get_sib_node(path, level);
@@ -865,8 +869,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        path[level].bp_bh = path[level].bp_sib_bh;
        path[level].bp_sib_bh = NULL;
@@ -1023,11 +1025,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                stats->bs_nblocks++;
-                lock_buffer(bh);
                nilfs_btree_node_init(btree,
                                      (struct nilfs_btree_node *)bh->b_data,
                                      0, level, 0, NULL, NULL);
-                unlock_buffer(bh);
                path[level].bp_sib_bh = bh;
                path[level].bp_op = nilfs_btree_split;
        }
@@ -1052,10 +1052,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        if (ret < 0)
                goto err_out_curr_node;
-        lock_buffer(bh);
        nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
                              0, level, 0, NULL, NULL);
-        unlock_buffer(bh);
        path[level].bp_sib_bh = bh;
        path[level].bp_op = nilfs_btree_grow;
@@ -1154,13 +1152,11 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
        struct nilfs_btree_node *node;
        if (level < nilfs_btree_height(btree) - 1) {
-                lock_buffer(path[level].bp_bh);
                node = nilfs_btree_get_nonroot_node(path, level);
                nilfs_btree_node_delete(btree, node, keyp, ptrp,
                                        path[level].bp_index);
                if (!buffer_dirty(path[level].bp_bh))
                        nilfs_btnode_mark_dirty(path[level].bp_bh);
-                unlock_buffer(path[level].bp_bh);
                if (path[level].bp_index == 0)
                        nilfs_btree_promote_key(btree, path, level + 1,
                                nilfs_btree_node_get_key(node, 0));
@@ -1180,9 +1176,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
-        lock_buffer(path[level].bp_bh);
-        lock_buffer(path[level].bp_sib_bh);
        node = nilfs_btree_get_nonroot_node(path, level);
        left = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1197,9 +1190,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
-        unlock_buffer(path[level].bp_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        nilfs_btree_promote_key(btree, path, level + 1,
                                nilfs_btree_node_get_key(node, 0));
@@ -1217,9 +1207,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
-        lock_buffer(path[level].bp_bh);
-        lock_buffer(path[level].bp_sib_bh);
        node = nilfs_btree_get_nonroot_node(path, level);
        right = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1234,9 +1221,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
-        unlock_buffer(path[level].bp_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        path[level + 1].bp_index++;
        nilfs_btree_promote_key(btree, path, level + 1,
                                nilfs_btree_node_get_key(right, 0));
@@ -1255,9 +1239,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
-        lock_buffer(path[level].bp_bh);
-        lock_buffer(path[level].bp_sib_bh);
        node = nilfs_btree_get_nonroot_node(path, level);
        left = nilfs_btree_get_sib_node(path, level);
@@ -1268,9 +1249,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
-        unlock_buffer(path[level].bp_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        nilfs_btnode_delete(path[level].bp_bh);
        path[level].bp_bh = path[level].bp_sib_bh;
        path[level].bp_sib_bh = NULL;
@@ -1286,9 +1264,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
-        lock_buffer(path[level].bp_bh);
-        lock_buffer(path[level].bp_sib_bh);
        node = nilfs_btree_get_nonroot_node(path, level);
        right = nilfs_btree_get_sib_node(path, level);
@@ -1299,9 +1274,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
        if (!buffer_dirty(path[level].bp_bh))
                nilfs_btnode_mark_dirty(path[level].bp_bh);
-        unlock_buffer(path[level].bp_bh);
-        unlock_buffer(path[level].bp_sib_bh);
        nilfs_btnode_delete(path[level].bp_sib_bh);
        path[level].bp_sib_bh = NULL;
        path[level + 1].bp_index++;
@@ -1316,7 +1288,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
-        lock_buffer(path[level].bp_bh);
        root = nilfs_btree_get_root(btree);
        child = nilfs_btree_get_nonroot_node(path, level);
@@ -1324,7 +1295,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
        nilfs_btree_node_set_level(root, level);
        n = nilfs_btree_node_get_nchildren(child);
        nilfs_btree_node_move_left(btree, root, child, n);
-        unlock_buffer(path[level].bp_bh);
        nilfs_btnode_delete(path[level].bp_bh);
        path[level].bp_bh = NULL;
@@ -1699,7 +1669,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
                /* create child node at level 1 */
-                lock_buffer(bh);
                node = (struct nilfs_btree_node *)bh->b_data;
                nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
                nilfs_btree_node_insert(btree, node,
@@ -1709,7 +1678,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                if (!nilfs_bmap_dirty(bmap))
                        nilfs_bmap_set_dirty(bmap);
-                unlock_buffer(bh);
                brelse(bh);
                /* create root node at level 2 */
@@ -2050,7 +2018,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
        for (level = NILFS_BTREE_LEVEL_NODE_MIN;
             level < NILFS_BTREE_LEVEL_MAX;
             level++)
-                list_splice(&lists[level], listp->prev);
+                list_splice_tail(&lists[level], listp);
 }
 static int nilfs_btree_assign_p(struct nilfs_btree *btree,
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 0e72bbbc6b64..4b82d84ade75 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;
 /**
- * struct nilfs_btree_node - B-tree node
- * @bn_flags: flags
- * @bn_level: level
- * @bn_nchildren: number of children
- * @bn_pad: padding
- */
-struct nilfs_btree_node {
-        __u8 bn_flags;
-        __u8 bn_level;
-        __le16 bn_nchildren;
-        __le32 bn_pad;
-};
-/* flags */
-#define NILFS_BTREE_NODE_ROOT   0x01
-/* level */
-#define NILFS_BTREE_LEVEL_DATA          0
-#define NILFS_BTREE_LEVEL_NODE_MIN      (NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX           14
-/**
 * struct nilfs_btree - B-tree structure
 * @bt_bmap: bmap base structure
 */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 3f5d5d06f53c..d5ad54e204a5 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -926,3 +926,29 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
        up_read(&NILFS_MDT(cpfile)->mi_sem);
        return ret;
 }
+/**
+ * nilfs_cpfile_read - read cpfile inode
+ * @cpfile: cpfile inode
+ * @raw_inode: on-disk cpfile inode
+ */
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
+{
+        return nilfs_read_inode_common(cpfile, raw_inode);
+}
+/**
+ * nilfs_cpfile_new - create cpfile
+ * @nilfs: nilfs object
+ * @cpsize: size of a checkpoint entry
+ */
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+{
+        struct inode *cpfile;
+        cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
+        if (cpfile)
+                nilfs_mdt_set_entry_size(cpfile, cpsize,
+                                         sizeof(struct nilfs_cpfile_header));
+        return cpfile;
+}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index debea896e701..bc0809e0ab43 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,4 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
                                size_t);
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
 #endif  /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 1ff8e15bd36b..187dd07ba86c 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -33,6 +33,16 @@
 #define NILFS_CNO_MIN   ((__u64)1)
 #define NILFS_CNO_MAX   (~(__u64)0)
+struct nilfs_dat_info {
+        struct nilfs_mdt_info mi;
+        struct nilfs_palloc_cache palloc_cache;
+};
+static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
+{
+        return (struct nilfs_dat_info *)NILFS_MDT(dat);
+}
 static int nilfs_dat_prepare_entry(struct inode *dat,
                                   struct nilfs_palloc_req *req, int create)
 {
@@ -425,3 +435,40 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
        return nvi;
 }
+/**
+ * nilfs_dat_read - read dat inode
+ * @dat: dat inode
+ * @raw_inode: on-disk dat inode
+ */
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
+{
+        return nilfs_read_inode_common(dat, raw_inode);
+}
+/**
+ * nilfs_dat_new - create dat file
+ * @nilfs: nilfs object
+ * @entry_size: size of a dat entry
+ */
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+{
+        static struct lock_class_key dat_lock_key;
+        struct inode *dat;
+        struct nilfs_dat_info *di;
+        int err;
+        dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
+        if (dat) {
+                err = nilfs_palloc_init_blockgroup(dat, entry_size);
+                if (unlikely(err)) {
+                        nilfs_mdt_destroy(dat);
+                        return NULL;
+                }
+                di = NILFS_DAT_I(dat);
+                lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+                nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+        }
+        return dat;
+}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 406070d3ff49..d31c3aab0efe 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,4 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
 #endif  /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index e097099bfc8f..76d803e060a9 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -99,9 +99,9 @@ static int nilfs_prepare_chunk(struct page *page,
                                 NULL, nilfs_get_block);
 }
-static int nilfs_commit_chunk(struct page *page,
+static void nilfs_commit_chunk(struct page *page,
-                              struct address_space *mapping,
+                               struct address_space *mapping,
-                              unsigned from, unsigned to)
+                               unsigned from, unsigned to)
 {
        struct inode *dir = mapping->host;
        struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
@@ -112,15 +112,13 @@ static int nilfs_commit_chunk(struct page *page,
        nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
        copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
-        if (pos + copied > dir->i_size) {
+        if (pos + copied > dir->i_size)
                i_size_write(dir, pos + copied);
-                mark_inode_dirty(dir);
-        }
        if (IS_DIRSYNC(dir))
                nilfs_set_transaction_flag(NILFS_TI_SYNC);
        err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+        WARN_ON(err); /* should not happen */
        unlock_page(page);
-        return err;
 }
 static void nilfs_check_page(struct page *page)
@@ -455,11 +453,10 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
        BUG_ON(err);
        de->inode = cpu_to_le64(inode->i_ino);
        nilfs_set_de_type(de, inode);
-        err = nilfs_commit_chunk(page, mapping, from, to);
+        nilfs_commit_chunk(page, mapping, from, to);
        nilfs_put_page(page);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*      NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-        mark_inode_dirty(dir);
 }
 /*
@@ -548,10 +545,10 @@ got_it:
        memcpy(de->name, name, namelen);
        de->inode = cpu_to_le64(inode->i_ino);
        nilfs_set_de_type(de, inode);
-        err = nilfs_commit_chunk(page, page->mapping, from, to);
+        nilfs_commit_chunk(page, page->mapping, from, to);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*      NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-        mark_inode_dirty(dir);
+        nilfs_mark_inode_dirty(dir);
        /* OFFSET_CACHE */
 out_put:
        nilfs_put_page(page);
@@ -595,10 +592,9 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
        if (pde)
                pde->rec_len = cpu_to_le16(to - from);
        dir->inode = 0;
-        err = nilfs_commit_chunk(page, mapping, from, to);
+        nilfs_commit_chunk(page, mapping, from, to);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 /*      NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
-        mark_inode_dirty(inode);
 out:
        nilfs_put_page(page);
        return err;
@@ -640,7 +636,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
        memcpy(de->name, "..\0", 4);
        nilfs_set_de_type(de, inode);
        kunmap_atomic(kaddr, KM_USER0);
-        err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
+        nilfs_commit_chunk(page, mapping, 0, chunk_size);
 fail:
        page_cache_release(page);
        return err;
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index 93383c5cee90..dd5f7e0a95f6 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -61,6 +61,8 @@ void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
        nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
+        nilfs_palloc_clear_cache(dat);
+        nilfs_palloc_clear_cache(gcdat);
        nilfs_clear_dirty_pages(mapping);
        nilfs_copy_back_pages(mapping, gmapping);
        /* note: mdt dirty flags should be cleared by segctor. */
@@ -79,6 +81,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
        gcdat->i_state = I_CLEAR;
        gii->i_flags = 0;
+        nilfs_palloc_clear_cache(gcdat);
        truncate_inode_pages(gcdat->i_mapping, 0);
        truncate_inode_pages(&gii->i_btnode_cache, 0);
 }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e6de0a27ab5d..e16a6664dfa2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -149,7 +149,7 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
                                   __u64 vbn, struct buffer_head **out_bh)
 {
        int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-                                            vbn ? : pbn, pbn, out_bh, 0);
+                                            vbn ? : pbn, pbn, out_bh);
        if (ret == -EEXIST) /* internal code (cache hit) */
                ret = 0;
        return ret;
@@ -212,9 +212,10 @@ void nilfs_destroy_gccache(struct the_nilfs *nilfs)
 static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
                                   __u64 cno)
 {
-        struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
+        struct inode *inode;
        struct nilfs_inode_info *ii;
+        inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
        if (!inode)
                return NULL;
@@ -265,7 +266,6 @@ struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
 */
 void nilfs_clear_gcinode(struct inode *inode)
 {
-        nilfs_mdt_clear(inode);
        nilfs_mdt_destroy(inode);
 }
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index de86401f209f..922d9dd42c8f 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -29,6 +29,17 @@
 #include "alloc.h"
 #include "ifile.h"
+struct nilfs_ifile_info {
+        struct nilfs_mdt_info mi;
+        struct nilfs_palloc_cache palloc_cache;
+};
+static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile)
+{
+        return (struct nilfs_ifile_info *)NILFS_MDT(ifile);
+}
 /**
 * nilfs_ifile_create_inode - create a new disk inode
 * @ifile: ifile inode
@@ -148,3 +159,27 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
        }
        return err;
 }
+/**
+ * nilfs_ifile_new - create inode file
+ * @sbi: nilfs_sb_info struct
+ * @inode_size: size of an inode
+ */
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+{
+        struct inode *ifile;
+        int err;
+        ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
+                              sizeof(struct nilfs_ifile_info));
+        if (ifile) {
+                err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+                if (unlikely(err)) {
+                        nilfs_mdt_destroy(ifile);
+                        return NULL;
+                }
+                nilfs_palloc_setup_cache(ifile,
+                                         &NILFS_IFILE_I(ifile)->palloc_cache);
+        }
+        return ifile;
+}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index ecc3ba76db47..cbca32e498f2 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,4 +49,6 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
 #endif  /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2a0a5a3ac134..7868cc122ac7 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -97,6 +97,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
                        nilfs_transaction_abort(inode->i_sb);
                        goto out;
                }
+                nilfs_mark_inode_dirty(inode);
                nilfs_transaction_commit(inode->i_sb); /* never fails */
                /* Error handling should be detailed */
                set_buffer_new(bh_result);
@@ -322,7 +323,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
                                    nilfs_init_acl(), proper cancellation of
                                    above jobs should be considered */
-        mark_inode_dirty(inode);
        return inode;
 failed_acl:
@@ -525,7 +525,6 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
        raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
-        /* The buffer is guarded with lock_buffer() by the caller */
        if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
                memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
        set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
@@ -599,6 +598,7 @@ void nilfs_truncate(struct inode *inode)
        if (IS_SYNC(inode))
                nilfs_set_transaction_flag(NILFS_TI_SYNC);
+        nilfs_mark_inode_dirty(inode);
        nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
        nilfs_transaction_commit(sb);
        /* May construct a logical segment and may fail in sync mode.
@@ -623,6 +623,7 @@ void nilfs_delete_inode(struct inode *inode)
                truncate_inode_pages(&inode->i_data, 0);
        nilfs_truncate_bmap(ii, 0);
+        nilfs_mark_inode_dirty(inode);
        nilfs_free_inode(inode);
        /* nilfs_free_inode() marks inode buffer dirty */
        if (IS_SYNC(inode))
@@ -745,9 +746,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
                              "failed to reget inode block.\n");
                return err;
        }
-        lock_buffer(ibh);
        nilfs_update_inode(inode, ibh);
-        unlock_buffer(ibh);
        nilfs_mdt_mark_buffer_dirty(ibh);
        nilfs_mdt_mark_dirty(sbi->s_ifile);
        brelse(ibh);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f6326112d647..06713ffcc7f2 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -186,7 +186,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 }
 static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
-                                struct buffer_head **out_bh)
+                                int readahead, struct buffer_head **out_bh)
 {
        struct buffer_head *first_bh, *bh;
        unsigned long blkoff;
@@ -200,16 +200,18 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
        if (unlikely(err))
                goto failed;
-        blkoff = block + 1;
+        if (readahead) {
-        for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+                blkoff = block + 1;
-                err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+                for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
-                if (likely(!err || err == -EEXIST))
+                        err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
-                        brelse(bh);
+                        if (likely(!err || err == -EEXIST))
-                else if (err != -EBUSY)
+                                brelse(bh);
-                        break; /* abort readahead if bmap lookup failed */
+                        else if (err != -EBUSY)
+                                break;
-                if (!buffer_locked(first_bh))
+                                /* abort readahead if bmap lookup failed */
-                        goto out_no_wait;
+                        if (!buffer_locked(first_bh))
+                                goto out_no_wait;
+                }
        }
        wait_on_buffer(first_bh);
@@ -263,7 +265,7 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
        /* Should be rewritten with merging nilfs_mdt_read_block() */
 retry:
-        ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
+        ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
        if (!create || ret != -ENOENT)
                return ret;
@@ -371,7 +373,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
        struct buffer_head *bh;
        int err;
-        err = nilfs_mdt_read_block(inode, block, &bh);
+        err = nilfs_mdt_read_block(inode, block, 0, &bh);
        if (unlikely(err))
                return err;
        nilfs_mark_buffer_dirty(bh);
@@ -445,9 +447,17 @@ static const struct file_operations def_mdt_fops;
 * longer than those of the super block structs; they may continue for
 * several consecutive mounts/umounts.  This would need discussions.
 */
+/**
+ * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
+ * @nilfs: nilfs object
+ * @sb: super block instance the metadata file belongs to
+ * @ino: inode number
+ * @gfp_mask: gfp mask for data pages
+ * @objsz: size of the private object attached to inode->i_private
+ */
 struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-                     ino_t ino, gfp_t gfp_mask)
+                     ino_t ino, gfp_t gfp_mask, size_t objsz)
 {
        struct inode *inode = nilfs_alloc_inode_common(nilfs);
@@ -455,8 +465,9 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
                return NULL;
        else {
                struct address_space * const mapping = &inode->i_data;
-                struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
+                struct nilfs_mdt_info *mi;
+                mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
                if (!mi) {
                        nilfs_destroy_inode(inode);
                        return NULL;
@@ -513,11 +524,11 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 }
 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-                            ino_t ino)
+                            ino_t ino, size_t objsz)
 {
-        struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
+        struct inode *inode;
-                                                   NILFS_MDT_GFP);
+        inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
        if (!inode)
                return NULL;
@@ -544,14 +555,15 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
                &NILFS_I(orig)->i_btnode_cache;
 }
-void nilfs_mdt_clear(struct inode *inode)
+static void nilfs_mdt_clear(struct inode *inode)
 {
        struct nilfs_inode_info *ii = NILFS_I(inode);
        invalidate_mapping_pages(inode->i_mapping, 0, -1);
        truncate_inode_pages(inode->i_mapping, 0);
-        nilfs_bmap_clear(ii->i_bmap);
+        if (test_bit(NILFS_I_BMAP, &ii->i_state))
+                nilfs_bmap_clear(ii->i_bmap);
        nilfs_btnode_cache_clear(&ii->i_btnode_cache);
 }
@@ -559,6 +571,10 @@ void nilfs_mdt_destroy(struct inode *inode)
 {
        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+        if (mdi->mi_palloc_cache)
+                nilfs_palloc_destroy_cache(inode);
+        nilfs_mdt_clear(inode);
        kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
        kfree(mdi);
        nilfs_destroy_inode(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 431599733c9b..6c4bbb0470fc 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -36,6 +36,7 @@
 * @mi_entry_size: size of an entry
 * @mi_first_entry_offset: offset to the first entry
 * @mi_entries_per_block: number of entries in a block
+ * @mi_palloc_cache: persistent object allocator cache
 * @mi_blocks_per_group: number of blocks in a group
 * @mi_blocks_per_desc_block: number of blocks per descriptor block
 */
@@ -46,6 +47,7 @@ struct nilfs_mdt_info {
        unsigned                mi_entry_size;
        unsigned                mi_first_entry_offset;
        unsigned long           mi_entries_per_block;
+        struct nilfs_palloc_cache *mi_palloc_cache;
        unsigned long           mi_blocks_per_group;
        unsigned long           mi_blocks_per_desc_block;
 };
@@ -74,11 +76,11 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
+                            size_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-                                   ino_t, gfp_t);
+                                   ino_t, gfp_t, size_t);
 void nilfs_mdt_destroy(struct inode *);
-void nilfs_mdt_clear(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 void nilfs_mdt_set_shadow(struct inode *, struct inode *);
@@ -104,21 +106,4 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
 #define nilfs_mdt_bgl_lock(inode, bg) \
        (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
-static inline int
-nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
-                            unsigned n)
-{
-        return nilfs_read_inode_common(
-                inode, (struct nilfs_inode *)(bh->b_data + n));
-}
-static inline void
-nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
-                             unsigned n)
-{
-        nilfs_write_inode_common(
-                inode, (struct nilfs_inode *)(bh->b_data + n), 1);
-}
 #endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ed02e886fa79..07ba838ef089 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,7 +120,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
                inode->i_op = &nilfs_file_inode_operations;
                inode->i_fop = &nilfs_file_operations;
                inode->i_mapping->a_ops = &nilfs_aops;
-                mark_inode_dirty(inode);
+                nilfs_mark_inode_dirty(inode);
                err = nilfs_add_nondir(dentry, inode);
        }
        if (!err)
@@ -148,7 +148,7 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                init_special_inode(inode, inode->i_mode, rdev);
-                mark_inode_dirty(inode);
+                nilfs_mark_inode_dirty(inode);
                err = nilfs_add_nondir(dentry, inode);
        }
        if (!err)
@@ -188,7 +188,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
                goto out_fail;
        /* mark_inode_dirty(inode); */
-        /* nilfs_new_inode() and page_symlink() do this */
+        /* page_symlink() does this */
        err = nilfs_add_nondir(dentry, inode);
 out:
@@ -200,7 +200,8 @@ out:
        return err;
 out_fail:
-        inode_dec_link_count(inode);
+        drop_nlink(inode);
+        nilfs_mark_inode_dirty(inode);
        iput(inode);
        goto out;
 }
@@ -245,7 +246,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (err)
                return err;
-        inode_inc_link_count(dir);
+        inc_nlink(dir);
        inode = nilfs_new_inode(dir, S_IFDIR | mode);
        err = PTR_ERR(inode);
@@ -256,7 +257,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        inode->i_fop = &nilfs_dir_operations;
        inode->i_mapping->a_ops = &nilfs_aops;
-        inode_inc_link_count(inode);
+        inc_nlink(inode);
        err = nilfs_make_empty(inode, dir);
        if (err)
@@ -266,6 +267,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (err)
                goto out_fail;
+        nilfs_mark_inode_dirty(inode);
        d_instantiate(dentry, inode);
 out:
        if (!err)
@@ -276,26 +278,23 @@ out:
        return err;
 out_fail:
-        inode_dec_link_count(inode);
+        drop_nlink(inode);
-        inode_dec_link_count(inode);
+        drop_nlink(inode);
+        nilfs_mark_inode_dirty(inode);
        iput(inode);
 out_dir:
-        inode_dec_link_count(dir);
+        drop_nlink(dir);
+        nilfs_mark_inode_dirty(dir);
        goto out;
 }
-static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode;
        struct nilfs_dir_entry *de;
        struct page *page;
-        struct nilfs_transaction_info ti;
        int err;
-        err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
-        if (err)
-                return err;
        err = -ENOENT;
        de = nilfs_find_entry(dir, dentry, &page);
        if (!de)
@@ -317,12 +316,28 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
                goto out;
        inode->i_ctime = dir->i_ctime;
-        inode_dec_link_count(inode);
+        drop_nlink(inode);
        err = 0;
 out:
-        if (!err)
+        return err;
+}
+static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+        struct nilfs_transaction_info ti;
+        int err;
+        err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+        if (err)
+                return err;
+        err = nilfs_do_unlink(dir, dentry);
+        if (!err) {
+                nilfs_mark_inode_dirty(dir);
+                nilfs_mark_inode_dirty(dentry->d_inode);
                err = nilfs_transaction_commit(dir->i_sb);
-        else
+        } else
                nilfs_transaction_abort(dir->i_sb);
        return err;
@@ -340,11 +355,13 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
        err = -ENOTEMPTY;
        if (nilfs_empty_dir(inode)) {
-                err = nilfs_unlink(dir, dentry);
+                err = nilfs_do_unlink(dir, dentry);
                if (!err) {
                        inode->i_size = 0;
-                        inode_dec_link_count(inode);
+                        drop_nlink(inode);
-                        inode_dec_link_count(dir);
+                        nilfs_mark_inode_dirty(inode);
+                        drop_nlink(dir);
+                        nilfs_mark_inode_dirty(dir);
                }
        }
        if (!err)
@@ -395,42 +412,48 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
                if (!new_de)
                        goto out_dir;
-                inode_inc_link_count(old_inode);
+                inc_nlink(old_inode);
                nilfs_set_link(new_dir, new_de, new_page, old_inode);
+                nilfs_mark_inode_dirty(new_dir);
                new_inode->i_ctime = CURRENT_TIME;
                if (dir_de)
                        drop_nlink(new_inode);
-                inode_dec_link_count(new_inode);
+                drop_nlink(new_inode);
+                nilfs_mark_inode_dirty(new_inode);
        } else {
                if (dir_de) {
                        err = -EMLINK;
                        if (new_dir->i_nlink >= NILFS_LINK_MAX)
                                goto out_dir;
                }
-                inode_inc_link_count(old_inode);
+                inc_nlink(old_inode);
                err = nilfs_add_link(new_dentry, old_inode);
                if (err) {
-                        inode_dec_link_count(old_inode);
+                        drop_nlink(old_inode);
+                        nilfs_mark_inode_dirty(old_inode);
                        goto out_dir;
                }
-                if (dir_de)
+                if (dir_de) {
-                        inode_inc_link_count(new_dir);
+                        inc_nlink(new_dir);
+                        nilfs_mark_inode_dirty(new_dir);
+                }
        }
        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
-         * inode_dec_link_count() will mark the inode dirty.
         */
        old_inode->i_ctime = CURRENT_TIME;
        nilfs_delete_entry(old_de, old_page);
-        inode_dec_link_count(old_inode);
+        drop_nlink(old_inode);
        if (dir_de) {
                nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
-                inode_dec_link_count(old_dir);
+                drop_nlink(old_dir);
        }
+        nilfs_mark_inode_dirty(old_dir);
+        nilfs_mark_inode_dirty(old_inode);
        err = nilfs_transaction_commit(old_dir->i_sb);
        return err;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 6dc83591d118..c9c96c7825dc 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -770,14 +770,8 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
                nilfs_finish_roll_forward(nilfs, sbi, ri);
        }
-        nilfs_detach_checkpoint(sbi);
-        return 0;
 failed:
        nilfs_detach_checkpoint(sbi);
-        nilfs_mdt_clear(nilfs->ns_cpfile);
-        nilfs_mdt_clear(nilfs->ns_sufile);
-        nilfs_mdt_clear(nilfs->ns_dat);
        return err;
 }
@@ -804,6 +798,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
        struct nilfs_segsum_info ssi;
        sector_t pseg_start, pseg_end, sr_pseg_start = 0;
        sector_t seg_start, seg_end; /* range of full segment (block number) */
+        sector_t b, end;
        u64 seg_seq;
        __u64 segnum, nextnum = 0;
        __u64 cno;
@@ -819,6 +814,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
        /* Calculate range of segment */
        nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+        /* Read ahead segment */
+        b = seg_start;
+        while (b <= seg_end)
+                sb_breadahead(sbi->s_super, b++);
        for (;;) {
                /* Load segment summary */
                ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
@@ -841,14 +841,20 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
                ri->ri_nextnum = nextnum;
                empty_seg = 0;
+                if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+                        /* This will never happen because a superblock
+                           (last_segment) always points to a pseg
+                           having a super root. */
+                        ret = NILFS_SEG_FAIL_CONSISTENCY;
+                        goto failed;
+                }
+                if (pseg_start == seg_start) {
+                        nilfs_get_segment_range(nilfs, nextnum, &b, &end);
+                        while (b <= end)
+                                sb_breadahead(sbi->s_super, b++);
+                }
                if (!NILFS_SEG_HAS_SR(&ssi)) {
-                        if (!scan_newer) {
-                                /* This will never happen because a superblock
-                                   (last_segment) always points to a pseg
-                                   having a super root. */
-                                ret = NILFS_SEG_FAIL_CONSISTENCY;
-                                goto failed;
-                        }
                        if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
                                ri->ri_lsegs_start = pseg_start;
                                ri->ri_lsegs_start_seq = seg_seq;
@@ -919,7 +925,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 super_root_found:
        /* Updating pointers relating to the latest checkpoint */
-        list_splice(&segments, ri->ri_used_segments.prev);
+        list_splice_tail(&segments, &ri->ri_used_segments);
        nilfs->ns_last_pseg = sr_pseg_start;
        nilfs->ns_last_seq = nilfs->ns_seg_seq;
        nilfs->ns_last_cno = ri->ri_cno;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index e6d9e37fa241..645c78656aa0 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -24,10 +24,22 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/crc32.h>
+#include <linux/backing-dev.h>
 #include "page.h"
 #include "segbuf.h"
+struct nilfs_write_info {
+        struct the_nilfs       *nilfs;
+        struct bio             *bio;
+        int                     start, end; /* The region to be submitted */
+        int                     rest_blocks;
+        int                     max_pages;
+        int                     nr_vecs;
+        sector_t                blocknr;
+};
 static struct kmem_cache *nilfs_segbuf_cachep;
 static void nilfs_segbuf_init_once(void *obj)
@@ -63,6 +75,11 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
        INIT_LIST_HEAD(&segbuf->sb_list);
        INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
        INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+        init_completion(&segbuf->sb_bio_event);
+        atomic_set(&segbuf->sb_err, 0);
+        segbuf->sb_nbio = 0;
        return segbuf;
 }
@@ -83,6 +100,22 @@ void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
                segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
 }
+/**
+ * nilfs_segbuf_map_cont - map a new log behind a given log
+ * @segbuf: new segment buffer
+ * @prev: segment buffer containing a log to be continued
+ */
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+                           struct nilfs_segment_buffer *prev)
+{
+        segbuf->sb_segnum = prev->sb_segnum;
+        segbuf->sb_fseg_start = prev->sb_fseg_start;
+        segbuf->sb_fseg_end = prev->sb_fseg_end;
+        segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks;
+        segbuf->sb_rest_blocks =
+                segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
+}
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
                                  __u64 nextnum, struct the_nilfs *nilfs)
 {
@@ -132,8 +165,6 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
        segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
        segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
        segbuf->sb_sum.ctime = ctime;
-        segbuf->sb_io_error = 0;
        return 0;
 }
@@ -219,7 +250,7 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
        raw_sum->ss_datasum = cpu_to_le32(crc);
 }
-void nilfs_release_buffers(struct list_head *list)
+static void nilfs_release_buffers(struct list_head *list)
 {
        struct buffer_head *bh, *n;
@@ -241,13 +272,56 @@ void nilfs_release_buffers(struct list_head *list)
        }
 }
+static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+{
+        nilfs_release_buffers(&segbuf->sb_segsum_buffers);
+        nilfs_release_buffers(&segbuf->sb_payload_buffers);
+}
+/*
+ * Iterators for segment buffers
+ */
+void nilfs_clear_logs(struct list_head *logs)
+{
+        struct nilfs_segment_buffer *segbuf;
+        list_for_each_entry(segbuf, logs, sb_list)
+                nilfs_segbuf_clear(segbuf);
+}
+void nilfs_truncate_logs(struct list_head *logs,
+                         struct nilfs_segment_buffer *last)
+{
+        struct nilfs_segment_buffer *n, *segbuf;
+        segbuf = list_prepare_entry(last, logs, sb_list);
+        list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) {
+                list_del_init(&segbuf->sb_list);
+                nilfs_segbuf_clear(segbuf);
+                nilfs_segbuf_free(segbuf);
+        }
+}
+int nilfs_wait_on_logs(struct list_head *logs)
+{
+        struct nilfs_segment_buffer *segbuf;
+        int err;
+        list_for_each_entry(segbuf, logs, sb_list) {
+                err = nilfs_segbuf_wait(segbuf);
+                if (err)
+                        return err;
+        }
+        return 0;
+}
 /*
 * BIO operations
 */
 static void nilfs_end_bio_write(struct bio *bio, int err)
 {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-        struct nilfs_write_info *wi = bio->bi_private;
+        struct nilfs_segment_buffer *segbuf = bio->bi_private;
        if (err == -EOPNOTSUPP) {
                set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
@@ -256,21 +330,22 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
        }
        if (!uptodate)
-                atomic_inc(&wi->err);
+                atomic_inc(&segbuf->sb_err);
        bio_put(bio);
-        complete(&wi->bio_event);
+        complete(&segbuf->sb_bio_event);
 }
-static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
+static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
+                                   struct nilfs_write_info *wi, int mode)
 {
        struct bio *bio = wi->bio;
        int err;
-        if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
+        if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
-                wait_for_completion(&wi->bio_event);
+                wait_for_completion(&segbuf->sb_bio_event);
-                wi->nbio--;
+                segbuf->sb_nbio--;
-                if (unlikely(atomic_read(&wi->err))) {
+                if (unlikely(atomic_read(&segbuf->sb_err))) {
                        bio_put(bio);
                        err = -EIO;
                        goto failed;
@@ -278,7 +353,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
        }
        bio->bi_end_io = nilfs_end_bio_write;
-        bio->bi_private = wi;
+        bio->bi_private = segbuf;
        bio_get(bio);
        submit_bio(mode, bio);
        if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
@@ -286,7 +361,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
                err = -EOPNOTSUPP;
                goto failed;
        }
-        wi->nbio++;
+        segbuf->sb_nbio++;
        bio_put(bio);
        wi->bio = NULL;
@@ -301,17 +376,15 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
 }
 /**
- * nilfs_alloc_seg_bio - allocate a bio for writing segment.
+ * nilfs_alloc_seg_bio - allocate a new bio for writing log
- * @sb: super block
+ * @nilfs: nilfs object
- * @start: beginning disk block number of this BIO.
+ * @start: start block number of the bio
 * @nr_vecs: request size of page vector.
 *
- * alloc_seg_bio() allocates a new BIO structure and initialize it.
- *
 * Return Value: On success, pointer to the struct bio is returned.
 * On error, NULL is returned.
 */
-static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
+static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
                                       int nr_vecs)
 {
        struct bio *bio;
@@ -322,36 +395,33 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
                        bio = bio_alloc(GFP_NOIO, nr_vecs);
        }
        if (likely(bio)) {
-                bio->bi_bdev = sb->s_bdev;
+                bio->bi_bdev = nilfs->ns_bdev;
-                bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
+                bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9);
        }
        return bio;
 }
-void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
+static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
-                                struct nilfs_write_info *wi)
+                                       struct nilfs_write_info *wi)
 {
        wi->bio = NULL;
        wi->rest_blocks = segbuf->sb_sum.nblocks;
-        wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
+        wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
        wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
        wi->start = wi->end = 0;
-        wi->nbio = 0;
        wi->blocknr = segbuf->sb_pseg_start;
-        atomic_set(&wi->err, 0);
-        init_completion(&wi->bio_event);
 }
-static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
+static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
-                           int mode)
+                                  struct nilfs_write_info *wi,
+                                  struct buffer_head *bh, int mode)
 {
        int len, err;
        BUG_ON(wi->nr_vecs <= 0);
 repeat:
        if (!wi->bio) {
-                wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
+                wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
                                              wi->nr_vecs);
                if (unlikely(!wi->bio))
                        return -ENOMEM;
@@ -363,76 +433,83 @@ static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
                return 0;
        }
        /* bio is FULL */
-        err = nilfs_submit_seg_bio(wi, mode);
+        err = nilfs_segbuf_submit_bio(segbuf, wi, mode);
        /* never submit current bh */
        if (likely(!err))
                goto repeat;
        return err;
 }
+/**
+ * nilfs_segbuf_write - submit write requests of a log
+ * @segbuf: buffer storing a log to be written
+ * @nilfs: nilfs object
+ *
+ * Return Value: On Success, 0 is returned. On Error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
 int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-                       struct nilfs_write_info *wi)
+                       struct the_nilfs *nilfs)
 {
+        struct nilfs_write_info wi;
        struct buffer_head *bh;
-        int res, rw = WRITE;
+        int res = 0, rw = WRITE;
+        wi.nilfs = nilfs;
+        nilfs_segbuf_prepare_write(segbuf, &wi);
        list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
-                res = nilfs_submit_bh(wi, bh, rw);
+                res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
                if (unlikely(res))
                        goto failed_bio;
        }
        list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-                res = nilfs_submit_bh(wi, bh, rw);
+                res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
                if (unlikely(res))
                        goto failed_bio;
        }
-        if (wi->bio) {
+        if (wi.bio) {
                /*
                 * Last BIO is always sent through the following
                 * submission.
                 */
                rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
-                res = nilfs_submit_seg_bio(wi, rw);
+                res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
-                if (unlikely(res))
-                        goto failed_bio;
        }
-        res = 0;
- out:
-        return res;
 failed_bio:
-        atomic_inc(&wi->err);
+        return res;
-        goto out;
 }
 /**
 * nilfs_segbuf_wait - wait for completion of requested BIOs
- * @wi: nilfs_write_info
+ * @segbuf: segment buffer
 *
 * Return Value: On Success, 0 is returned. On Error, one of the following
 * negative error code is returned.
 *
 * %-EIO - I/O error
 */
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
-                      struct nilfs_write_info *wi)
 {
        int err = 0;
-        if (!wi->nbio)
+        if (!segbuf->sb_nbio)
                return 0;
        do {
-                wait_for_completion(&wi->bio_event);
+                wait_for_completion(&segbuf->sb_bio_event);
-        } while (--wi->nbio > 0);
+        } while (--segbuf->sb_nbio > 0);
-        if (unlikely(atomic_read(&wi->err) > 0)) {
+        if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
                printk(KERN_ERR "NILFS: IO error writing segment\n");
                err = -EIO;
-                segbuf->sb_io_error = 1;
        }
        return err;
 }
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 0c3076f4e592..6af1630fb401 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -27,7 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
 #include <linux/completion.h>
-#include <linux/backing-dev.h>
 /**
 * struct nilfs_segsum_info - On-memory segment summary
@@ -77,7 +76,9 @@ struct nilfs_segsum_info {
 * @sb_rest_blocks: Number of residual blocks in the current segment
 * @sb_segsum_buffers: List of buffers for segment summaries
 * @sb_payload_buffers: List of buffers for segment payload
- * @sb_io_error: I/O error status
+ * @sb_nbio: Number of in-flight bio requests
+ * @sb_err: I/O error status
+ * @sb_bio_event: Completion event of log writing
 */
 struct nilfs_segment_buffer {
        struct super_block     *sb_super;
@@ -96,7 +97,9 @@ struct nilfs_segment_buffer {
        struct list_head        sb_payload_buffers; /* including super root */
        /* io status */
-        int                     sb_io_error;
+        int                     sb_nbio;
+        atomic_t                sb_err;
+        struct completion       sb_bio_event;
 };
 #define NILFS_LIST_SEGBUF(head)  \
@@ -125,6 +128,8 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
 void nilfs_segbuf_free(struct nilfs_segment_buffer *);
 void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
                      struct the_nilfs *);
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+                           struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
                                  struct the_nilfs *);
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
@@ -161,41 +166,18 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
        segbuf->sb_sum.nfileblk++;
 }
-void nilfs_release_buffers(struct list_head *);
+int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+                       struct the_nilfs *nilfs);
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
-static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+void nilfs_clear_logs(struct list_head *logs);
+void nilfs_truncate_logs(struct list_head *logs,
+                         struct nilfs_segment_buffer *last);
+int nilfs_wait_on_logs(struct list_head *logs);
+static inline void nilfs_destroy_logs(struct list_head *logs)
 {
-        nilfs_release_buffers(&segbuf->sb_segsum_buffers);
+        nilfs_truncate_logs(logs, NULL);
-        nilfs_release_buffers(&segbuf->sb_payload_buffers);
 }
-struct nilfs_write_info {
-        struct bio             *bio;
-        int                     start, end; /* The region to be submitted */
-        int                     rest_blocks;
-        int                     max_pages;
-        int                     nr_vecs;
-        sector_t                blocknr;
-        int                     nbio;
-        atomic_t                err;
-        struct completion       bio_event;
-                                /* completion event of segment write */
-        /*
-         * The following fields must be set explicitly
-         */
-        struct super_block     *sb;
-        struct backing_dev_info *bdi; /* backing dev info */
-        struct buffer_head     *bh_sr;
-};
-void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
-                                struct nilfs_write_info *);
-int nilfs_segbuf_write(struct nilfs_segment_buffer *,
-                       struct nilfs_write_info *);
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
-                      struct nilfs_write_info *);
 #endif /* _NILFS_SEGBUF_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6eff66a070d5..17584c524486 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -974,12 +974,12 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
                              nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
        raw_sr->sr_flags = 0;
-        nilfs_mdt_write_inode_direct(
+        nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
-                nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
+                                 NILFS_SR_DAT_OFFSET(isz), 1);
-        nilfs_mdt_write_inode_direct(
+        nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
-                nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
+                                 NILFS_SR_CPFILE_OFFSET(isz), 1);
-        nilfs_mdt_write_inode_direct(
+        nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
-                nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
+                                 NILFS_SR_SUFILE_OFFSET(isz), 1);
 }
 static void nilfs_redirty_inodes(struct list_head *head)
@@ -1273,73 +1273,75 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
        return err;
 }
-static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
+/**
-{
+ * nilfs_segctor_begin_construction - set up a segment buffer to make a new log
-        struct buffer_head *bh_su;
+ * @sci: nilfs_sc_info
-        struct nilfs_segment_usage *raw_su;
+ * @nilfs: nilfs object
-        int err;
+ */
-        err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
-        if (unlikely(err))
-                return err;
-        nilfs_mdt_mark_buffer_dirty(bh_su);
-        nilfs_mdt_mark_dirty(sufile);
-        nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
-        return 0;
-}
 static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
                                            struct the_nilfs *nilfs)
 {
-        struct nilfs_segment_buffer *segbuf, *n;
+        struct nilfs_segment_buffer *segbuf, *prev;
        __u64 nextnum;
-        int err;
+        int err, alloc = 0;
-        if (list_empty(&sci->sc_segbufs)) {
+        segbuf = nilfs_segbuf_new(sci->sc_super);
-                segbuf = nilfs_segbuf_new(sci->sc_super);
+        if (unlikely(!segbuf))
-                if (unlikely(!segbuf))
+                return -ENOMEM;
-                        return -ENOMEM;
-                list_add(&segbuf->sb_list, &sci->sc_segbufs);
+        if (list_empty(&sci->sc_write_logs)) {
-        } else
+                nilfs_segbuf_map(segbuf, nilfs->ns_segnum,
-                segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+                                 nilfs->ns_pseg_offset, nilfs);
+                if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+                        nilfs_shift_to_next_segment(nilfs);
+                        nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+                }
-        nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
+                segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
-                         nilfs);
+                nextnum = nilfs->ns_nextnum;
-        if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+                if (nilfs->ns_segnum == nilfs->ns_nextnum)
-                nilfs_shift_to_next_segment(nilfs);
+                        /* Start from the head of a new full segment */
-                nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+                        alloc++;
+        } else {
+                /* Continue logs */
+                prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
+                nilfs_segbuf_map_cont(segbuf, prev);
+                segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq;
+                nextnum = prev->sb_nextnum;
+                if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+                        nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
+                        segbuf->sb_sum.seg_seq++;
+                        alloc++;
+                }
        }
-        sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
-        err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
+        err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum);
-        if (unlikely(err))
+        if (err)
-                return err;
+                goto failed;
-        if (nilfs->ns_segnum == nilfs->ns_nextnum) {
+        if (alloc) {
-                /* Start from the head of a new full segment */
                err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
-                if (unlikely(err))
+                if (err)
-                        return err;
+                        goto failed;
-        } else
+        }
-                nextnum = nilfs->ns_nextnum;
-        segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
        nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
-        /* truncating segment buffers */
+        BUG_ON(!list_empty(&sci->sc_segbufs));
-        list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
+        list_add_tail(&segbuf->sb_list, &sci->sc_segbufs);
-                                          sb_list) {
+        sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
-                list_del_init(&segbuf->sb_list);
-                nilfs_segbuf_free(segbuf);
-        }
        return 0;
+ failed:
+        nilfs_segbuf_free(segbuf);
+        return err;
 }
 static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
                                         struct the_nilfs *nilfs, int nadd)
 {
-        struct nilfs_segment_buffer *segbuf, *prev, *n;
+        struct nilfs_segment_buffer *segbuf, *prev;
        struct inode *sufile = nilfs->ns_sufile;
        __u64 nextnextnum;
        LIST_HEAD(list);
@@ -1352,7 +1354,7 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
         * not be dirty.  The following call ensures that the buffer is dirty
         * and will pin the buffer on memory until the sufile is written.
         */
-        err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
+        err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
        if (unlikely(err))
                return err;
@@ -1378,33 +1380,33 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
                list_add_tail(&segbuf->sb_list, &list);
                prev = segbuf;
        }
-        list_splice(&list, sci->sc_segbufs.prev);
+        list_splice_tail(&list, &sci->sc_segbufs);
        return 0;
 failed_segbuf:
        nilfs_segbuf_free(segbuf);
 failed:
-        list_for_each_entry_safe(segbuf, n, &list, sb_list) {
+        list_for_each_entry(segbuf, &list, sb_list) {
                ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
                WARN_ON(ret); /* never fails */
-                list_del_init(&segbuf->sb_list);
-                nilfs_segbuf_free(segbuf);
        }
+        nilfs_destroy_logs(&list);
        return err;
 }
-static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
+static void nilfs_free_incomplete_logs(struct list_head *logs,
-                                                   struct the_nilfs *nilfs)
+                                       struct the_nilfs *nilfs)
 {
-        struct nilfs_segment_buffer *segbuf;
+        struct nilfs_segment_buffer *segbuf, *prev;
-        int ret, done = 0;
+        struct inode *sufile = nilfs->ns_sufile;
+        int ret;
-        segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+        segbuf = NILFS_FIRST_SEGBUF(logs);
        if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
-                ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+                ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
                WARN_ON(ret); /* never fails */
        }
-        if (segbuf->sb_io_error) {
+        if (atomic_read(&segbuf->sb_err)) {
                /* Case 1: The first segment failed */
                if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
                        /* Case 1a:  Partial segment appended into an existing
@@ -1413,106 +1415,54 @@ static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
                                                segbuf->sb_fseg_end);
                else /* Case 1b:  New full segment */
                        set_nilfs_discontinued(nilfs);
-                done++;
        }
-        list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
+        prev = segbuf;
-                ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+        list_for_each_entry_continue(segbuf, logs, sb_list) {
-                WARN_ON(ret); /* never fails */
+                if (prev->sb_nextnum != segbuf->sb_nextnum) {
-                if (!done && segbuf->sb_io_error) {
+                        ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
-                        if (segbuf->sb_segnum != nilfs->ns_nextnum)
+                        WARN_ON(ret); /* never fails */
-                                /* Case 2: extended segment (!= next) failed */
-                                nilfs_sufile_set_error(nilfs->ns_sufile,
-                                                       segbuf->sb_segnum);
-                        done++;
-                }
-        }
-}
-static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
-{
-        struct nilfs_segment_buffer *segbuf;
-        list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
-                nilfs_segbuf_clear(segbuf);
-        sci->sc_super_root = NULL;
-}
-static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
-{
-        struct nilfs_segment_buffer *segbuf;
-        while (!list_empty(&sci->sc_segbufs)) {
-                segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
-                list_del_init(&segbuf->sb_list);
-                nilfs_segbuf_free(segbuf);
-        }
-        /* sci->sc_curseg = NULL; */
-}
-static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
-                                           struct the_nilfs *nilfs, int err)
-{
-        if (unlikely(err)) {
-                nilfs_segctor_free_incomplete_segments(sci, nilfs);
-                if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
-                        int ret;
-                        ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
-                                                        sci->sc_freesegs,
-                                                        sci->sc_nfreesegs,
-                                                        NULL);
-                        WARN_ON(ret); /* do not happen */
                }
+                if (atomic_read(&segbuf->sb_err) &&
+                    segbuf->sb_segnum != nilfs->ns_nextnum)
+                        /* Case 2: extended segment (!= next) failed */
+                        nilfs_sufile_set_error(sufile, segbuf->sb_segnum);
+                prev = segbuf;
        }
-        nilfs_segctor_clear_segment_buffers(sci);
 }
 static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
                                          struct inode *sufile)
 {
        struct nilfs_segment_buffer *segbuf;
-        struct buffer_head *bh_su;
-        struct nilfs_segment_usage *raw_su;
        unsigned long live_blocks;
        int ret;
        list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-                ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
-                                                     &raw_su, &bh_su);
-                WARN_ON(ret); /* always succeed because bh_su is dirty */
                live_blocks = segbuf->sb_sum.nblocks +
                        (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
-                raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
+                ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
-                raw_su->su_nblocks = cpu_to_le32(live_blocks);
+                                                     live_blocks,
-                nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
+                                                     sci->sc_seg_ctime);
-                                               bh_su);
+                WARN_ON(ret); /* always succeed because the segusage is dirty */
        }
 }
-static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
+static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile)
-                                          struct inode *sufile)
 {
        struct nilfs_segment_buffer *segbuf;
-        struct buffer_head *bh_su;
-        struct nilfs_segment_usage *raw_su;
        int ret;
-        segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+        segbuf = NILFS_FIRST_SEGBUF(logs);
-        ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
+        ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
-                                             &raw_su, &bh_su);
+                                             segbuf->sb_pseg_start -
-        WARN_ON(ret); /* always succeed because bh_su is dirty */
+                                             segbuf->sb_fseg_start, 0);
-        raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
+        WARN_ON(ret); /* always succeed because the segusage is dirty */
-                                         segbuf->sb_fseg_start);
-        nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
-        list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
+        list_for_each_entry_continue(segbuf, logs, sb_list) {
-                ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
+                ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
-                                                     &raw_su, &bh_su);
+                                                     0, 0);
                WARN_ON(ret); /* always succeed */
-                raw_su->su_nblocks = 0;
-                nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
-                                               bh_su);
        }
 }
@@ -1520,17 +1470,15 @@ static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
                                            struct nilfs_segment_buffer *last,
                                            struct inode *sufile)
 {
-        struct nilfs_segment_buffer *segbuf = last, *n;
+        struct nilfs_segment_buffer *segbuf = last;
        int ret;
-        list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
+        list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
-                                          sb_list) {
-                list_del_init(&segbuf->sb_list);
                sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
                ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
                WARN_ON(ret);
-                nilfs_segbuf_free(segbuf);
        }
+        nilfs_truncate_logs(&sci->sc_segbufs, last);
 }
@@ -1569,7 +1517,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
                                                        NULL);
                        WARN_ON(err); /* do not happen */
                }
-                nilfs_segctor_clear_segment_buffers(sci);
+                nilfs_clear_logs(&sci->sc_segbufs);
                err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
                if (unlikely(err))
@@ -1814,26 +1762,18 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 }
 static int nilfs_segctor_write(struct nilfs_sc_info *sci,
-                               struct backing_dev_info *bdi)
+                               struct the_nilfs *nilfs)
 {
        struct nilfs_segment_buffer *segbuf;
-        struct nilfs_write_info wi;
+        int ret = 0;
-        int err, res;
-        wi.sb = sci->sc_super;
-        wi.bh_sr = sci->sc_super_root;
-        wi.bdi = bdi;
        list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-                nilfs_segbuf_prepare_write(segbuf, &wi);
+                ret = nilfs_segbuf_write(segbuf, nilfs);
-                err = nilfs_segbuf_write(segbuf, &wi);
+                if (ret)
+                        break;
-                res = nilfs_segbuf_wait(segbuf, &wi);
-                err = err ? : res;
-                if (err)
-                        return err;
        }
-        return 0;
+        list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
+        return ret;
 }
 static void __nilfs_end_page_io(struct page *page, int err)
@@ -1911,15 +1851,17 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
        }
 }
-static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
+static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
-                                      struct page *failed_page, int err)
+                             struct buffer_head *bh_sr, int err)
 {
        struct nilfs_segment_buffer *segbuf;
        struct page *bd_page = NULL, *fs_page = NULL;
+        struct buffer_head *bh;
-        list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+        if (list_empty(logs))
-                struct buffer_head *bh;
+                return;
+        list_for_each_entry(segbuf, logs, sb_list) {
                list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
                                    b_assoc_buffers) {
                        if (bh->b_page != bd_page) {
@@ -1931,7 +1873,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
                list_for_each_entry(bh, &segbuf->sb_payload_buffers,
                                    b_assoc_buffers) {
-                        if (bh == sci->sc_super_root) {
+                        if (bh == bh_sr) {
                                if (bh->b_page != bd_page) {
                                        end_page_writeback(bd_page);
                                        bd_page = bh->b_page;
@@ -1941,7 +1883,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
                        if (bh->b_page != fs_page) {
                                nilfs_end_page_io(fs_page, err);
                                if (fs_page && fs_page == failed_page)
-                                        goto done;
+                                        return;
                                fs_page = bh->b_page;
                        }
                }
@@ -1950,8 +1892,34 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
                end_page_writeback(bd_page);
        nilfs_end_page_io(fs_page, err);
- done:
+}
+static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
+                                             struct the_nilfs *nilfs, int err)
+{
+        LIST_HEAD(logs);
+        int ret;
+        list_splice_tail_init(&sci->sc_write_logs, &logs);
+        ret = nilfs_wait_on_logs(&logs);
+        if (ret)
+                nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+        list_splice_tail_init(&sci->sc_segbufs, &logs);
+        nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
+        nilfs_free_incomplete_logs(&logs, nilfs);
        nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
+        if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+                ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+                                                sci->sc_freesegs,
+                                                sci->sc_nfreesegs,
+                                                NULL);
+                WARN_ON(ret); /* do not happen */
+        }
+        nilfs_destroy_logs(&logs);
+        sci->sc_super_root = NULL;
 }
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1973,7 +1941,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
        struct the_nilfs *nilfs = sbi->s_nilfs;
        int update_sr = (sci->sc_super_root != NULL);
-        list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+        list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
                struct buffer_head *bh;
                list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
@@ -2046,7 +2014,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
        sci->sc_nblk_inc += sci->sc_nblk_this_inc;
-        segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+        segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
        nilfs_set_next_segment(nilfs, segbuf);
        if (update_sr) {
@@ -2057,10 +2025,23 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
                clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
                clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
                set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+                nilfs_segctor_clear_metadata_dirty(sci);
        } else
                clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
 }
+static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
+{
+        int ret;
+        ret = nilfs_wait_on_logs(&sci->sc_write_logs);
+        if (!ret) {
+                nilfs_segctor_complete_write(sci);
+                nilfs_destroy_logs(&sci->sc_write_logs);
+        }
+        return ret;
+}
 static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
                                        struct nilfs_sb_info *sbi)
 {
@@ -2173,7 +2154,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                /* Avoid empty segment */
                if (sci->sc_stage.scnt == NILFS_ST_DONE &&
                    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
-                        nilfs_segctor_end_construction(sci, nilfs, 1);
+                        nilfs_segctor_abort_construction(sci, nilfs, 1);
                        goto out;
                }
@@ -2187,7 +2168,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                if (has_sr) {
                        err = nilfs_segctor_fill_in_checkpoint(sci);
                        if (unlikely(err))
-                                goto failed_to_make_up;
+                                goto failed_to_write;
                        nilfs_segctor_fill_in_super_root(sci, nilfs);
                }
@@ -2195,42 +2176,46 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                /* Write partial segments */
                err = nilfs_segctor_prepare_write(sci, &failed_page);
-                if (unlikely(err))
+                if (err) {
+                        nilfs_abort_logs(&sci->sc_segbufs, failed_page,
+                                         sci->sc_super_root, err);
                        goto failed_to_write;
+                }
                nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
-                err = nilfs_segctor_write(sci, nilfs->ns_bdi);
+                err = nilfs_segctor_write(sci, nilfs);
                if (unlikely(err))
                        goto failed_to_write;
-                nilfs_segctor_complete_write(sci);
+                if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+                    nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
-                /* Commit segments */
+                        /*
-                if (has_sr)
+                         * At this point, we avoid double buffering
-                        nilfs_segctor_clear_metadata_dirty(sci);
+                         * for blocksize < pagesize because page dirty
+                         * flag is turned off during write and dirty
-                nilfs_segctor_end_construction(sci, nilfs, 0);
+                         * buffers are not properly collected for
+                         * pages crossing over segments.
+                         */
+                        err = nilfs_segctor_wait(sci);
+                        if (err)
+                                goto failed_to_write;
+                }
        } while (sci->sc_stage.scnt != NILFS_ST_DONE);
+        sci->sc_super_root = NULL;
 out:
-        nilfs_segctor_destroy_segment_buffers(sci);
        nilfs_segctor_check_out_files(sci, sbi);
        return err;
 failed_to_write:
-        nilfs_segctor_abort_write(sci, failed_page, err);
-        nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
- failed_to_make_up:
        if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
                nilfs_redirty_inodes(&sci->sc_dirty_files);
 failed:
        if (nilfs_doing_gc())
                nilfs_redirty_inodes(&sci->sc_gc_inodes);
-        nilfs_segctor_end_construction(sci, nilfs, err);
+        nilfs_segctor_abort_construction(sci, nilfs, err);
        goto out;
 }
@@ -2559,7 +2544,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
        sci->sc_freesegs = kbufs[4];
        sci->sc_nfreesegs = argv[4].v_nmembs;
-        list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
+        list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
        for (;;) {
                nilfs_segctor_accept(sci, &req);
@@ -2788,6 +2773,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
        spin_lock_init(&sci->sc_state_lock);
        INIT_LIST_HEAD(&sci->sc_dirty_files);
        INIT_LIST_HEAD(&sci->sc_segbufs);
+        INIT_LIST_HEAD(&sci->sc_write_logs);
        INIT_LIST_HEAD(&sci->sc_gc_inodes);
        INIT_LIST_HEAD(&sci->sc_copied_buffers);
@@ -2855,6 +2841,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
        }
        WARN_ON(!list_empty(&sci->sc_segbufs));
+        WARN_ON(!list_empty(&sci->sc_write_logs));
        down_write(&sbi->s_nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0d2a475a741b..3d3ab2f9864c 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -97,6 +97,7 @@ struct nilfs_segsum_pointer {
 * @sc_dsync_start: start byte offset of data pages
 * @sc_dsync_end: end byte offset of data pages (inclusive)
 * @sc_segbufs: List of segment buffers
+ * @sc_write_logs: List of segment buffers holding logs that are being written
 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
 * @sc_curseg: Current segment buffer
 * @sc_super_root: Pointer to the super root buffer
@@ -143,6 +144,7 @@ struct nilfs_sc_info {
        /* Segment buffers */
        struct list_head        sc_segbufs;
+        struct list_head        sc_write_logs;
        unsigned long           sc_segbuf_nblocks;
        struct nilfs_segment_buffer *sc_curseg;
        struct buffer_head     *sc_super_root;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 37994d4a59cc..b6c36d0cc331 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -31,6 +31,16 @@
 #include "sufile.h"
+struct nilfs_sufile_info {
+        struct nilfs_mdt_info mi;       /* NOTE(review): presumably has to stay
+                                         * the first member, because NILFS_SUI()
+                                         * casts the NILFS_MDT() pointer to this
+                                         * struct -- confirm */
+        unsigned long ncleansegs;       /* in-memory clean segment count; loaded
+                                         * from sh_ncleansegs by
+                                         * nilfs_sufile_read() */
+};
+static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
+{       /* Reinterpret the pointer from NILFS_MDT(sufile) as sufile info. */
+        return (struct nilfs_sufile_info *)NILFS_MDT(sufile);
+}
 static inline unsigned long
 nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
 {
@@ -62,14 +72,6 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
                     max - curr + 1);
 }
-static inline struct nilfs_sufile_header *
-nilfs_sufile_block_get_header(const struct inode *sufile,
-                              struct buffer_head *bh,
-                              void *kaddr)
-{
-        return kaddr + bh_offset(bh);
-}
 static struct nilfs_segment_usage *
 nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
                                     struct buffer_head *bh, void *kaddr)
@@ -110,6 +112,15 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
 }
 /**
+ * nilfs_sufile_get_ncleansegs - return the number of clean segments
+ * @sufile: inode of segment usage file
+ *
+ * Return Value: the clean segment count held in the in-memory sufile
+ * info (NILFS_SUI(sufile)->ncleansegs).  This function itself takes no
+ * lock and reads no buffer.
+ */
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile)
+{
+        return NILFS_SUI(sufile)->ncleansegs;
+}
+/**
 * nilfs_sufile_updatev - modify multiple segment usages at a time
 * @sufile: inode of segment usage file
 * @segnumv: array of segment numbers
@@ -270,7 +281,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
        if (ret < 0)
                goto out_sem;
        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-        header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+        header = kaddr + bh_offset(header_bh);
        ncleansegs = le64_to_cpu(header->sh_ncleansegs);
        last_alloc = le64_to_cpu(header->sh_last_alloc);
        kunmap_atomic(kaddr, KM_USER0);
@@ -302,13 +313,13 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
                        kunmap_atomic(kaddr, KM_USER0);
                        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-                        header = nilfs_sufile_block_get_header(
+                        header = kaddr + bh_offset(header_bh);
-                                sufile, header_bh, kaddr);
                        le64_add_cpu(&header->sh_ncleansegs, -1);
                        le64_add_cpu(&header->sh_ndirtysegs, 1);
                        header->sh_last_alloc = cpu_to_le64(segnum);
                        kunmap_atomic(kaddr, KM_USER0);
+                        NILFS_SUI(sufile)->ncleansegs--;
                        nilfs_mdt_mark_buffer_dirty(header_bh);
                        nilfs_mdt_mark_buffer_dirty(su_bh);
                        nilfs_mdt_mark_dirty(sufile);
@@ -351,6 +362,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
        kunmap_atomic(kaddr, KM_USER0);
        nilfs_sufile_mod_counter(header_bh, -1, 1);
+        NILFS_SUI(sufile)->ncleansegs--;
        nilfs_mdt_mark_buffer_dirty(su_bh);
        nilfs_mdt_mark_dirty(sufile);
 }
@@ -380,6 +393,8 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
        kunmap_atomic(kaddr, KM_USER0);
        nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+        NILFS_SUI(sufile)->ncleansegs -= clean;
        nilfs_mdt_mark_buffer_dirty(su_bh);
        nilfs_mdt_mark_dirty(sufile);
 }
@@ -409,79 +424,65 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
        nilfs_mdt_mark_buffer_dirty(su_bh);
        nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
+        NILFS_SUI(sufile)->ncleansegs++;
        nilfs_mdt_mark_dirty(sufile);
 }
 /**
- * nilfs_sufile_get_segment_usage - get a segment usage
+ * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty
 * @sufile: inode of segment usage file
 * @segnum: segment number
- * @sup: pointer to segment usage
- * @bhp: pointer to buffer head
- *
- * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
- * specified by @segnum.
- *
- * Return Value: On success, 0 is returned, and the segment usage and the
- * buffer head of the buffer on which the segment usage is located are stored
- * in the place pointed by @sup and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-EINVAL - Invalid segment usage number.
+ *
+ * Return Value: the result of nilfs_sufile_get_segment_usage_block().
+ * When that result is 0, the buffer and the sufile are both marked
+ * dirty and the buffer reference is released; otherwise nothing is
+ * marked.
 */
-int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
-                                   struct nilfs_segment_usage **sup,
-                                   struct buffer_head **bhp)
 {
        struct buffer_head *bh;
-        struct nilfs_segment_usage *su;
-        void *kaddr;
        int ret;
-        /* segnum is 0 origin */
+        ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
-        if (segnum >= nilfs_sufile_get_nsegments(sufile))
+        if (!ret) {
-                return -EINVAL;
+                nilfs_mdt_mark_buffer_dirty(bh);
-        down_write(&NILFS_MDT(sufile)->mi_sem);
+                nilfs_mdt_mark_dirty(sufile);
-        ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
-        if (ret < 0)
-                goto out_sem;
-        kaddr = kmap(bh->b_page);
-        su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
-        if (nilfs_segment_usage_error(su)) {
-                kunmap(bh->b_page);
                brelse(bh);
-                ret = -EINVAL;
-                goto out_sem;
        }
-        if (sup != NULL)
-                *sup = su;
-        *bhp = bh;
- out_sem:
-        up_write(&NILFS_MDT(sufile)->mi_sem);
        return ret;
 }
 /**
- * nilfs_sufile_put_segment_usage - put a segment usage
+ * nilfs_sufile_set_segment_usage - set usage of a segment
 * @sufile: inode of segment usage file
 * @segnum: segment number
- * @bh: buffer head
+ * @nblocks: number of live blocks in the segment
- *
+ * @modtime: modification time (optional; 0 leaves su_lastmod unchanged)
- * Description: nilfs_sufile_put_segment_usage() releases the segment usage
- * specified by @segnum. @bh must be the buffer head which have been returned
- * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
+ *
+ * The sufile's mi_sem is held for writing while the entry is updated.
+ *
+ * Return Value: the result of nilfs_sufile_get_segment_usage_block().
+ * A negative result means the segment usage entry was not modified.
 */
-void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
-                                    struct buffer_head *bh)
+                                   unsigned long nblocks, time_t modtime)
 {
-        kunmap(bh->b_page);
+        struct buffer_head *bh;
+        struct nilfs_segment_usage *su;
+        void *kaddr;
+        int ret;
+        down_write(&NILFS_MDT(sufile)->mi_sem);
+        ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+        if (ret < 0)
+                goto out_sem;
+        kaddr = kmap_atomic(bh->b_page, KM_USER0);
+        su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+        WARN_ON(nilfs_segment_usage_error(su)); /* warns only; update proceeds */
+        if (modtime)    /* a zero modtime keeps the stored su_lastmod */
+                su->su_lastmod = cpu_to_le64(modtime);
+        su->su_nblocks = cpu_to_le32(nblocks);
+        kunmap_atomic(kaddr, KM_USER0);
+        nilfs_mdt_mark_buffer_dirty(bh);
+        nilfs_mdt_mark_dirty(sufile);
        brelse(bh);
+ out_sem:
+        up_write(&NILFS_MDT(sufile)->mi_sem);
+        return ret;
 }
 /**
@@ -515,7 +516,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
                goto out_sem;
        kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-        header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+        header = kaddr + bh_offset(header_bh);
        sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
        sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
        sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
@@ -532,33 +533,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
        return ret;
 }
-/**
- * nilfs_sufile_get_ncleansegs - get the number of clean segments
- * @sufile: inode of segment usage file
- * @nsegsp: pointer to the number of clean segments
- *
- * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
- * segments.
- *
- * Return Value: On success, 0 is returned and the number of clean segments is
- * stored in the place pointed by @nsegsp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
-{
-        struct nilfs_sustat sustat;
-        int ret;
-        ret = nilfs_sufile_get_stat(sufile, &sustat);
-        if (ret == 0)
-                *nsegsp = sustat.ss_ncleansegs;
-        return ret;
-}
 void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
                               struct buffer_head *header_bh,
                               struct buffer_head *su_bh)
@@ -577,8 +551,10 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
        nilfs_segment_usage_set_error(su);
        kunmap_atomic(kaddr, KM_USER0);
-        if (suclean)
+        if (suclean) {
                nilfs_sufile_mod_counter(header_bh, -1, 0);
+                NILFS_SUI(sufile)->ncleansegs--;
+        }
        nilfs_mdt_mark_buffer_dirty(su_bh);
        nilfs_mdt_mark_dirty(sufile);
 }
@@ -657,3 +633,48 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
        up_read(&NILFS_MDT(sufile)->mi_sem);
        return ret;
 }
+/**
+ * nilfs_sufile_read - read sufile inode
+ * @sufile: sufile inode
+ * @raw_inode: on-disk sufile inode
+ *
+ * Calls nilfs_read_inode_common() with @raw_inode, then fetches the
+ * header block and copies its sh_ncleansegs value (converted from
+ * little endian) into the in-memory sufile info.
+ *
+ * Return Value: a negative value from nilfs_read_inode_common() if that
+ * call fails; otherwise the result of nilfs_sufile_get_header_block(),
+ * which is 0 when the clean segment count was loaded.
+ */
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+{
+        struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+        struct buffer_head *header_bh;
+        struct nilfs_sufile_header *header;
+        void *kaddr;
+        int ret;
+        ret = nilfs_read_inode_common(sufile, raw_inode);
+        if (ret < 0)
+                return ret;
+        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+        if (!ret) {
+                kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+                header = kaddr + bh_offset(header_bh);
+                sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+                kunmap_atomic(kaddr, KM_USER0);
+                brelse(header_bh);
+        }
+        return ret;
+}
+/**
+ * nilfs_sufile_new - create sufile
+ * @nilfs: nilfs object
+ * @susize: size of a segment usage entry
+ *
+ * Creates the inode through nilfs_mdt_new() for NILFS_SUFILE_INO,
+ * passing sizeof(struct nilfs_sufile_info) as its last argument.  If
+ * that succeeds, the entry size is set to @susize and the header size
+ * to sizeof(struct nilfs_sufile_header).
+ *
+ * Return Value: whatever nilfs_mdt_new() returned; a NULL result is
+ * passed back unchanged and no entry size is set.
+ */
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
+{
+        struct inode *sufile;
+        sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
+                               sizeof(struct nilfs_sufile_info));
+        if (sufile)
+                nilfs_mdt_set_entry_size(sufile, susize,
+                                         sizeof(struct nilfs_sufile_header));
+        return sufile;
+}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 0e99e5c0bd0f..15163b8aff7d 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -34,14 +34,13 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
        return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
 }
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
 int nilfs_sufile_alloc(struct inode *, __u64 *);
-int nilfs_sufile_get_segment_usage(struct inode *, __u64,
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
-                                   struct nilfs_segment_usage **,
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
-                                   struct buffer_head **);
+                                   unsigned long nblocks, time_t modtime);
-void nilfs_sufile_put_segment_usage(struct inode *, __u64,
-                                    struct buffer_head *);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
-int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
 ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
                                size_t);
@@ -62,6 +61,9 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
                               struct buffer_head *);
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
 /**
 * nilfs_sufile_scrap - make a segment garbage
 * @sufile: inode of segment usage file
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 644e66727dd0..5403b3ef3a42 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -363,14 +363,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
        list_add(&sbi->s_list, &nilfs->ns_supers);
        up_write(&nilfs->ns_super_sem);
-        sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
+        sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
        if (!sbi->s_ifile)
                return -ENOMEM;
-        err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
-        if (unlikely(err))
-                goto failed;
        down_read(&nilfs->ns_segctor_sem);
        err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
                                          &bh_cp);
@@ -411,7 +407,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
-        nilfs_mdt_clear(sbi->s_ifile);
        nilfs_mdt_destroy(sbi->s_ifile);
        sbi->s_ifile = NULL;
        down_write(&nilfs->ns_super_sem);
@@ -419,22 +414,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
        up_write(&nilfs->ns_super_sem);
 }
-static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
-{
-        struct the_nilfs *nilfs = sbi->s_nilfs;
-        int err = 0;
-        down_write(&nilfs->ns_sem);
-        if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
-                nilfs->ns_mount_state |= NILFS_VALID_FS;
-                err = nilfs_commit_super(sbi, 1);
-                if (likely(!err))
-                        printk(KERN_INFO "NILFS: recovery complete.\n");
-        }
-        up_write(&nilfs->ns_sem);
-        return err;
-}
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
@@ -490,7 +469,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        if (!nilfs_test_opt(sbi, BARRIER))
-                seq_printf(seq, ",barrier=off");
+                seq_printf(seq, ",nobarrier");
        if (nilfs_test_opt(sbi, SNAPSHOT))
                seq_printf(seq, ",cp=%llu",
                           (unsigned long long int)sbi->s_snapshot_cno);
@@ -500,6 +479,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_printf(seq, ",errors=panic");
        if (nilfs_test_opt(sbi, STRICT_ORDER))
                seq_printf(seq, ",order=strict");
+        if (nilfs_test_opt(sbi, NORECOVERY))
+                seq_printf(seq, ",norecovery");
        return 0;
 }
@@ -568,7 +549,7 @@ static const struct export_operations nilfs_export_ops = {
 enum {
        Opt_err_cont, Opt_err_panic, Opt_err_ro,
-        Opt_barrier, Opt_snapshot, Opt_order,
+        Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
        Opt_err,
 };
@@ -576,25 +557,13 @@ static match_table_t tokens = {
        {Opt_err_cont, "errors=continue"},
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
-        {Opt_barrier, "barrier=%s"},
+        {Opt_nobarrier, "nobarrier"},
        {Opt_snapshot, "cp=%u"},
        {Opt_order, "order=%s"},
+        {Opt_norecovery, "norecovery"},
        {Opt_err, NULL}
 };
-static int match_bool(substring_t *s, int *result)
-{
-        int len = s->to - s->from;
-        if (strncmp(s->from, "on", len) == 0)
-                *result = 1;
-        else if (strncmp(s->from, "off", len) == 0)
-                *result = 0;
-        else
-                return 1;
-        return 0;
-}
 static int parse_options(char *options, struct super_block *sb)
 {
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -612,13 +581,8 @@ static int parse_options(char *options, struct super_block *sb)
                token = match_token(p, tokens, args);
                switch (token) {
-                case Opt_barrier:
+                case Opt_nobarrier:
-                        if (match_bool(&args[0], &option))
+                        nilfs_clear_opt(sbi, BARRIER);
-                                return 0;
-                        if (option)
-                                nilfs_set_opt(sbi, BARRIER);
-                        else
-                                nilfs_clear_opt(sbi, BARRIER);
                        break;
                case Opt_order:
                        if (strcmp(args[0].from, "relaxed") == 0)
@@ -647,6 +611,9 @@ static int parse_options(char *options, struct super_block *sb)
                        sbi->s_snapshot_cno = option;
                        nilfs_set_opt(sbi, SNAPSHOT);
                        break;
+                case Opt_norecovery:
+                        nilfs_set_opt(sbi, NORECOVERY);
+                        break;
                default:
                        printk(KERN_ERR
                               "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -672,9 +639,7 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
        int mnt_count = le16_to_cpu(sbp->s_mnt_count);
        /* nilfs->sem must be locked by the caller. */
-        if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
+        if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
-                printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
-        } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
                printk(KERN_WARNING
                       "NILFS warning: mounting fs with errors\n");
 #if 0
@@ -782,11 +747,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
        sb->s_root = NULL;
        sb->s_time_gran = 1;
-        if (!nilfs_loaded(nilfs)) {
+        err = load_nilfs(nilfs, sbi);
-                err = load_nilfs(nilfs, sbi);
+        if (err)
-                if (err)
+                goto failed_sbi;
-                        goto failed_sbi;
-        }
        cno = nilfs_last_cno(nilfs);
        if (sb->s_flags & MS_RDONLY) {
@@ -854,12 +818,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
                up_write(&nilfs->ns_sem);
        }
-        err = nilfs_mark_recovery_complete(sbi);
-        if (unlikely(err)) {
-                printk(KERN_ERR "NILFS: recovery failed.\n");
-                goto failed_root;
-        }
        down_write(&nilfs->ns_super_sem);
        if (!nilfs_test_opt(sbi, SNAPSHOT))
                nilfs->ns_current = sbi;
@@ -867,10 +825,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
        return 0;
- failed_root:
-        dput(sb->s_root);
-        sb->s_root = NULL;
 failed_segctor:
        nilfs_detach_segment_constructor(sbi);
@@ -915,6 +869,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
                goto restore_opts;
        }
+        if (!nilfs_valid_fs(nilfs)) {
+                printk(KERN_WARNING "NILFS (device %s): couldn't "
+                       "remount because the filesystem is in an "
+                       "incomplete recovery state.\n", sb->s_id);
+                err = -EINVAL;
+                goto restore_opts;
+        }
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                goto out;
        if (*flags & MS_RDONLY) {
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad391a8c3e7e..6241e1722efc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -146,13 +146,9 @@ void put_nilfs(struct the_nilfs *nilfs)
        might_sleep();
        if (nilfs_loaded(nilfs)) {
-                nilfs_mdt_clear(nilfs->ns_sufile);
                nilfs_mdt_destroy(nilfs->ns_sufile);
-                nilfs_mdt_clear(nilfs->ns_cpfile);
                nilfs_mdt_destroy(nilfs->ns_cpfile);
-                nilfs_mdt_clear(nilfs->ns_dat);
                nilfs_mdt_destroy(nilfs->ns_dat);
-                /* XXX: how and when to clear nilfs->ns_gc_dat? */
                nilfs_mdt_destroy(nilfs->ns_gc_dat);
        }
        if (nilfs_init(nilfs)) {
@@ -166,7 +162,6 @@ void put_nilfs(struct the_nilfs *nilfs)
 static int nilfs_load_super_root(struct the_nilfs *nilfs,
                                 struct nilfs_sb_info *sbi, sector_t sr_block)
 {
-        static struct lock_class_key dat_lock_key;
        struct buffer_head *bh_sr;
        struct nilfs_super_root *raw_sr;
        struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -187,51 +182,36 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
        inode_size = nilfs->ns_inode_size;
        err = -ENOMEM;
-        nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
+        nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
        if (unlikely(!nilfs->ns_dat))
                goto failed;
-        nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
+        nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
        if (unlikely(!nilfs->ns_gc_dat))
                goto failed_dat;
-        nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
+        nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
        if (unlikely(!nilfs->ns_cpfile))
                goto failed_gc_dat;
-        nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
+        nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
        if (unlikely(!nilfs->ns_sufile))
                goto failed_cpfile;
-        err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
-        if (unlikely(err))
-                goto failed_sufile;
-        err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
-        if (unlikely(err))
-                goto failed_sufile;
-        lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
-        lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
        nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
-        nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
-                                 sizeof(struct nilfs_cpfile_header));
-        nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
-                                 sizeof(struct nilfs_sufile_header));
-        err = nilfs_mdt_read_inode_direct(
+        err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
-                nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
+                             NILFS_SR_DAT_OFFSET(inode_size));
        if (unlikely(err))
                goto failed_sufile;
-        err = nilfs_mdt_read_inode_direct(
+        err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
-                nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
+                                NILFS_SR_CPFILE_OFFSET(inode_size));
        if (unlikely(err))
                goto failed_sufile;
-        err = nilfs_mdt_read_inode_direct(
+        err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
-                nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
+                                NILFS_SR_SUFILE_OFFSET(inode_size));
        if (unlikely(err))
                goto failed_sufile;
@@ -281,29 +261,30 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        struct nilfs_recovery_info ri;
        unsigned int s_flags = sbi->s_super->s_flags;
        int really_read_only = bdev_read_only(nilfs->ns_bdev);
-        unsigned valid_fs;
+        int valid_fs = nilfs_valid_fs(nilfs);
-        int err = 0;
+        int err;
-        nilfs_init_recovery_info(&ri);
-        down_write(&nilfs->ns_sem);
+        if (nilfs_loaded(nilfs)) {
-        valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
+                if (valid_fs ||
-        up_write(&nilfs->ns_sem);
+                    ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
+                        return 0;
+                printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
+                       "recovery state.\n");
+                return -EINVAL;
+        }
-        if (!valid_fs && (s_flags & MS_RDONLY)) {
+        if (!valid_fs) {
-                printk(KERN_INFO "NILFS: INFO: recovery "
+                printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
-                       "required for readonly filesystem.\n");
+                if (s_flags & MS_RDONLY) {
-                if (really_read_only) {
+                        printk(KERN_INFO "NILFS: INFO: recovery "
-                        printk(KERN_ERR "NILFS: write access "
+                               "required for readonly filesystem.\n");
-                               "unavailable, cannot proceed.\n");
+                        printk(KERN_INFO "NILFS: write access will "
-                        err = -EROFS;
+                               "be enabled during recovery.\n");
-                        goto failed;
                }
-                printk(KERN_INFO "NILFS: write access will "
-                       "be enabled during recovery.\n");
-                sbi->s_super->s_flags &= ~MS_RDONLY;
        }
+        nilfs_init_recovery_info(&ri);
        err = nilfs_search_super_root(nilfs, sbi, &ri);
        if (unlikely(err)) {
                printk(KERN_ERR "NILFS: error searching super root.\n");
@@ -316,19 +297,56 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
                goto failed;
        }
-        if (!valid_fs) {
+        if (valid_fs)
-                err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+                goto skip_recovery;
-                if (unlikely(err)) {
-                        nilfs_mdt_destroy(nilfs->ns_cpfile);
+        if (s_flags & MS_RDONLY) {
-                        nilfs_mdt_destroy(nilfs->ns_sufile);
+                if (nilfs_test_opt(sbi, NORECOVERY)) {
-                        nilfs_mdt_destroy(nilfs->ns_dat);
+                        printk(KERN_INFO "NILFS: norecovery option specified. "
-                        goto failed;
+                               "skipping roll-forward recovery\n");
+                        goto skip_recovery;
                }
-                if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
+                if (really_read_only) {
-                        sbi->s_super->s_dirt = 1;
+                        printk(KERN_ERR "NILFS: write access "
+                               "unavailable, cannot proceed.\n");
+                        err = -EROFS;
+                        goto failed_unload;
+                }
+                sbi->s_super->s_flags &= ~MS_RDONLY;
+        } else if (nilfs_test_opt(sbi, NORECOVERY)) {
+                printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
+                       "option was specified for a read/write mount\n");
+                err = -EINVAL;
+                goto failed_unload;
        }
+        err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+        if (err)
+                goto failed_unload;
+        down_write(&nilfs->ns_sem);
+        nilfs->ns_mount_state |= NILFS_VALID_FS;
+        nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+        err = nilfs_commit_super(sbi, 1);
+        up_write(&nilfs->ns_sem);
+        if (err) {
+                printk(KERN_ERR "NILFS: failed to update super block. "
+                       "recovery unfinished.\n");
+                goto failed_unload;
+        }
+        printk(KERN_INFO "NILFS: recovery complete.\n");
+ skip_recovery:
        set_nilfs_loaded(nilfs);
+        nilfs_clear_recovery_info(&ri);
+        sbi->s_super->s_flags = s_flags;
+        return 0;
+ failed_unload:
+        nilfs_mdt_destroy(nilfs->ns_cpfile);
+        nilfs_mdt_destroy(nilfs->ns_sufile);
+        nilfs_mdt_destroy(nilfs->ns_dat);
 failed:
        nilfs_clear_recovery_info(&ri);
@@ -632,30 +650,23 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
 {
        struct inode *dat = nilfs_dat_inode(nilfs);
        unsigned long ncleansegs;
-        int err;
        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
-        err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
+        ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
-        if (likely(!err))
+        *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
-                *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
+        return 0;
-        return err;
 }
 int nilfs_near_disk_full(struct the_nilfs *nilfs)
 {
-        struct inode *sufile = nilfs->ns_sufile;
        unsigned long ncleansegs, nincsegs;
-        int ret;
-        ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
+        ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
-        if (likely(!ret)) {
+        nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
-                nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
+                nilfs->ns_blocks_per_segment + 1;
-                        nilfs->ns_blocks_per_segment + 1;
-                if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
+        return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
-                        ret++;
-        }
-        return ret;
 }
 /**
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 20abd55881e0..589786e33464 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -258,6 +258,16 @@ static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
                kfree(sbi);
 }
+static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
+{
+        unsigned valid_fs;
+        down_read(&nilfs->ns_sem);
+        valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
+        up_read(&nilfs->ns_sem);
+        return valid_fs;
+}
 static inline void
 nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
                        sector_t *seg_start, sector_t *seg_end)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dcd2040d330c..5ef5f365a5c8 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -69,36 +69,30 @@ static int zero;
 ctl_table inotify_table[] = {
        {
-                .ctl_name       = INOTIFY_MAX_USER_INSTANCES,
                .procname       = "max_user_instances",
                .data           = &inotify_max_user_instances,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
        {
-                .ctl_name       = INOTIFY_MAX_USER_WATCHES,
                .procname       = "max_user_watches",
                .data           = &inotify_max_user_watches,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
        {
-                .ctl_name       = INOTIFY_MAX_QUEUED_EVENTS,
                .procname       = "max_queued_events",
                .data           = &inotify_max_queued_events,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero
        },
-        { .ctl_name = 0 }
+        { }
 };
 #endif /* CONFIG_SYSCTL */
@@ -747,10 +741,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
        /* create/update an inode mark */
        ret = inotify_update_watch(group, inode, mask);
-        if (unlikely(ret))
-                goto path_put_and_out;
-path_put_and_out:
        path_put(&path);
 fput_and_out:
        fput_light(filp, fput_needed);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 9669541d0119..08f7530e9341 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -927,7 +927,7 @@ lock_retry_remap:
                return 0;
        ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
-                        "EOVERFLOW" : (!err ? "EIO" : "unkown error"));
+                        "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
        return err < 0 ? err : -EIO;
 read_err:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 663c0e341f8b..43179ddd336f 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -399,7 +399,7 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
 * @cached_page: allocated but as yet unused page
 * @lru_pvec:   lru-buffering pagevec of caller
 *
- * Obtain @nr_pages locked page cache pages from the mapping @maping and
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping and
 * starting at index @index.
 *
 * If a page is newly created, increment its refcount and add it to the
@@ -1281,7 +1281,7 @@ rl_not_mapped_enoent:
 /*
 * Copy as much as we can into the pages and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the pages
+ * were successfully copied.  If a fault is encountered then clear the pages
 * out to (ofs + bytes) and return the number of bytes which were copied.
 */
 static inline size_t ntfs_copy_from_user(struct page **pages,
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 89b02985c054..4dadcdf3d451 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -338,7 +338,7 @@ err_out:
 * copy of the complete multi sector transfer deprotected page.  On failure,
 * *@wrp is undefined.
 *
- * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current
+ * Similarly, if @lsn is not NULL, on success *@lsn will be set to the current
 * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
 *
 * The following error codes are defined:
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 9ef85e628fe1..79a89184cb5e 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -36,12 +36,11 @@
 /* Definition of the ntfs sysctl. */
 static ctl_table ntfs_sysctls[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,       /* Binary and text IDs. */
                .procname       = "ntfs-debug",
                .data           = &debug_msgs,          /* Data pointer and size. */
                .maxlen         = sizeof(debug_msgs),
                .mode           = 0644,                 /* Mode, proc handler. */
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec
        },
        {}
 };
@@ -49,7 +48,6 @@ static ctl_table ntfs_sysctls[] = {
 /* Define the parent directory /proc/sys/fs. */
 static ctl_table sysctls_root[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = ntfs_sysctls
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..7c7198a5bc90 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2398,7 +2398,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
 *
 * The array is assumed to be large enough to hold an entire path (tree depth).
 *
- * Upon succesful return from this function:
+ * Upon successful return from this function:
 *
 * - The 'right_path' array will contain a path to the leaf block
 *   whose range contains e_cpos.
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index a1163b8b417c..b7428c5d0d3b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -47,7 +47,7 @@
 * Calculate the bit offset in the hamming code buffer based on the bit's
 * offset in the data buffer.  Since the hamming code reserves all
 * power-of-two bits for parity, the data bit number and the code bit
- * number are offest by all the parity bits beforehand.
+ * number are offset by all the parity bits beforehand.
 *
 * Recall that bit numbers in hamming code are 1-based.  This function
 * takes the 0-based data bit from the caller.
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index da794bc07a6c..a3f150e52b02 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -294,10 +294,10 @@ static int sc_seq_show(struct seq_file *seq, void *v)
                if (sc->sc_sock) {
                        inet = inet_sk(sc->sc_sock->sk);
                        /* the stack's structs aren't sparse endian clean */
-                        saddr = (__force __be32)inet->saddr;
+                        saddr = (__force __be32)inet->inet_saddr;
-                        daddr = (__force __be32)inet->daddr;
+                        daddr = (__force __be32)inet->inet_daddr;
-                        sport = (__force __be16)inet->sport;
+                        sport = (__force __be16)inet->inet_sport;
-                        dport = (__force __be16)inet->dport;
+                        dport = (__force __be16)inet->inet_dport;
                }
                /* XXX sigh, inet-> doesn't have sparse annotation so any
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..03ccf9a7b1f4 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2586,7 +2586,7 @@ fail:
         * is complete everywhere.  if the target dies while this is
         * going on, some nodes could potentially see the target as the
         * master, so it is important that my recovery finds the migration
-         * mle and sets the master to UNKNONWN. */
+         * mle and sets the master to UNKNOWN. */
        /* wait for new node to assert master */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194cb..c5e4a49e3a12 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1855,7 +1855,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
                 * outstanding lock request, so a cancel convert is
                 * required. We intentionally overwrite 'ret' - if the
                 * cancel fails and the lock was granted, it's easier
-                 * to just bubble sucess back up to the user.
+                 * to just bubble success back up to the user.
                 */
                ret = ocfs2_flock_handle_signal(lockres, level);
        } else if (!ret && (level > lockres->l_level)) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327e..bf34c491ae96 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
        default:
                status = -EINVAL;
-                mlog(ML_ERROR, "Uknown access type!\n");
+                mlog(ML_ERROR, "Unknown access type!\n");
        }
        if (!status && ocfs2_meta_ecc(osb) && triggers)
                jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3a0df7a1b810..30967e3f5e43 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2431,7 +2431,7 @@ out:
 * we gonna touch and whether we need to create new blocks.
 *
 * Normally the refcount blocks store these refcount should be
- * continguous also, so that we can get the number easily.
+ * contiguous also, so that we can get the number easily.
 * As for meta_ac, we will at most add split 2 refcount record and
 * 2 more refcount block, so just check it in a rough way.
 *
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 3f2f1c45b7b6..f3df0baa9a48 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -620,51 +620,46 @@ error:
 static ctl_table ocfs2_nm_table[] = {
        {
-                .ctl_name       = 1,
                .procname       = "hb_ctl_path",
                .data           = ocfs2_hb_ctl_path,
                .maxlen         = OCFS2_MAX_HB_CTL_PATH,
                .mode           = 0644,
-                .proc_handler   = &proc_dostring,
+                .proc_handler   = proc_dostring,
-                .strategy       = &sysctl_string,
        },
-        { .ctl_name = 0 }
+        { }
 };
 static ctl_table ocfs2_mod_table[] = {
        {
-                .ctl_name       = FS_OCFS2_NM,
                .procname       = "nm",
                .data           = NULL,
                .maxlen         = 0,
                .mode           = 0555,
                .child          = ocfs2_nm_table
        },
-        { .ctl_name = 0}
+        { }
 };
 static ctl_table ocfs2_kern_table[] = {
        {
-                .ctl_name       = FS_OCFS2,
                .procname       = "ocfs2",
                .data           = NULL,
                .maxlen         = 0,
                .mode           = 0555,
                .child          = ocfs2_mod_table
        },
-        { .ctl_name = 0}
+        { }
 };
 static ctl_table ocfs2_root_table[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .data           = NULL,
                .maxlen         = 0,
                .mode           = 0555,
                .child          = ocfs2_kern_table
        },
-        { .ctl_name = 0 }
+        { }
 };
 static struct ctl_table_header *ocfs2_table_header = NULL;
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index e1c0ec0ae989..082234581d05 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -85,7 +85,7 @@ out:
 }
 /*
- * Tries to allocate exactly one block.  Returns true if sucessful.
+ * Tries to allocate exactly one block.  Returns true if successful.
 */
 int omfs_allocate_block(struct super_block *sb, u64 block)
 {
diff --git a/fs/open.c b/fs/open.c
index 4f01e06227c6..b4b31d277f3a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
        error = -EPERM;
        if (!capable(CAP_SYS_CHROOT))
                goto dput_and_out;
+        error = security_path_chroot(&path);
+        if (error)
+                goto dput_and_out;
        set_fs_root(current->fs, &path);
        error = 0;
@@ -617,11 +620,15 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
        if (err)
                goto out_putf;
        mutex_lock(&inode->i_mutex);
+        err = security_path_chmod(dentry, file->f_vfsmnt, mode);
+        if (err)
+                goto out_unlock;
        if (mode == (mode_t) -1)
                mode = inode->i_mode;
        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
        err = notify_change(dentry, &newattrs);
+out_unlock:
        mutex_unlock(&inode->i_mutex);
        mnt_drop_write(file->f_path.mnt);
 out_putf:
@@ -646,11 +653,15 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
        if (error)
                goto dput_and_out;
        mutex_lock(&inode->i_mutex);
+        error = security_path_chmod(path.dentry, path.mnt, mode);
+        if (error)
+                goto out_unlock;
        if (mode == (mode_t) -1)
                mode = inode->i_mode;
        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
        error = notify_change(path.dentry, &newattrs);
+out_unlock:
        mutex_unlock(&inode->i_mutex);
        mnt_drop_write(path.mnt);
 dput_and_out:
@@ -664,9 +675,9 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
        return sys_fchmodat(AT_FDCWD, filename, mode);
 }
-static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
+static int chown_common(struct path *path, uid_t user, gid_t group)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = path->dentry->d_inode;
        int error;
        struct iattr newattrs;
@@ -683,7 +694,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
                newattrs.ia_valid |=
                        ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
        mutex_lock(&inode->i_mutex);
-        error = notify_change(dentry, &newattrs);
+        error = security_path_chown(path, user, group);
+        if (!error)
+                error = notify_change(path->dentry, &newattrs);
        mutex_unlock(&inode->i_mutex);
        return error;
@@ -700,7 +713,7 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
        error = mnt_want_write(path.mnt);
        if (error)
                goto out_release;
-        error = chown_common(path.dentry, user, group);
+        error = chown_common(&path, user, group);
        mnt_drop_write(path.mnt);
 out_release:
        path_put(&path);
@@ -725,7 +738,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
        error = mnt_want_write(path.mnt);
        if (error)
                goto out_release;
-        error = chown_common(path.dentry, user, group);
+        error = chown_common(&path, user, group);
        mnt_drop_write(path.mnt);
 out_release:
        path_put(&path);
@@ -744,7 +757,7 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
        error = mnt_want_write(path.mnt);
        if (error)
                goto out_release;
-        error = chown_common(path.dentry, user, group);
+        error = chown_common(&path, user, group);
        mnt_drop_write(path.mnt);
 out_release:
        path_put(&path);
@@ -767,7 +780,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
                goto out_fput;
        dentry = file->f_path.dentry;
        audit_inode(NULL, dentry);
-        error = chown_common(dentry, user, group);
+        error = chown_common(&file->f_path, user, group);
        mnt_drop_write(file->f_path.mnt);
 out_fput:
        fput(file);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..64bc8998ac9a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
        return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
 }
+ssize_t part_discard_alignment_show(struct device *dev,
+                                   struct device_attribute *attr, char *buf)
+{
+        struct hd_struct *p = dev_to_part(dev);
+        return sprintf(buf, "%u\n", p->discard_alignment);
+}
 ssize_t part_stat_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
 {
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+                   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
        &dev_attr_start.attr,
        &dev_attr_size.attr,
        &dev_attr_alignment_offset.attr,
+        &dev_attr_discard_alignment.attr,
        &dev_attr_stat.attr,
        &dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        p->start_sect = start;
        p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
+        p->discard_alignment = queue_sector_discard_alignment(disk->queue,
+                                                              start);
        p->nr_sects = len;
        p->partno = partno;
        p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 038a6022152f..49cfd5f54238 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -1,7 +1,9 @@
 /************************************************************
 * EFI GUID Partition Table handling
- * Per Intel EFI Specification v1.02
+ *
- * http://developer.intel.com/technology/efi/efi.htm
+ * http://www.uefi.org/specs/
+ * http://www.intel.com/technology/efi/
+ *
 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
 *   Copyright 2000,2001,2002,2004 Dell Inc.
 *
@@ -92,6 +94,7 @@
 *
 ************************************************************/
 #include <linux/crc32.h>
+#include <linux/math64.h>
 #include "check.h"
 #include "efi.h"
@@ -141,7 +144,8 @@ last_lba(struct block_device *bdev)
 {
        if (!bdev || !bdev->bd_inode)
                return 0;
-        return (bdev->bd_inode->i_size >> 9) - 1ULL;
+        return div_u64(bdev->bd_inode->i_size,
+                       bdev_logical_block_size(bdev)) - 1ULL;
 }
 static inline int
@@ -188,6 +192,7 @@ static size_t
 read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 {
        size_t totalreadcount = 0;
+        sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
        if (!bdev || !buffer || lba > last_lba(bdev))
                return 0;
@@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
        while (count) {
                int copied = 512;
                Sector sect;
-                unsigned char *data = read_dev_sector(bdev, lba++, &sect);
+                unsigned char *data = read_dev_sector(bdev, n++, &sect);
                if (!data)
                        break;
                if (copied > count)
@@ -257,15 +262,16 @@ static gpt_header *
 alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 {
        gpt_header *gpt;
+        unsigned ssz = bdev_logical_block_size(bdev);
        if (!bdev)
                return NULL;
-        gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
+        gpt = kzalloc(ssz, GFP_KERNEL);
        if (!gpt)
                return NULL;
-        if (read_lba(bdev, lba, (u8 *) gpt,
+        if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
-                     sizeof (gpt_header)) < sizeof (gpt_header)) {
                kfree(gpt);
                gpt=NULL;
                return NULL;
@@ -601,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
        gpt_header *gpt = NULL;
        gpt_entry *ptes = NULL;
        u32 i;
+        unsigned ssz = bdev_logical_block_size(bdev) / 512;
        if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
                kfree(gpt);
@@ -611,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
        pr_debug("GUID Partition Table is valid!  Yea!\n");
        for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+                u64 start = le64_to_cpu(ptes[i].starting_lba);
+                u64 size = le64_to_cpu(ptes[i].ending_lba) -
+                           le64_to_cpu(ptes[i].starting_lba) + 1ULL;
                if (!is_pte_valid(&ptes[i], last_lba(bdev)))
                        continue;
-                put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba),
+                put_partition(state, i+1, start * ssz, size * ssz);
-                                 (le64_to_cpu(ptes[i].ending_lba) -
-                                  le64_to_cpu(ptes[i].starting_lba) +
-                                  1ULL));
                /* If this is a RAID volume, tell md */
                if (!efi_guidcmp(ptes[i].partition_type_guid,
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 2cc89d0475bf..6998b589abf9 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -37,7 +37,6 @@
 #define EFI_PMBR_OSTYPE_EFI 0xEF
 #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
-#define GPT_BLOCK_SIZE 512
 #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
 #define GPT_HEADER_REVISION_V1 0x00010000
 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@@ -79,7 +78,12 @@ typedef struct _gpt_header {
        __le32 num_partition_entries;
        __le32 sizeof_partition_entry;
        __le32 partition_entry_array_crc32;
-        u8 reserved2[GPT_BLOCK_SIZE - 92];
+        /* The rest of the logical block is reserved by UEFI and must be zero.
+         * EFI standard handles this by:
+         *
+         * uint8_t              reserved2[ BlockSize - 92 ];
+         */
 } __attribute__ ((packed)) gpt_header;
 typedef struct _gpt_entry_attributes {
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 822c2d506518..4badde179b18 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -410,6 +410,16 @@ static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
 }
 #endif          /* CONFIG_MMU */
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+        seq_printf(m, "Cpus_allowed:\t");
+        seq_cpumask(m, &task->cpus_allowed);
+        seq_printf(m, "\n");
+        seq_printf(m, "Cpus_allowed_list:\t");
+        seq_cpumask_list(m, &task->cpus_allowed);
+        seq_printf(m, "\n");
+}
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
 {
@@ -424,6 +434,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
        }
        task_sig(m, task);
        task_cap(m, task);
+        task_cpus_allowed(m, task);
        cpuset_task_status_allowed(m, task);
 #if defined(CONFIG_S390)
        task_show_regs(m, task);
@@ -495,20 +506,17 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                /* add up live thread stats at the group level */
                if (whole) {
-                        struct task_cputime cputime;
                        struct task_struct *t = task;
                        do {
                                min_flt += t->min_flt;
                                maj_flt += t->maj_flt;
-                                gtime = cputime_add(gtime, task_gtime(t));
+                                gtime = cputime_add(gtime, t->gtime);
                                t = next_thread(t);
                        } while (t != task);
                        min_flt += sig->min_flt;
                        maj_flt += sig->maj_flt;
-                        thread_group_cputime(task, &cputime);
+                        thread_group_times(task, &utime, &stime);
-                        utime = cputime.utime;
-                        stime = cputime.stime;
                        gtime = cputime_add(gtime, sig->gtime);
                }
@@ -524,9 +532,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
        if (!whole) {
                min_flt = task->min_flt;
                maj_flt = task->maj_flt;
-                utime = task_utime(task);
+                task_times(task, &utime, &stime);
-                stime = task_stime(task);
+                gtime = task->gtime;
-                gtime = task_gtime(task);
        }
        /* scale priority and nice values from timeslices to -20..20 */
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8aeabdf..6ff9981f0a18 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -48,7 +48,7 @@ out:
 static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
 {
        int len;
-        for ( ; p->ctl_name || p->procname; p++) {
+        for ( ; p->procname; p++) {
                if (!p->procname)
                        continue;
@@ -218,7 +218,7 @@ static int scan(struct ctl_table_header *head, ctl_table *table,
                void *dirent, filldir_t filldir)
 {
-        for (; table->ctl_name || table->procname; table++, (*pos)++) {
+        for (; table->procname; table++, (*pos)++) {
                int res;
                /* Can't do anything without a proc name */
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..b9b7aad2003d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
        int i, j;
        unsigned long jif;
        cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-        cputime64_t guest;
+        cputime64_t guest, guest_nice;
        u64 sum = 0;
        u64 sum_softirq = 0;
        unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
        user = nice = system = idle = iowait =
                irq = softirq = steal = cputime64_zero;
-        guest = cputime64_zero;
+        guest = guest_nice = cputime64_zero;
        getboottime(&boottime);
        jif = boottime.tv_sec;
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
                softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
                steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
                guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+                guest_nice = cputime64_add(guest_nice,
+                        kstat_cpu(i).cpustat.guest_nice);
                for_each_irq_nr(j) {
                        sum += kstat_irqs_cpu(j, i);
                }
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
        }
        sum += arch_irq_stat();
-        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+                "%llu\n",
                (unsigned long long)cputime64_to_clock_t(user),
                (unsigned long long)cputime64_to_clock_t(nice),
                (unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
                (unsigned long long)cputime64_to_clock_t(irq),
                (unsigned long long)cputime64_to_clock_t(softirq),
                (unsigned long long)cputime64_to_clock_t(steal),
-                (unsigned long long)cputime64_to_clock_t(guest));
+                (unsigned long long)cputime64_to_clock_t(guest),
+                (unsigned long long)cputime64_to_clock_t(guest_nice));
        for_each_online_cpu(i) {
                /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
                softirq = kstat_cpu(i).cpustat.softirq;
                steal = kstat_cpu(i).cpustat.steal;
                guest = kstat_cpu(i).cpustat.guest;
+                guest_nice = kstat_cpu(i).cpustat.guest_nice;
                seq_printf(p,
-                        "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+                        "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+                        "%llu\n",
                        i,
                        (unsigned long long)cputime64_to_clock_t(user),
                        (unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
                        (unsigned long long)cputime64_to_clock_t(irq),
                        (unsigned long long)cputime64_to_clock_t(softirq),
                        (unsigned long long)cputime64_to_clock_t(steal),
-                        (unsigned long long)cputime64_to_clock_t(guest));
+                        (unsigned long long)cputime64_to_clock_t(guest),
+                        (unsigned long long)cputime64_to_clock_t(guest_nice));
        }
        seq_printf(p, "intr %llu", (unsigned long long)sum);
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 0afba069d567..32f5d131a644 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -67,7 +67,7 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
        while (total < size) {
                if ((bh = sb_bread(sb, start + offset)) == NULL) {
-                        printk("qnx4: I/O error in counting free blocks\n");
+                        printk(KERN_ERR "qnx4: I/O error in counting free blocks\n");
                        break;
                }
                count_bits(bh->b_data, size - total, &total_free);
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 86cc39cb1398..6f30c3d5bcbf 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -26,8 +26,8 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
        int ix, ino;
        int size;
-        QNX4DEBUG(("qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
+        QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
-        QNX4DEBUG(("filp->f_pos         = %ld\n", (long) filp->f_pos));
+        QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
        lock_kernel();
@@ -50,7 +50,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
                                        size = QNX4_NAME_MAX;
                                if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
-                                        QNX4DEBUG(("qnx4_readdir:%.*s\n", size, de->di_fname));
+                                        QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
                                        if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
                                                ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
                                        else {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d2cd1798d8c4..449f5a66dd34 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -107,7 +107,7 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
 {
        unsigned long phys;
-        QNX4DEBUG(("qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
+        QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
        phys = qnx4_block_map( inode, iblock );
        if ( phys ) {
@@ -142,12 +142,12 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
                                // read next xtnt block.
                                bh = sb_bread(inode->i_sb, i_xblk - 1);
                                if ( !bh ) {
-                                        QNX4DEBUG(("qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
+                                        QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
                                        return -EIO;
                                }
                                xblk = (struct qnx4_xblk*)bh->b_data;
                                if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) {
-                                        QNX4DEBUG(("qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
+                                        QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
                                        return -EIO;
                                }
                        }
@@ -168,7 +168,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
                        brelse( bh );
        }
-        QNX4DEBUG(("qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
+        QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
        return block;
 }
@@ -209,7 +209,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
        if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
                return "no qnx4 filesystem (no root dir).";
        } else {
-                QNX4DEBUG(("QNX4 filesystem found on dev %s.\n", sb->s_id));
+                QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
                rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
                rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
                for (j = 0; j < rl; j++) {
@@ -220,7 +220,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
                        for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
                                rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
                                if (rootdir->di_fname != NULL) {
-                                        QNX4DEBUG(("Rootdir entry found : [%s]\n", rootdir->di_fname));
+                                        QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
                                        if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
                                                found = 1;
                                                qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
@@ -265,12 +265,12 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
           if we don't belong here... */
        bh = sb_bread(s, 1);
        if (!bh) {
-                printk("qnx4: unable to read the superblock\n");
+                printk(KERN_ERR "qnx4: unable to read the superblock\n");
                goto outnobh;
        }
        if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
                if (!silent)
-                        printk("qnx4: wrong fsid in superblock.\n");
+                        printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
                goto out;
        }
        s->s_op = &qnx4_sops;
@@ -284,14 +284,14 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
        errmsg = qnx4_checkroot(s);
        if (errmsg != NULL) {
                if (!silent)
-                        printk("qnx4: %s\n", errmsg);
+                        printk(KERN_ERR "qnx4: %s\n", errmsg);
                goto out;
        }
        /* does root not have inode number QNX4_ROOT_INO ?? */
        root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
        if (IS_ERR(root)) {
-                printk("qnx4: get inode failed\n");
+                printk(KERN_ERR "qnx4: get inode failed\n");
                ret = PTR_ERR(root);
                goto out;
        }
@@ -374,7 +374,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
        qnx4_inode = qnx4_raw_inode(inode);
        inode->i_mode = 0;
-        QNX4DEBUG(("Reading inode : [%d]\n", ino));
+        QNX4DEBUG((KERN_INFO "reading inode : [%lu]\n", ino)); /* ino is unsigned long */
        if (!ino) {
                printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is "
                                "out of range\n",
@@ -385,7 +385,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
        block = ino / QNX4_INODES_PER_BLOCK;
        if (!(bh = sb_bread(sb, block))) {
-                printk("qnx4: major problem: unable to read inode from dev "
+                printk(KERN_ERR "qnx4: major problem: unable to read inode from dev "
                       "%s\n", sb->s_id);
                iget_failed(inode);
                return ERR_PTR(-EIO);
@@ -499,7 +499,7 @@ static int __init init_qnx4_fs(void)
                return err;
        }
-        printk("QNX4 filesystem 0.2.3 registered.\n");
+        printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n");
        return 0;
 }
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index ae1e7edbacd6..58703ebba879 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -30,7 +30,7 @@ static int qnx4_match(int len, const char *name,
        int namelen, thislen;
        if (bh == NULL) {
-                printk("qnx4: matching unassigned buffer !\n");
+                printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
                return 0;
        }
        de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
@@ -66,7 +66,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
        *res_dir = NULL;
        if (!dir->i_sb) {
-                printk("qnx4: no superblock on dir.\n");
+                printk(KERN_WARNING "qnx4: no superblock on dir.\n");
                return NULL;
        }
        bh = NULL;
@@ -124,7 +124,7 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
        foundinode = qnx4_iget(dir->i_sb, ino);
        if (IS_ERR(foundinode)) {
                unlock_kernel();
-                QNX4DEBUG(("qnx4: lookup->iget -> error %ld\n",
+                QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
                           PTR_ERR(foundinode)));
                return ERR_CAST(foundinode);
        }
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46b..353e78a9ebee 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
 config QUOTA_NETLINK_INTERFACE
        bool "Report quota messages through netlink interface"
-        depends on QUOTA && NET
+        depends on QUOTACTL && NET
        help
          If you say Y here, quota warnings (about exceeding softlimit, reaching
          hardlimit, etc.) will be reported through netlink interface. If unsure,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7ed..eb5a755718f6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h> /* for inode_lock, oddly enough.. */
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-#include <net/netlink.h>
-#include <net/genetlink.h>
-#endif
 #include <asm/uaccess.h>
@@ -1071,73 +1067,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
 }
 #endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
-        .id = GENL_ID_GENERATE,
-        .hdrsize = 0,
-        .name = "VFS_DQUOT",
-        .version = 1,
-        .maxattr = QUOTA_NL_A_MAX,
-};
-/* Send warning to userspace about user which exceeded quota */
-static void send_warning(const struct dquot *dquot, const char warntype)
-{
-        static atomic_t seq;
-        struct sk_buff *skb;
-        void *msg_head;
-        int ret;
-        int msg_size = 4 * nla_total_size(sizeof(u32)) +
-                       2 * nla_total_size(sizeof(u64));
-        /* We have to allocate using GFP_NOFS as we are called from a
-         * filesystem performing write and thus further recursion into
-         * the fs to free some data could cause deadlocks. */
-        skb = genlmsg_new(msg_size, GFP_NOFS);
-        if (!skb) {
-                printk(KERN_ERR
-                  "VFS: Not enough memory to send quota warning.\n");
-                return;
-        }
-        msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
-                        &quota_genl_family, 0, QUOTA_NL_C_WARNING);
-        if (!msg_head) {
-                printk(KERN_ERR
-                  "VFS: Cannot store netlink header in quota warning.\n");
-                goto err_out;
-        }
-        ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
-                MAJOR(dquot->dq_sb->s_dev));
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
-                MINOR(dquot->dq_sb->s_dev));
-        if (ret)
-                goto attr_err_out;
-        ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
-        if (ret)
-                goto attr_err_out;
-        genlmsg_end(skb, msg_head);
-        genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
-        return;
-attr_err_out:
-        printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
-        kfree_skb(skb);
-}
-#endif
 /*
 * Write warnings to the console and send warning messages over netlink.
 *
@@ -1145,18 +1074,20 @@ err_out:
 */
 static void flush_warnings(struct dquot *const *dquots, char *warntype)
 {
+        struct dquot *dq;
        int i;
-        for (i = 0; i < MAXQUOTAS; i++)
+        for (i = 0; i < MAXQUOTAS; i++) {
-                if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
+                dq = dquots[i];
-                    !warning_issued(dquots[i], warntype[i])) {
+                if (dq && warntype[i] != QUOTA_NL_NOWARN &&
+                    !warning_issued(dq, warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
-                        print_warning(dquots[i], warntype[i]);
+                        print_warning(dq, warntype[i]);
-#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-                        send_warning(dquots[i], warntype[i]);
 #endif
+                        quota_send_warning(dq->dq_type, dq->dq_id,
+                                           dq->dq_sb->s_dev, warntype[i]);
                }
+        }
 }
 static int ignore_hardlimit(struct dquot *dquot)
@@ -2473,100 +2404,89 @@ const struct quotactl_ops vfs_quotactl_ops = {
 static ctl_table fs_dqstats_table[] = {
        {
-                .ctl_name       = FS_DQ_LOOKUPS,
                .procname       = "lookups",
                .data           = &dqstats.lookups,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_DROPS,
                .procname       = "drops",
                .data           = &dqstats.drops,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_READS,
                .procname       = "reads",
                .data           = &dqstats.reads,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_WRITES,
                .procname       = "writes",
                .data           = &dqstats.writes,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_CACHE_HITS,
                .procname       = "cache_hits",
                .data           = &dqstats.cache_hits,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_ALLOCATED,
                .procname       = "allocated_dquots",
                .data           = &dqstats.allocated_dquots,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_FREE,
                .procname       = "free_dquots",
                .data           = &dqstats.free_dquots,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_DQ_SYNCS,
                .procname       = "syncs",
                .data           = &dqstats.syncs,
                .maxlen         = sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #ifdef CONFIG_PRINT_QUOTA_WARNING
        {
-                .ctl_name       = FS_DQ_WARNINGS,
                .procname       = "warnings",
                .data           = &flag_print_warnings,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
-        { .ctl_name = 0 },
+        { },
 };
 static ctl_table fs_table[] = {
        {
-                .ctl_name       = FS_DQSTATS,
                .procname       = "quota",
                .mode           = 0555,
                .child          = fs_dqstats_table,
        },
-        { .ctl_name = 0 },
+        { },
 };
 static ctl_table sys_table[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = fs_table,
        },
-        { .ctl_name = 0 },
+        { },
 };
 static int __init dquot_init(void)
@@ -2607,12 +2527,6 @@ static int __init dquot_init(void)
        register_shrinker(&dqcache_shrinker);
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-        if (genl_register_family(&quota_genl_family) != 0)
-                printk(KERN_ERR
-                       "VFS: Failed to create quota netlink interface.\n");
-#endif
        return 0;
 }
 module_init(dquot_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b2..ee91e2756950 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/types.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
 /* Check validity of generic quotactl commands */
 static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
@@ -525,3 +527,94 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
        return ret;
 }
 #endif
+#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
+/* Generic netlink family ("VFS_DQUOT") over which quota warnings are multicast to userspace */
+static struct genl_family quota_genl_family = {
+        .id = GENL_ID_GENERATE,
+        .hdrsize = 0,
+        .name = "VFS_DQUOT",
+        .version = 1,
+        .maxattr = QUOTA_NL_A_MAX,
+};
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ * On allocation or composition failure the error is logged and nothing is sent.
+ */
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+                        const char warntype)
+{
+        static atomic_t seq; /* incremented once per message and passed to genlmsg_put() */
+        struct sk_buff *skb;
+        void *msg_head;
+        int ret;
+        int msg_size = 4 * nla_total_size(sizeof(u32)) +
+                       2 * nla_total_size(sizeof(u64)); /* four u32 + two u64 attributes, matching the nla_put calls below */
+        /* We have to allocate using GFP_NOFS as we are called from a
+         * filesystem performing write and thus further recursion into
+         * the fs to free some data could cause deadlocks. */
+        skb = genlmsg_new(msg_size, GFP_NOFS);
+        if (!skb) {
+                printk(KERN_ERR
+                  "VFS: Not enough memory to send quota warning.\n");
+                return;
+        }
+        msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+                        &quota_genl_family, 0, QUOTA_NL_C_WARNING);
+        if (!msg_head) {
+                printk(KERN_ERR
+                  "VFS: Cannot store netlink header in quota warning.\n");
+                goto err_out;
+        }
+        ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+        if (ret)
+                goto attr_err_out;
+        ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+        if (ret)
+                goto attr_err_out;
+        genlmsg_end(skb, msg_head);
+        genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+        return;
+attr_err_out:
+        printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+        kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+/* Register the quota generic netlink family at init; a failure is logged but does not fail init. */
+static int __init quota_init(void)
+{
+        if (genl_register_family(&quota_genl_family) != 0)
+                printk(KERN_ERR
+                       "VFS: Failed to create quota netlink interface.\n");
+        return 0;
+}
+module_init(quota_init);
+#endif
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac28987f22a..b7f4a1f94d48 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
        if (!(out_file->f_mode & FMODE_WRITE))
                goto fput_out;
        retval = -EINVAL;
-        if (!out_file->f_op || !out_file->f_op->sendpage)
-                goto fput_out;
        in_inode = in_file->f_path.dentry->d_inode;
        out_inode = out_file->f_path.dentry->d_inode;
        retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 7c5ab6330dd6..6a9e30c041dd 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
                 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
                 hashes.o tail_conversion.o journal.o resize.o \
-                 item_ops.o ioctl.o procfs.o xattr.o
+                 item_ops.o ioctl.o procfs.o xattr.o lock.o
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
 reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index e716161ab325..685495707181 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
        else if (bitmap == 0)
                block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
+        reiserfs_write_unlock(sb);
        bh = sb_bread(sb, block);
+        reiserfs_write_lock(sb);
        if (bh == NULL)
                reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
                                 "reading failed", __func__, block);
        else {
                if (buffer_locked(bh)) {
                        PROC_INFO_INC(sb, scan_bitmap.wait);
+                        reiserfs_write_unlock(sb);
                        __wait_on_buffer(bh);
+                        reiserfs_write_lock(sb);
                }
                BUG_ON(!buffer_uptodate(bh));
                BUG_ON(atomic_read(&bh->b_count) == 0);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc384..c094f58c7448 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -20,7 +20,7 @@ const struct file_operations reiserfs_dir_operations = {
        .read = generic_read_dir,
        .readdir = reiserfs_readdir,
        .fsync = reiserfs_dir_fsync,
-        .ioctl = reiserfs_ioctl,
+        .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
@@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
                                // user space buffer is swapped out. At that time
                                // entry can move to somewhere else
                                memcpy(local_buf, d_name, d_reclen);
+                                /*
+                                 * Since filldir might sleep, we can release
+                                 * the write lock here for other waiters
+                                 */
+                                reiserfs_write_unlock(inode->i_sb);
                                if (filldir
                                    (dirent, local_buf, d_reclen, d_off, d_ino,
                                     DT_UNKNOWN) < 0) {
+                                        reiserfs_write_lock(inode->i_sb);
                                        if (local_buf != small_buf) {
                                                kfree(local_buf);
                                        }
                                        goto end;
                                }
+                                reiserfs_write_lock(inode->i_sb);
                                if (local_buf != small_buf) {
                                        kfree(local_buf);
                                }
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 128d3f7c8aa5..60c080440661 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -21,14 +21,6 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
-#ifdef CONFIG_REISERFS_CHECK
-struct tree_balance *cur_tb = NULL;     /* detects whether more than one
-                                           copy of tb exists as a means
-                                           of checking whether schedule
-                                           is interrupting do_balance */
-#endif
 static inline void buffer_info_init_left(struct tree_balance *tb,
                                         struct buffer_info *bi)
 {
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
 {
        int retval = 0;
-        if (cur_tb) {
+        if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
                               "occurred based on cur_tb not being null at "
                               "this point in code. do_balance cannot properly "
-                               "handle schedule occurring while it runs.");
+                               "handle concurrent tree accesses on a same "
+                               "mount point.");
        }
        /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
             "check");*/
        RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
 #ifdef CONFIG_REISERFS_CHECK
-        cur_tb = tb;
+        REISERFS_SB(tb->tb_sb)->cur_tb = tb;
 #endif
 }
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
 #ifdef CONFIG_REISERFS_CHECK
        check_leaf_level(tb);
        check_internal_levels(tb);
-        cur_tb = NULL;
+        REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
 #endif
        /* reiserfs_free_block is no longer schedule safe.  So, we need to
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9f436668b7f8..da2dba082e2d 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -284,7 +284,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 const struct file_operations reiserfs_file_operations = {
        .read = do_sync_read,
        .write = reiserfs_file_write,
-        .ioctl = reiserfs_ioctl,
+        .unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl = reiserfs_compat_ioctl,
 #endif
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5e5a4e6fbaf8..6591cb21edf6 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
        return needed_nodes;
 }
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 /* Set parameters for balancing.
 * Performs write of results of analysis of balancing into structure tb,
@@ -834,7 +831,7 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
                RFALSE(buffer_dirty(new_bh) ||
                       buffer_journaled(new_bh) ||
                       buffer_journal_dirty(new_bh),
-                       "PAP-8140: journlaled or dirty buffer %b for the new block",
+                       "PAP-8140: journaled or dirty buffer %b for the new block",
                       new_bh);
                /* Put empty buffers into the array. */
@@ -1022,7 +1019,11 @@ static int get_far_parent(struct tree_balance *tb,
        /* Check whether the common parent is locked. */
        if (buffer_locked(*pcom_father)) {
+                /* Release the write lock while the buffer is busy */
+                reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(*pcom_father);
+                reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb)) {
                        brelse(*pcom_father);
                        return REPEAT_SEARCH;
@@ -1927,7 +1928,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
                return REPEAT_SEARCH;
        if (buffer_locked(bh)) {
+                reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(bh);
+                reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
@@ -1965,7 +1968,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
                                                                       FL[h]);
                son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
+                reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+                reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2008,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
                child_position =
                    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
                son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+                reiserfs_write_unlock(sb);
                bh = sb_bread(sb, son_number);
+                reiserfs_write_lock(sb);
                if (!bh)
                        return IO_ERROR;
                if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2285,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
                                    REPEAT_SEARCH : CARRY_ON;
                        }
 #endif
+                        reiserfs_write_unlock(tb->tb_sb);
                        __wait_on_buffer(locked);
+                        reiserfs_write_lock(tb->tb_sb);
                        if (FILESYSTEM_CHANGED_TB(tb))
                                return REPEAT_SEARCH;
                }
@@ -2349,12 +2358,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
        /* if it possible in indirect_to_direct conversion */
        if (buffer_locked(tbS0)) {
+                reiserfs_write_unlock(tb->tb_sb);
                __wait_on_buffer(tbS0);
+                reiserfs_write_lock(tb->tb_sb);
                if (FILESYSTEM_CHANGED_TB(tb))
                        return REPEAT_SEARCH;
        }
 #ifdef CONFIG_REISERFS_CHECK
-        if (cur_tb) {
+        if (REISERFS_SB(tb->tb_sb)->cur_tb) {
                print_cur_tb("fix_nodes");
                reiserfs_panic(tb->tb_sb, "PAP-8305",
                               "there is pending do_balance");
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..3a28e7751b3c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -251,7 +251,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        struct cpu_key key;
        struct buffer_head *bh;
        struct item_head *ih, tmp_ih;
-        int fs_gen;
        b_blocknr_t blocknr;
        char *p = NULL;
        int chars;
@@ -265,7 +264,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
                     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
                     3);
-      research:
        result = search_for_position_by_key(inode->i_sb, &key, &path);
        if (result != POSITION_FOUND) {
                pathrelse(&path);
@@ -340,7 +338,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
        }
        // read file tail into part of page
        offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
-        fs_gen = get_generation(inode->i_sb);
        copy_item_head(&tmp_ih, ih);
        /* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +345,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
         ** sure we need to.  But, this means the item might move if
         ** kmap schedules
         */
-        if (!p) {
+        if (!p)
                p = (char *)kmap(bh_result->b_page);
-                if (fs_changed(fs_gen, inode->i_sb)
-                    && item_moved(&tmp_ih, &path)) {
-                        goto research;
-                }
-        }
        p += offset;
        memset(p, 0, inode->i_sb->s_blocksize);
        do {
@@ -489,10 +482,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
           disappeared */
        if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
                int err;
-                lock_kernel();
+                reiserfs_write_lock(inode->i_sb);
                err = reiserfs_commit_for_inode(inode);
                REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-                unlock_kernel();
+                reiserfs_write_unlock(inode->i_sb);
                if (err < 0)
                        ret = err;
        }
@@ -601,6 +598,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        __le32 *item;
        int done;
        int fs_gen;
+        int lock_depth;
        struct reiserfs_transaction_handle *th = NULL;
        /* space reserved in transaction batch:
           . 3 balancings in direct->indirect conversion
@@ -616,12 +614,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
        loff_t new_offset =
            (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
-        /* bad.... */
+        lock_depth = reiserfs_write_lock_once(inode->i_sb);
-        reiserfs_write_lock(inode->i_sb);
        version = get_inode_item_key_version(inode);
        if (!file_capable(inode, block)) {
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return -EFBIG;
        }
@@ -633,7 +630,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                /* find number of block-th logical block of the file */
                ret = _get_block_create_0(inode, block, bh_result,
                                          create | GET_BLOCK_READ_DIRECT);
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                return ret;
        }
        /*
@@ -751,7 +748,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                if (!dangle && th)
                        retval = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
                /* the item was found, so new blocks were not added to the file
                 ** there is no need to make sure the inode is updated with this
@@ -935,7 +932,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (blocks_needed == 1) {
                                un = &unf_single;
                        } else {
-                                un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);      // We need to avoid scheduling.
+                                un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
                                if (!un) {
                                        un = &unf_single;
                                        blocks_needed = 1;
@@ -997,10 +994,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        if (retval)
                                goto failure;
                }
-                /* inserting indirect pointers for a hole can take a
+                /*
-                 ** long time.  reschedule if needed
+                 * inserting indirect pointers for a hole can take a
+                 * long time.  reschedule if needed and also release the write
+                 * lock for others.
                 */
-                cond_resched();
+                if (need_resched()) {
+                        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+                        schedule();
+                        lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                }
                retval = search_for_position_by_key(inode->i_sb, &key, &path);
                if (retval == IO_ERROR) {
@@ -1035,7 +1038,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
                        retval = err;
        }
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        reiserfs_check_path(&path);
        return retval;
 }
@@ -2072,8 +2075,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
        int error;
        struct buffer_head *bh = NULL;
        int err2;
+        int lock_depth;
-        reiserfs_write_lock(inode->i_sb);
+        lock_depth = reiserfs_write_lock_once(inode->i_sb);
        if (inode->i_size > 0) {
                error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2146,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
                page_cache_release(page);
        }
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        return 0;
      out:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
        }
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        return error;
 }
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
        int ret;
        int old_ref = 0;
+        reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+        reiserfs_write_lock(inode->i_sb);
        fix_tail_page_for_writing(page);
        if (reiserfs_transaction_running(inode->i_sb)) {
                struct reiserfs_transaction_handle *th;
@@ -2664,6 +2674,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th;
        unsigned start;
+        int lock_depth = 0;
+        bool locked = false;
        if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
                pos ++;
@@ -2690,9 +2702,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
         ** to do the i_size updates here.
         */
        pos += copied;
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-                reiserfs_write_lock(inode->i_sb);
+                lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                locked = true;
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2703,10 +2717,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
                ret = journal_begin(&myth, inode->i_sb, 1);
-                if (ret) {
+                if (ret)
-                        reiserfs_write_unlock(inode->i_sb);
                        goto journal_error;
-                }
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2718,34 +2731,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
+                if (!locked) {
+                        lock_depth = reiserfs_write_lock_once(inode->i_sb);
+                        locked = true;
+                }
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
      out:
+        if (locked)
+                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        unlock_page(page);
        page_cache_release(page);
        return ret == 0 ? copied : ret;
      journal_error:
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+        locked = false;
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
        }
        goto out;
 }
@@ -2758,7 +2773,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
        int update_sd = 0;
        struct reiserfs_transaction_handle *th = NULL;
+        reiserfs_write_unlock(inode->i_sb);
        reiserfs_wait_on_write_block(inode->i_sb);
+        reiserfs_write_lock(inode->i_sb);
        if (reiserfs_transaction_running(inode->i_sb)) {
                th = current->journal_info;
        }
@@ -2770,7 +2788,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
         */
        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
-                reiserfs_write_lock(inode->i_sb);
                /* If the file have grown beyond the border where it
                   can have a tail, unmark it as needing a tail
                   packing */
@@ -2781,10 +2798,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
                ret = journal_begin(&myth, inode->i_sb, 1);
-                if (ret) {
+                if (ret)
-                        reiserfs_write_unlock(inode->i_sb);
                        goto journal_error;
-                }
                reiserfs_update_inode_transaction(inode);
                inode->i_size = pos;
                /*
@@ -2796,16 +2812,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
                reiserfs_update_sd(&myth, inode);
                update_sd = 1;
                ret = journal_end(&myth, inode->i_sb, 1);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto journal_error;
        }
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        mark_inode_dirty(inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
                if (ret)
                        goto out;
        }
@@ -2815,11 +2828,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
      journal_error:
        if (th) {
-                reiserfs_write_lock(inode->i_sb);
                if (!update_sd)
                        reiserfs_update_sd(th, inode);
                ret = reiserfs_end_persistent_transaction(th);
-                reiserfs_write_unlock(inode->i_sb);
        }
        return ret;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0ccc3fdda7bf..ace77451ceb1 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -13,44 +13,52 @@
 #include <linux/compat.h>
 /*
-** reiserfs_ioctl - handler for ioctl for inode
+ * reiserfs_ioctl - handler for ioctl for inode
-** supported commands:
+ * supported commands:
-**  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
+ *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
-**                           and prevent packing file (argument arg has to be non-zero)
+ *                           and prevent packing file (argument arg has to be non-zero)
-**  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
+ *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
-**  3) That's all for a while ...
+ *  3) That's all for a while ...
-*/
+ */
-int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-                   unsigned long arg)
 {
+        struct inode *inode = filp->f_path.dentry->d_inode;
        unsigned int flags;
        int err = 0;
+        reiserfs_write_lock(inode->i_sb);
        switch (cmd) {
        case REISERFS_IOC_UNPACK:
                if (S_ISREG(inode->i_mode)) {
                        if (arg)
-                                return reiserfs_unpack(inode, filp);
+                                err = reiserfs_unpack(inode, filp);
-                        else
-                                return 0;
                } else
-                        return -ENOTTY;
+                        err = -ENOTTY;
-                /* following two cases are taken from fs/ext2/ioctl.c by Remy
+                break;
-                   Card (card@masi.ibp.fr) */
+                /*
+                 * following two cases are taken from fs/ext2/ioctl.c by Remy
+                 * Card (card@masi.ibp.fr)
+                 */
        case REISERFS_IOC_GETFLAGS:
-                if (!reiserfs_attrs(inode->i_sb))
+                if (!reiserfs_attrs(inode->i_sb)) {
-                        return -ENOTTY;
+                        err = -ENOTTY;
+                        break;
+                }
                flags = REISERFS_I(inode)->i_attrs;
                i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
-                return put_user(flags, (int __user *)arg);
+                err = put_user(flags, (int __user *)arg);
+                break;
        case REISERFS_IOC_SETFLAGS:{
-                        if (!reiserfs_attrs(inode->i_sb))
+                        if (!reiserfs_attrs(inode->i_sb)) {
-                                return -ENOTTY;
+                                err = -ENOTTY;
+                                break;
+                        }
                        err = mnt_want_write(filp->f_path.mnt);
                        if (err)
-                                return err;
+                                break;
                        if (!is_owner_or_cap(inode)) {
                                err = -EPERM;
@@ -90,16 +98,19 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                        mark_inode_dirty(inode);
 setflags_out:
                        mnt_drop_write(filp->f_path.mnt);
-                        return err;
+                        break;
                }
        case REISERFS_IOC_GETVERSION:
-                return put_user(inode->i_generation, (int __user *)arg);
+                err = put_user(inode->i_generation, (int __user *)arg);
+                break;
        case REISERFS_IOC_SETVERSION:
-                if (!is_owner_or_cap(inode))
-                        return -EPERM;
+                if (!is_owner_or_cap(inode)) {
+                        err = -EPERM;
+                        break;
+                }
                err = mnt_want_write(filp->f_path.mnt);
                if (err)
-                        return err;
+                        break;
                if (get_user(inode->i_generation, (int __user *)arg)) {
                        err = -EFAULT;
                        goto setversion_out;
@@ -108,19 +118,20 @@ setflags_out:
                mark_inode_dirty(inode);
 setversion_out:
                mnt_drop_write(filp->f_path.mnt);
-                return err;
+                break;
        default:
-                return -ENOTTY;
+                err = -ENOTTY;
        }
+        reiserfs_write_unlock(inode->i_sb);
+        return err;
 }
 #ifdef CONFIG_COMPAT
 long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
-        int ret;
        /* These are just misnamed, they actually get/put from/to user an int */
        switch (cmd) {
        case REISERFS_IOC32_UNPACK:
@@ -141,10 +152,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
        default:
                return -ENOIOCTLCMD;
        }
-        lock_kernel();
-        ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+        return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-        unlock_kernel();
-        return ret;
 }
 #endif
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39c..2f8a7e7b8dab 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
        clear_buffer_journal_restore_dirty(bh);
 }
-/* utility function to force a BUG if it is called without the big
-** kernel lock held.  caller is the string printed just before calling BUG()
-*/
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
-#ifdef CONFIG_SMP
-        if (current->lock_depth < 0) {
-                reiserfs_panic(sb, "journal-1", "%s called without kernel "
-                               "lock held", caller);
-        }
-#else
-        ;
-#endif
-}
 /* return a cnode with same dev, block number and size in table, or null if not found */
 static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
                                                                  super_block
@@ -556,7 +541,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
 static inline void lock_journal(struct super_block *sb)
 {
        PROC_INFO_INC(sb, journal.lock_journal);
-        mutex_lock(&SB_JOURNAL(sb)->j_mutex);
+        reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
 }
 /* unlock the current transaction */
@@ -708,7 +694,9 @@ static void check_barrier_completion(struct super_block *s,
                disable_barrier(s);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
+                reiserfs_write_unlock(s);
                sync_dirty_buffer(bh);
+                reiserfs_write_lock(s);
        }
 }
@@ -996,8 +984,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
 {
        DEFINE_WAIT(wait);
        struct reiserfs_journal *j = SB_JOURNAL(s);
-        if (atomic_read(&j->j_async_throttle))
+        if (atomic_read(&j->j_async_throttle)) {
+                reiserfs_write_unlock(s);
                congestion_wait(BLK_RW_ASYNC, HZ / 10);
+                reiserfs_write_lock(s);
+        }
        return 0;
 }
@@ -1043,7 +1036,8 @@ static int flush_commit_list(struct super_block *s,
        }
        /* make sure nobody is trying to flush this one at the same time */
-        mutex_lock(&jl->j_commit_mutex);
+        reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
        if (!journal_list_still_alive(s, trans_id)) {
                mutex_unlock(&jl->j_commit_mutex);
                goto put_jl;
@@ -1061,12 +1055,17 @@ static int flush_commit_list(struct super_block *s,
        if (!list_empty(&jl->j_bh_list)) {
                int ret;
-                unlock_kernel();
+                /*
+                 * We might sleep in numerous places inside
+                 * write_ordered_buffers. Relax the write lock.
+                 */
+                reiserfs_write_unlock(s);
                ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                            journal, jl, &jl->j_bh_list);
                if (ret < 0 && retval == 0)
                        retval = ret;
-                lock_kernel();
+                reiserfs_write_lock(s);
        }
        BUG_ON(!list_empty(&jl->j_bh_list));
        /*
@@ -1085,8 +1084,11 @@ static int flush_commit_list(struct super_block *s,
                    SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
                if (tbh) {
-                        if (buffer_dirty(tbh))
+                        if (buffer_dirty(tbh)) {
-                            ll_rw_block(WRITE, 1, &tbh) ;
+                            reiserfs_write_unlock(s);
+                            ll_rw_block(WRITE, 1, &tbh);
+                            reiserfs_write_lock(s);
+                        }
                        put_bh(tbh) ;
                }
        }
@@ -1114,12 +1116,19 @@ static int flush_commit_list(struct super_block *s,
                bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
                    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
+                reiserfs_write_unlock(s);
                wait_on_buffer(tbh);
+                reiserfs_write_lock(s);
                // since we're using ll_rw_blk above, it might have skipped over
                // a locked buffer.  Double check here
                //
-                if (buffer_dirty(tbh))  /* redundant, sync_dirty_buffer() checks */
+                /* redundant, sync_dirty_buffer() checks */
+                if (buffer_dirty(tbh)) {
+                        reiserfs_write_unlock(s);
                        sync_dirty_buffer(tbh);
+                        reiserfs_write_lock(s);
+                }
                if (unlikely(!buffer_uptodate(tbh))) {
 #ifdef CONFIG_REISERFS_CHECK
                        reiserfs_warning(s, "journal-601",
@@ -1143,10 +1152,15 @@ static int flush_commit_list(struct super_block *s,
                        if (buffer_dirty(jl->j_commit_bh))
                                BUG();
                        mark_buffer_dirty(jl->j_commit_bh) ;
+                        reiserfs_write_unlock(s);
                        sync_dirty_buffer(jl->j_commit_bh) ;
+                        reiserfs_write_lock(s);
                }
-        } else
+        } else {
+                reiserfs_write_unlock(s);
                wait_on_buffer(jl->j_commit_bh);
+                reiserfs_write_lock(s);
+        }
        check_barrier_completion(s, jl->j_commit_bh);
@@ -1286,7 +1300,9 @@ static int _update_journal_header_block(struct super_block *sb,
        if (trans_id >= journal->j_last_flush_trans_id) {
                if (buffer_locked((journal->j_header_bh))) {
+                        reiserfs_write_unlock(sb);
                        wait_on_buffer((journal->j_header_bh));
+                        reiserfs_write_lock(sb);
                        if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
                                reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1328,16 @@ static int _update_journal_header_block(struct super_block *sb,
                                disable_barrier(sb);
                                goto sync;
                        }
+                        reiserfs_write_unlock(sb);
                        wait_on_buffer(journal->j_header_bh);
+                        reiserfs_write_lock(sb);
                        check_barrier_completion(sb, journal->j_header_bh);
                } else {
                      sync:
                        set_buffer_dirty(journal->j_header_bh);
+                        reiserfs_write_unlock(sb);
                        sync_dirty_buffer(journal->j_header_bh);
+                        reiserfs_write_lock(sb);
                }
                if (!buffer_uptodate(journal->j_header_bh)) {
                        reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
        /* if flushall == 0, the lock is already held */
        if (flushall) {
-                mutex_lock(&journal->j_flush_mutex);
+                reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        } else if (mutex_trylock(&journal->j_flush_mutex)) {
                BUG();
        }
@@ -1553,7 +1573,11 @@ static int flush_journal_list(struct super_block *s,
                                        reiserfs_panic(s, "journal-1011",
                                                       "cn->bh is NULL");
                                }
+                                reiserfs_write_unlock(s);
                                wait_on_buffer(cn->bh);
+                                reiserfs_write_lock(s);
                                if (!cn->bh) {
                                        reiserfs_panic(s, "journal-1012",
                                                       "cn->bh is NULL");
@@ -1769,7 +1793,7 @@ static int kupdate_transactions(struct super_block *s,
        struct reiserfs_journal *journal = SB_JOURNAL(s);
        chunk.nr = 0;
-        mutex_lock(&journal->j_flush_mutex);
+        reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
        if (!journal_list_still_alive(s, orig_trans_id)) {
                goto done;
        }
@@ -1973,11 +1997,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
        reiserfs_mounted_fs_count--;
        /* wait for all commits to finish */
        cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
+        /*
+         * We must release the write lock here because
+         * the workqueue job (flush_async_commit) needs this lock
+         */
+        reiserfs_write_unlock(sb);
        flush_workqueue(commit_wq);
        if (!reiserfs_mounted_fs_count) {
                destroy_workqueue(commit_wq);
                commit_wq = NULL;
        }
+        reiserfs_write_lock(sb);
        free_journal_ram(sb);
@@ -2243,7 +2275,11 @@ static int journal_read_transaction(struct super_block *sb,
        /* read in the log blocks, memcpy to the corresponding real block */
        ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
        for (i = 0; i < get_desc_trans_len(desc); i++) {
+                reiserfs_write_unlock(sb);
                wait_on_buffer(log_blocks[i]);
+                reiserfs_write_lock(sb);
                if (!buffer_uptodate(log_blocks[i])) {
                        reiserfs_warning(sb, "journal-1212",
                                         "REPLAY FAILURE fsck required! "
@@ -2765,11 +2801,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
                goto free_and_return;
        }
+        /*
+         * We need to unlock here to avoid creating the following
+         * dependency:
+         * reiserfs_lock -> sysfs_mutex
+         * Because the reiserfs mmap path creates the following dependency:
+         * mm->mmap_sem -> reiserfs_lock, hence we have
+         * mm->mmap_sem -> reiserfs_lock -> sysfs_mutex
+         * This would end up in a circular dependency with sysfs readdir path
+         * which does sysfs_mutex -> mm->mmap_sem
+         * This is fine because the reiserfs lock is useless in mount path,
+         * at least until we call journal_begin. We keep it for paranoid
+         * reasons.
+         */
+        reiserfs_write_unlock(sb);
        if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+                reiserfs_write_lock(sb);
                reiserfs_warning(sb, "sh-462",
                                 "unable to initialize jornal device");
                goto free_and_return;
        }
+        reiserfs_write_lock(sb);
        rs = SB_DISK_SUPER_BLOCK(sb);
@@ -2881,8 +2933,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        }
        reiserfs_mounted_fs_count++;
-        if (reiserfs_mounted_fs_count <= 1)
+        if (reiserfs_mounted_fs_count <= 1) {
+                reiserfs_write_unlock(sb);
                commit_wq = create_workqueue("reiserfs");
+                reiserfs_write_lock(sb);
+        }
        INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
        journal->j_work_sb = sb;
@@ -2964,8 +3019,11 @@ static void queue_log_writer(struct super_block *s)
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&journal->j_join_wait, &wait);
        set_current_state(TASK_UNINTERRUPTIBLE);
-        if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
+        if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+                reiserfs_write_unlock(s);
                schedule();
+                reiserfs_write_lock(s);
+        }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&journal->j_join_wait, &wait);
 }
@@ -2982,7 +3040,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
        struct reiserfs_journal *journal = SB_JOURNAL(sb);
        unsigned long bcount = journal->j_bcount;
        while (1) {
+                reiserfs_write_unlock(sb);
                schedule_timeout_uninterruptible(1);
+                reiserfs_write_lock(sb);
                journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
                while ((atomic_read(&journal->j_wcount) > 0 ||
                        atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3093,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
        if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
                unlock_journal(sb);
+                reiserfs_write_unlock(sb);
                reiserfs_wait_on_write_block(sb);
+                reiserfs_write_lock(sb);
                PROC_INFO_INC(sb, journal.journal_relock_writers);
                goto relock;
        }
@@ -3506,14 +3568,14 @@ static void flush_async_commits(struct work_struct *work)
        struct reiserfs_journal_list *jl;
        struct list_head *entry;
-        lock_kernel();
+        reiserfs_write_lock(sb);
        if (!list_empty(&journal->j_journal_list)) {
                /* last entry is the youngest, commit it and you get everything */
                entry = journal->j_journal_list.prev;
                jl = JOURNAL_LIST_ENTRY(entry);
                flush_commit_list(sb, jl, 1);
        }
-        unlock_kernel();
+        reiserfs_write_unlock(sb);
 }
 /*
@@ -4041,7 +4103,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * the new transaction is fully setup, and we've already flushed the
         * ordered bh list
         */
-        mutex_lock(&jl->j_commit_mutex);
+        reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
        /* save the transaction id in case we need to commit it later */
        commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4218,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
                next = cn->next;
                free_cnode(sb, cn);
                cn = next;
+                reiserfs_write_unlock(sb);
                cond_resched();
+                reiserfs_write_lock(sb);
        }
        /* we are done  with both the c_bh and d_bh, but
@@ -4203,10 +4267,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         * is lost.
         */
        if (!list_empty(&jl->j_tail_bh_list)) {
-                unlock_kernel();
+                reiserfs_write_unlock(sb);
                write_ordered_buffers(&journal->j_dirty_buffers_lock,
                                      journal, jl, &jl->j_tail_bh_list);
-                lock_kernel();
+                reiserfs_write_lock(sb);
        }
        BUG_ON(!list_empty(&jl->j_tail_bh_list));
        mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644
index 000000000000..ee2cfc0fd8a7
--- /dev/null
+++ b/fs/reiserfs/lock.c
@@ -0,0 +1,88 @@
+#include <linux/reiserfs_fs.h>
+#include <linux/mutex.h>
+/*
+ * The previous reiserfs locking scheme was heavily based on
+ * the tricky properties of the BKL:
+ *
+ * - it could be acquired recursively by the same task
+ * - performance relied on the release-while-schedule() property
+ *
+ * Now that we replace it with a mutex, we still want to keep the same
+ * recursive property to avoid big changes in the code structure.
+ * We use our own lock_owner here because the owner field of a mutex
+ * is only available with SMP or mutex debugging; also, we only need this
+ * field for this mutex, so there is no need for a system-wide facility.
+ *
+ * Also, this lock is often released before a call that could block, because
+ * reiserfs performance was partially based on the release-while-schedule()
+ * property of the BKL.
+ */
+/*
+ * Take the per-superblock reiserfs write lock, allowing the task that
+ * already owns it to take it again.
+ *
+ * The mutex is acquired only when the calling task is not the recorded
+ * owner; every call, nested or not, raises lock_depth by one.
+ */
+void reiserfs_write_lock(struct super_block *s)
+{
+        struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+        int nested = (sbi->lock_owner == current);
+        if (!nested) {
+                mutex_lock(&sbi->lock);
+                sbi->lock_owner = current;
+        }
+        /* Only the task holding the lock touches the depth: no protection needed */
+        sbi->lock_depth += 1;
+}
+/*
+ * Drop one level of the reiserfs write lock.
+ *
+ * The mutex itself is released only when the outermost level is dropped,
+ * i.e. when lock_depth falls back to -1.
+ */
+void reiserfs_write_unlock(struct super_block *s)
+{
+        struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+        /*
+         * Unlocking without holding the lock must raise a BUG() rather
+         * than risk corrupting data.
+         */
+        BUG_ON(sbi->lock_owner != current);
+        sbi->lock_depth--;
+        if (sbi->lock_depth != -1)
+                return;         /* an outer level still holds the lock */
+        sbi->lock_owner = NULL;
+        mutex_unlock(&sbi->lock);
+}
+/*
+ * Take the write lock only if the current task does not own it yet.
+ * When the task already owns the lock, nothing is acquired and the depth
+ * is left untouched, which is useful when we don't want to lock more
+ * than once.
+ *
+ * Returns the lock_depth seen on entry, before any change made here.
+ */
+int reiserfs_write_lock_once(struct super_block *s)
+{
+        struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+        int depth_on_entry;
+        /* Already ours: report the current depth and take nothing */
+        if (sbi->lock_owner == current)
+                return sbi->lock_depth;
+        mutex_lock(&sbi->lock);
+        sbi->lock_owner = current;
+        depth_on_entry = sbi->lock_depth;
+        sbi->lock_depth = depth_on_entry + 1;
+        return depth_on_entry;
+}
+/*
+ * Release the write lock only when @lock_depth is -1; for any other value
+ * this is a no-op. Callers in this patch pass the value returned by
+ * reiserfs_write_lock_once().
+ */
+void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
+{
+        if (lock_depth != -1)
+                return;
+        reiserfs_write_unlock(s);
+}
+/*
+ * Utility function to force a panic if it is called while the superblock
+ * write lock is not held (lock_depth is below zero). @caller is a string
+ * naming the call site; it is printed as part of the panic message.
+ *
+ * Note that only the depth is examined, not which task owns the lock.
+ */
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
+{
+        struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+        /*
+         * reiserfs_panic() takes (sb, id, fmt, ...): pass an explicit id so
+         * the format string lands in the fmt slot, and supply the depth for
+         * the %d conversion.
+         */
+        if (sb_i->lock_depth < 0)
+                reiserfs_panic(sb, "reiserfs-lock",
+                               "%s called without write lock held %d",
+                               caller, sb_i->lock_depth);
+}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 271579128634..e296ff72a6cc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                                      struct nameidata *nd)
 {
        int retval;
+        int lock_depth;
        struct inode *inode = NULL;
        struct reiserfs_dir_entry de;
        INITIALIZE_PATH(path_to_entry);
@@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
        if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
                return ERR_PTR(-ENAMETOOLONG);
-        reiserfs_write_lock(dir->i_sb);
+        /*
+         * Might be called with or without the write lock, must be careful
+         * to not recursively hold it in case we want to release the lock
+         * before rescheduling.
+         */
+        lock_depth = reiserfs_write_lock_once(dir->i_sb);
        de.de_gen_number_bit_string = NULL;
        retval =
            reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                inode = reiserfs_iget(dir->i_sb,
                                      (struct cpu_key *)&(de.de_dir_id));
                if (!inode || IS_ERR(inode)) {
-                        reiserfs_write_unlock(dir->i_sb);
+                        reiserfs_write_unlock_once(dir->i_sb, lock_depth);
                        return ERR_PTR(-EACCES);
                }
@@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
                if (IS_PRIVATE(dir))
                        inode->i_flags |= S_PRIVATE;
        }
-        reiserfs_write_unlock(dir->i_sb);
+        reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        if (retval == IO_ERROR) {
                return ERR_PTR(-EIO);
        }
@@ -725,6 +732,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct inode *inode;
        struct reiserfs_transaction_handle th;
        struct reiserfs_security_handle security;
+        int lock_depth;
        /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
        int jbegin_count =
            JOURNAL_PER_BALANCE_CNT * 3 +
@@ -748,7 +756,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                return retval;
        }
        jbegin_count += retval;
-        reiserfs_write_lock(dir->i_sb);
+        lock_depth = reiserfs_write_lock_once(dir->i_sb);
        retval = journal_begin(&th, dir->i_sb, jbegin_count);
        if (retval) {
@@ -798,8 +806,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
        unlock_new_inode(inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
-      out_failed:
+out_failed:
-        reiserfs_write_unlock(dir->i_sb);
+        reiserfs_write_unlock_once(dir->i_sb, lock_depth);
        return retval;
 }
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 536eacaeb710..adbc6f538515 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
   .  */
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 void __reiserfs_panic(struct super_block *sb, const char *id,
                      const char *function, const char *fmt, ...)
 {
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 18b315d3d104..b3a94d20f0fc 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
+                        reiserfs_write_unlock(s);
                        sync_dirty_buffer(bh);
+                        reiserfs_write_lock(s);
                        // update bitmap_info stuff
                        bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
                        brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d036ee5b1c81..5fa7118f04e1 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key,	/* Key to search for. */
        return ITEM_NOT_FOUND;
 }
-#ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance *cur_tb;
-#endif
 /* Minimal possible key. It is never in the tree. */
 const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
 #define SEARCH_BY_KEY_READA 16
-/* The function is NOT SCHEDULE-SAFE! */
+/*
-static void search_by_key_reada(struct super_block *s,
+ * The function is NOT SCHEDULE-SAFE!
+ * It might release the write lock if it has to issue a read-ahead for a
+ * block that is not up to date. Note that in this case it does not
+ * reacquire the lock, to avoid high contention resulting from too many
+ * lock requests, especially as the caller (search_by_key) will perform
+ * other schedule-unsafe operations just after calling this function.
+ *
+ * @return true if we have unlocked
+ */
+static bool search_by_key_reada(struct super_block *s,
                                struct buffer_head **bh,
                                b_blocknr_t *b, int num)
 {
        int i, j;
+        bool unlocked = false;
        for (i = 0; i < num; i++) {
                bh[i] = sb_getblk(s, b[i]);
        }
+        /*
+         * We are going to read some blocks on which we
+         * have a reference. It's safe, though we might be
+         * reading blocks concurrently changed if we release
+         * the lock. But it's still fine because we check later
+         * if the tree changed
+         */
        for (j = 0; j < i; j++) {
                /*
                 * note, this needs attention if we are getting rid of the BKL
                 * you have to make sure the prepared bit isn't set on this buffer
                 */
-                if (!buffer_uptodate(bh[j]))
+                if (!buffer_uptodate(bh[j])) {
+                        if (!unlocked) {
+                                reiserfs_write_unlock(s);
+                                unlocked = true;
+                        }
                        ll_rw_block(READA, 1, bh + j);
+                }
                brelse(bh[j]);
        }
+        return unlocked;
 }
 /**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,	/* Key to s
                   have a pointer to it. */
                if ((bh = last_element->pe_buffer =
                     sb_getblk(sb, block_number))) {
+                        bool unlocked = false;
                        if (!buffer_uptodate(bh) && reada_count > 1)
-                                search_by_key_reada(sb, reada_bh,
+                                /* may unlock the write lock */
+                                unlocked = search_by_key_reada(sb, reada_bh,
                                                    reada_blocks, reada_count);
+                        /*
+                         * If we haven't already unlocked the write lock,
+                         * then we need to do that here before reading
+                         * the current block
+                         */
+                        if (!buffer_uptodate(bh) && !unlocked) {
+                                reiserfs_write_unlock(sb);
+                                unlocked = true;
+                        }
                        ll_rw_block(READ, 1, &bh);
                        wait_on_buffer(bh);
+                        if (unlocked)
+                                reiserfs_write_lock(sb);
                        if (!buffer_uptodate(bh))
                                goto io_error;
                } else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,	/* Key to s
                       !key_in_buffer(search_path, key, sb),
                       "PAP-5130: key is not in the buffer");
 #ifdef CONFIG_REISERFS_CHECK
-                if (cur_tb) {
+                if (REISERFS_SB(sb)->cur_tb) {
                        print_cur_tb("5140");
                        reiserfs_panic(sb, "PAP-5140",
                                       "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
                        reiserfs_free_block(th, inode, block, 1);
                    }
+                    reiserfs_write_unlock(sb);
                    cond_resched();
+                    reiserfs_write_lock(sb);
                    if (item_moved (&s_ih, path))  {
                        need_re_search = 1;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f0ad05f38022..339b0baf2af6 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s)
        struct reiserfs_transaction_handle th;
        th.t_trans_id = 0;
-        lock_kernel();
+        reiserfs_write_lock(s);
        if (s->s_dirt)
                reiserfs_write_super(s);
@@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s)
        reiserfs_proc_info_done(s);
+        reiserfs_write_unlock(s);
+        mutex_destroy(&REISERFS_SB(s)->lock);
        kfree(s->s_fs_info);
        s->s_fs_info = NULL;
-        unlock_kernel();
 }
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +554,28 @@ static void reiserfs_dirty_inode(struct inode *inode)
        struct reiserfs_transaction_handle th;
        int err = 0;
+        int lock_depth;
        if (inode->i_sb->s_flags & MS_RDONLY) {
                reiserfs_warning(inode->i_sb, "clm-6006",
                                 "writing inode %lu on readonly FS",
                                 inode->i_ino);
                return;
        }
-        reiserfs_write_lock(inode->i_sb);
+        lock_depth = reiserfs_write_lock_once(inode->i_sb);
        /* this is really only used for atime updates, so they don't have
         ** to be included in O_SYNC or fsync
         */
        err = journal_begin(&th, inode->i_sb, 1);
-        if (err) {
+        if (err)
-                reiserfs_write_unlock(inode->i_sb);
+                goto out;
-                return;
-        }
        reiserfs_update_sd(&th, inode);
        journal_end(&th, inode->i_sb, 1);
-        reiserfs_write_unlock(inode->i_sb);
+out:
+        reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 #ifdef CONFIG_QUOTA
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
        int i;
+#endif
+        reiserfs_write_lock(s);
+#ifdef CONFIG_QUOTA
        memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
-        lock_kernel();
        rs = SB_DISK_SUPER_BLOCK(s);
        if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 out_ok:
        replace_mount_options(s, new_opts);
-        unlock_kernel();
+        reiserfs_write_unlock(s);
        return 0;
 out_err:
        kfree(new_opts);
-        unlock_kernel();
+        reiserfs_write_unlock(s);
        return err;
 }
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
        ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+        reiserfs_write_unlock(s);
        wait_on_buffer(SB_BUFFER_WITH_SB(s));
+        reiserfs_write_lock(s);
        if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
                reiserfs_warning(s, "reiserfs-2504", "error reading the super");
                return 1;
@@ -1613,7 +1621,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
        if (!sbi) {
                errval = -ENOMEM;
-                goto error;
+                goto error_alloc;
        }
        s->s_fs_info = sbi;
        /* Set default values for options: non-aggressive tails, RO on errors */
@@ -1627,6 +1635,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        /* setup default block allocator options */
        reiserfs_init_alloc_options(s);
+        mutex_init(&REISERFS_SB(s)->lock);
+        REISERFS_SB(s)->lock_depth = -1;
+        /*
+         * This function is called with the BKL held, which was also the old
+         * locking used here.
+         * do_journal_begin() will soon check that we hold the lock (i.e. what
+         * used to be the BKL). That check is likely there because
+         * do_journal_begin() has several other callers; at this point it
+         * doesn't seem necessary to protect against anything.
+         * Anyway, let's be conservative and lock for now.
+         */
+        reiserfs_write_lock(s);
        jdev_name = NULL;
        if (reiserfs_parse_options
            (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,9 +1874,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        init_waitqueue_head(&(sbi->s_wait));
        spin_lock_init(&sbi->bitmap_lock);
+        reiserfs_write_unlock(s);
        return (0);
 error:
+        reiserfs_write_unlock(s);
+error_alloc:
        if (jinit_done) {       /* kill the commit thread, free journal ram */
                journal_release_error(NULL, s);
        }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6925b835a43b..58aa8e75f7f5 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -975,7 +975,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
        int err = 0;
        /* If we don't have the privroot located yet - go find it */
-        mutex_lock(&s->s_root->d_inode->i_mutex);
+        reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
        dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
@@ -1004,14 +1004,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
                goto error;
        if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
-                mutex_lock(&s->s_root->d_inode->i_mutex);
+                reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
                err = create_privroot(REISERFS_SB(s)->priv_root);
                mutex_unlock(&s->s_root->d_inode->i_mutex);
        }
        if (privroot->d_inode) {
                s->s_xattr = reiserfs_xattr_handlers;
-                mutex_lock(&privroot->d_inode->i_mutex);
+                reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
                if (!REISERFS_SB(s)->xattr_root) {
                        struct dentry *dentry;
                        dentry = lookup_one_len(XAROOT_NAME, privroot,
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e17534..39208663aaf1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
        ret = buf->ops->confirm(pipe, buf);
        if (!ret) {
                more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+                if (file->f_op && file->f_op->sendpage)
-                ret = file->f_op->sendpage(file, buf->page, buf->offset,
+                        ret = file->f_op->sendpage(file, buf->page, buf->offset,
-                                           sd->len, &pos, more);
+                                                   sd->len, &pos, more);
+                else
+                        ret = -EINVAL;
        }
        return ret;
@@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
        if (unlikely(ret < 0))
                return ret;
-        splice_write = out->f_op->splice_write;
+        if (out->f_op && out->f_op->splice_write)
-        if (!splice_write)
+                splice_write = out->f_op->splice_write;
+        else
                splice_write = default_file_splice_write;
        return splice_write(pipe, out, ppos, len, flags);
@@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
        if (unlikely(ret < 0))
                return ret;
-        splice_read = in->f_op->splice_read;
+        if (in->f_op && in->f_op->splice_read)
-        if (!splice_read)
+                splice_read = in->f_op->splice_read;
+        else
                splice_read = default_file_splice_read;
        return splice_read(in, ppos, pipe, len, flags);
@@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                if (off_in)
                        return -ESPIPE;
                if (off_out) {
-                        if (out->f_op->llseek == no_llseek)
+                        if (!out->f_op || !out->f_op->llseek ||
+                            out->f_op->llseek == no_llseek)
                                return -EINVAL;
                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
                                return -EFAULT;
@@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                if (off_out)
                        return -ESPIPE;
                if (off_in) {
-                        if (in->f_op->llseek == no_llseek)
+                        if (!in->f_op || !in->f_op->llseek ||
+                            in->f_op->llseek == no_llseek)
                                return -EINVAL;
                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
                                return -EFAULT;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e0201837d244..f05f2303a8b8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -25,7 +25,6 @@
 #include "sysfs.h"
 DEFINE_MUTEX(sysfs_mutex);
-DEFINE_MUTEX(sysfs_rename_mutex);
 DEFINE_SPINLOCK(sysfs_assoc_lock);
 static DEFINE_SPINLOCK(sysfs_ino_lock);
@@ -85,46 +84,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 }
 /**
- *      sysfs_get_dentry - get dentry for the given sysfs_dirent
- *      @sd: sysfs_dirent of interest
- *
- *      Get dentry for @sd.  Dentry is looked up if currently not
- *      present.  This function descends from the root looking up
- *      dentry for each step.
- *
- *      LOCKING:
- *      mutex_lock(sysfs_rename_mutex)
- *
- *      RETURNS:
- *      Pointer to found dentry on success, ERR_PTR() value on error.
- */
-struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
-{
-        struct dentry *dentry = dget(sysfs_sb->s_root);
-        while (dentry->d_fsdata != sd) {
-                struct sysfs_dirent *cur;
-                struct dentry *parent;
-                /* find the first ancestor which hasn't been looked up */
-                cur = sd;
-                while (cur->s_parent != dentry->d_fsdata)
-                        cur = cur->s_parent;
-                /* look it up */
-                parent = dentry;
-                mutex_lock(&parent->d_inode->i_mutex);
-                dentry = lookup_one_noperm(cur->s_name, parent);
-                mutex_unlock(&parent->d_inode->i_mutex);
-                dput(parent);
-                if (IS_ERR(dentry))
-                        break;
-        }
-        return dentry;
-}
-/**
 *      sysfs_get_active - get an active reference to sysfs_dirent
 *      @sd: sysfs_dirent to get an active reference to
 *
@@ -298,7 +257,61 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
                goto repeat;
 }
-static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
+/*
+ * d_delete hook: returns 1 when the sysfs_dirent behind @dentry has been
+ * flagged SYSFS_FLAG_REMOVED, 0 otherwise.
+ */
+static int sysfs_dentry_delete(struct dentry *dentry)
+{
+        struct sysfs_dirent *dirent = dentry->d_fsdata;
+        if (dirent->s_flags & SYSFS_FLAG_REMOVED)
+                return 1;
+        return 0;
+}
+/*
+ * d_revalidate hook: decide whether @dentry still matches its sysfs_dirent.
+ *
+ * Returns 1 to keep the dentry, or 0 after it has been dropped from the
+ * dcache hashes. @nd is not used.
+ */
+static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        struct sysfs_dirent *sd = dentry->d_fsdata;
+        int stale;
+        int is_dir = 0;
+        mutex_lock(&sysfs_mutex);
+        /*
+         * The dentry is stale when the sysfs dirent has been deleted,
+         * moved to another parent, or renamed (checked in that order).
+         */
+        stale = (sd->s_flags & SYSFS_FLAG_REMOVED) ||
+                dentry->d_parent->d_fsdata != sd->s_parent ||
+                strcmp(dentry->d_name.name, sd->s_name) != 0;
+        /* The type is only needed, and only read, for a stale dentry */
+        if (stale)
+                is_dir = (sysfs_type(sd) == SYSFS_DIR);
+        mutex_unlock(&sysfs_mutex);
+        if (!stale)
+                return 1;
+        /* Remove the dentry from the dcache hashes.
+         * If this is a deleted dentry we use d_drop instead of d_delete
+         * so sysfs doesn't need to cope with negative dentries.
+         *
+         * If this is a dentry that has simply been renamed we
+         * use d_drop to remove it from the dcache lookup on its
+         * old parent.  If this dentry persists later when a lookup
+         * is performed at its new name the dentry will be readded
+         * to the dcache hashes.
+         */
+        if (is_dir) {
+                /* If we have submounts we must allow the vfs caches
+                 * to lie about the state of the filesystem to prevent
+                 * leaks and other nasty things.
+                 */
+                if (have_submounts(dentry))
+                        return 1;
+                shrink_dcache_parent(dentry);
+        }
+        d_drop(dentry);
+        return 0;
+}
+static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
        struct sysfs_dirent * sd = dentry->d_fsdata;
@@ -307,7 +320,9 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
 }
 static const struct dentry_operations sysfs_dentry_ops = {
-        .d_iput         = sysfs_d_iput,
+        .d_revalidate   = sysfs_dentry_revalidate,
+        .d_delete       = sysfs_dentry_delete,
+        .d_iput         = sysfs_dentry_iput,
 };
 struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
@@ -344,12 +359,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
        return NULL;
 }
-static int sysfs_ilookup_test(struct inode *inode, void *arg)
-{
-        struct sysfs_dirent *sd = arg;
-        return inode->i_ino == sd->s_ino;
-}
 /**
 *      sysfs_addrm_start - prepare for sysfs_dirent add/remove
 *      @acxt: pointer to sysfs_addrm_cxt to be used
@@ -357,47 +366,20 @@ static int sysfs_ilookup_test(struct inode *inode, void *arg)
 *
 *      This function is called when the caller is about to add or
 *      remove sysfs_dirent under @parent_sd.  This function acquires
- *      sysfs_mutex, grabs inode for @parent_sd if available and lock
+ *      sysfs_mutex.  @acxt is used to keep and pass context to
- *      i_mutex of it.  @acxt is used to keep and pass context to
 *      other addrm functions.
 *
 *      LOCKING:
 *      Kernel thread context (may sleep).  sysfs_mutex is locked on
- *      return.  i_mutex of parent inode is locked on return if
+ *      return.
- *      available.
 */
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
                       struct sysfs_dirent *parent_sd)
 {
-        struct inode *inode;
        memset(acxt, 0, sizeof(*acxt));
        acxt->parent_sd = parent_sd;
-        /* Lookup parent inode.  inode initialization is protected by
-         * sysfs_mutex, so inode existence can be determined by
-         * looking up inode while holding sysfs_mutex.
-         */
        mutex_lock(&sysfs_mutex);
-        inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
-                         parent_sd);
-        if (inode) {
-                WARN_ON(inode->i_state & I_NEW);
-                /* parent inode available */
-                acxt->parent_inode = inode;
-                /* sysfs_mutex is below i_mutex in lock hierarchy.
-                 * First, trylock i_mutex.  If fails, unlock
-                 * sysfs_mutex and lock them in order.
-                 */
-                if (!mutex_trylock(&inode->i_mutex)) {
-                        mutex_unlock(&sysfs_mutex);
-                        mutex_lock(&inode->i_mutex);
-                        mutex_lock(&sysfs_mutex);
-                }
-        }
 }
 /**
@@ -422,18 +404,22 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
 */
 int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
+        struct sysfs_inode_attrs *ps_iattr;
        if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
                return -EEXIST;
        sd->s_parent = sysfs_get(acxt->parent_sd);
-        if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
-                inc_nlink(acxt->parent_inode);
-        acxt->cnt++;
        sysfs_link_sibling(sd);
+        /* Update timestamps on the parent */
+        ps_iattr = acxt->parent_sd->s_iattr;
+        if (ps_iattr) {
+                struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+                ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+        }
        return 0;
 }
@@ -512,70 +498,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 */
 void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
+        struct sysfs_inode_attrs *ps_iattr;
        BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
        sysfs_unlink_sibling(sd);
+        /* Update timestamps on the parent */
+        ps_iattr = acxt->parent_sd->s_iattr;
+        if (ps_iattr) {
+                struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+                ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+        }
        sd->s_flags |= SYSFS_FLAG_REMOVED;
        sd->s_sibling = acxt->removed;
        acxt->removed = sd;
-        if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
-                drop_nlink(acxt->parent_inode);
-        acxt->cnt++;
-}
-/**
- *      sysfs_drop_dentry - drop dentry for the specified sysfs_dirent
- *      @sd: target sysfs_dirent
- *
- *      Drop dentry for @sd.  @sd must have been unlinked from its
- *      parent on entry to this function such that it can't be looked
- *      up anymore.
- */
-static void sysfs_drop_dentry(struct sysfs_dirent *sd)
-{
-        struct inode *inode;
-        struct dentry *dentry;
-        inode = ilookup(sysfs_sb, sd->s_ino);
-        if (!inode)
-                return;
-        /* Drop any existing dentries associated with sd.
-         *
-         * For the dentry to be properly freed we need to grab a
-         * reference to the dentry under the dcache lock,  unhash it,
-         * and then put it.  The playing with the dentry count allows
-         * dput to immediately free the dentry  if it is not in use.
-         */
-repeat:
-        spin_lock(&dcache_lock);
-        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
-                if (d_unhashed(dentry))
-                        continue;
-                dget_locked(dentry);
-                spin_lock(&dentry->d_lock);
-                __d_drop(dentry);
-                spin_unlock(&dentry->d_lock);
-                spin_unlock(&dcache_lock);
-                dput(dentry);
-                goto repeat;
-        }
-        spin_unlock(&dcache_lock);
-        /* adjust nlink and update timestamp */
-        mutex_lock(&inode->i_mutex);
-        inode->i_ctime = CURRENT_TIME;
-        drop_nlink(inode);
-        if (sysfs_type(sd) == SYSFS_DIR)
-                drop_nlink(inode);
-        mutex_unlock(&inode->i_mutex);
-        iput(inode);
 }
 /**
@@ -584,25 +522,15 @@ repeat:
 *
 *      Finish up sysfs_dirent add/remove.  Resources acquired by
 *      sysfs_addrm_start() are released and removed sysfs_dirents are
- *      cleaned up.  Timestamps on the parent inode are updated.
+ *      cleaned up.
 *
 *      LOCKING:
- *      All mutexes acquired by sysfs_addrm_start() are released.
+ *      sysfs_mutex is released.
 */
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 {
        /* release resources acquired by sysfs_addrm_start() */
        mutex_unlock(&sysfs_mutex);
-        if (acxt->parent_inode) {
-                struct inode *inode = acxt->parent_inode;
-                /* if added/removed, update timestamps on the parent */
-                if (acxt->cnt)
-                        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-                mutex_unlock(&inode->i_mutex);
-                iput(inode);
-        }
        /* kill removed sysfs_dirents */
        while (acxt->removed) {
@@ -611,7 +539,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
                acxt->removed = sd->s_sibling;
                sd->s_sibling = NULL;
-                sysfs_drop_dentry(sd);
                sysfs_deactivate(sd);
                unmap_bin_file(sd);
                sysfs_put(sd);
@@ -751,10 +678,15 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
        }
        /* instantiate and hash dentry */
-        dentry->d_op = &sysfs_dentry_ops;
+        ret = d_find_alias(inode);
-        dentry->d_fsdata = sysfs_get(sd);
+        if (!ret) {
-        d_instantiate(dentry, inode);
+                dentry->d_op = &sysfs_dentry_ops;
-        d_rehash(dentry);
+                dentry->d_fsdata = sysfs_get(sd);
+                d_add(dentry, inode);
+        } else {
+                d_move(ret, dentry);
+                iput(inode);
+        }
 out_unlock:
        mutex_unlock(&sysfs_mutex);
@@ -763,7 +695,9 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 const struct inode_operations sysfs_dir_inode_operations = {
        .lookup         = sysfs_lookup,
+        .permission     = sysfs_permission,
        .setattr        = sysfs_setattr,
+        .getattr        = sysfs_getattr,
        .setxattr       = sysfs_setxattr,
 };
@@ -826,141 +760,65 @@ void sysfs_remove_dir(struct kobject * kobj)
        __sysfs_remove_dir(sd);
 }
-int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
+int sysfs_rename(struct sysfs_dirent *sd,
+        struct sysfs_dirent *new_parent_sd, const char *new_name)
 {
-        struct sysfs_dirent *sd = kobj->sd;
-        struct dentry *parent = NULL;
-        struct dentry *old_dentry = NULL, *new_dentry = NULL;
        const char *dup_name = NULL;
        int error;
-        mutex_lock(&sysfs_rename_mutex);
+        mutex_lock(&sysfs_mutex);
        error = 0;
-        if (strcmp(sd->s_name, new_name) == 0)
+        if ((sd->s_parent == new_parent_sd) &&
+            (strcmp(sd->s_name, new_name) == 0))
                goto out;       /* nothing to rename */
-        /* get the original dentry */
-        old_dentry = sysfs_get_dentry(sd);
-        if (IS_ERR(old_dentry)) {
-                error = PTR_ERR(old_dentry);
-                old_dentry = NULL;
-                goto out;
-        }
-        parent = old_dentry->d_parent;
-        /* lock parent and get dentry for new name */
-        mutex_lock(&parent->d_inode->i_mutex);
-        mutex_lock(&sysfs_mutex);
        error = -EEXIST;
-        if (sysfs_find_dirent(sd->s_parent, new_name))
+        if (sysfs_find_dirent(new_parent_sd, new_name))
-                goto out_unlock;
+                goto out;
-        error = -ENOMEM;
-        new_dentry = d_alloc_name(parent, new_name);
-        if (!new_dentry)
-                goto out_unlock;
        /* rename sysfs_dirent */
-        error = -ENOMEM;
+        if (strcmp(sd->s_name, new_name) != 0) {
-        new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
+                error = -ENOMEM;
-        if (!new_name)
+                new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
-                goto out_unlock;
+                if (!new_name)
+                        goto out;
-        dup_name = sd->s_name;
-        sd->s_name = new_name;
+                dup_name = sd->s_name;
+                sd->s_name = new_name;
+        }
-        /* rename */
+        /* Remove from old parent's list and insert into new parent's list. */
-        d_add(new_dentry, NULL);
+        if (sd->s_parent != new_parent_sd) {
-        d_move(old_dentry, new_dentry);
+                sysfs_unlink_sibling(sd);
+                sysfs_get(new_parent_sd);
+                sysfs_put(sd->s_parent);
+                sd->s_parent = new_parent_sd;
+                sysfs_link_sibling(sd);
+        }
        error = 0;
- out_unlock:
+ out:
        mutex_unlock(&sysfs_mutex);
-        mutex_unlock(&parent->d_inode->i_mutex);
        kfree(dup_name);
-        dput(old_dentry);
-        dput(new_dentry);
- out:
-        mutex_unlock(&sysfs_rename_mutex);
        return error;
 }
+int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
+{
+        return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+}
 int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
        struct sysfs_dirent *sd = kobj->sd;
        struct sysfs_dirent *new_parent_sd;
-        struct dentry *old_parent, *new_parent = NULL;
-        struct dentry *old_dentry = NULL, *new_dentry = NULL;
-        int error;
-        mutex_lock(&sysfs_rename_mutex);
        BUG_ON(!sd->s_parent);
-        new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
+        new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
                new_parent_kobj->sd : &sysfs_root;
-        error = 0;
+        return sysfs_rename(sd, new_parent_sd, sd->s_name);
-        if (sd->s_parent == new_parent_sd)
-                goto out;       /* nothing to move */
-        /* get dentries */
-        old_dentry = sysfs_get_dentry(sd);
-        if (IS_ERR(old_dentry)) {
-                error = PTR_ERR(old_dentry);
-                old_dentry = NULL;
-                goto out;
-        }
-        old_parent = old_dentry->d_parent;
-        new_parent = sysfs_get_dentry(new_parent_sd);
-        if (IS_ERR(new_parent)) {
-                error = PTR_ERR(new_parent);
-                new_parent = NULL;
-                goto out;
-        }
-again:
-        mutex_lock(&old_parent->d_inode->i_mutex);
-        if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
-                mutex_unlock(&old_parent->d_inode->i_mutex);
-                goto again;
-        }
-        mutex_lock(&sysfs_mutex);
-        error = -EEXIST;
-        if (sysfs_find_dirent(new_parent_sd, sd->s_name))
-                goto out_unlock;
-        error = -ENOMEM;
-        new_dentry = d_alloc_name(new_parent, sd->s_name);
-        if (!new_dentry)
-                goto out_unlock;
-        error = 0;
-        d_add(new_dentry, NULL);
-        d_move(old_dentry, new_dentry);
-        /* Remove from old parent's list and insert into new parent's list. */
-        sysfs_unlink_sibling(sd);
-        sysfs_get(new_parent_sd);
-        drop_nlink(old_parent->d_inode);
-        sysfs_put(sd->s_parent);
-        sd->s_parent = new_parent_sd;
-        inc_nlink(new_parent->d_inode);
-        sysfs_link_sibling(sd);
- out_unlock:
-        mutex_unlock(&sysfs_mutex);
-        mutex_unlock(&new_parent->d_inode->i_mutex);
-        mutex_unlock(&old_parent->d_inode->i_mutex);
- out:
-        dput(new_parent);
-        dput(old_dentry);
-        dput(new_dentry);
-        mutex_unlock(&sysfs_rename_mutex);
-        return error;
 }
 /* Relationship between s_mode and the DT_xxx types */
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f5ea4680f15f..dc30d9e31683 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -579,46 +579,23 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
 */
 int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 {
-        struct sysfs_dirent *victim_sd = NULL;
+        struct sysfs_dirent *sd;
-        struct dentry *victim = NULL;
-        struct inode * inode;
        struct iattr newattrs;
        int rc;
-        rc = -ENOENT;
+        mutex_lock(&sysfs_mutex);
-        victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
-        if (!victim_sd)
-                goto out;
-        mutex_lock(&sysfs_rename_mutex);
+        rc = -ENOENT;
-        victim = sysfs_get_dentry(victim_sd);
+        sd = sysfs_find_dirent(kobj->sd, attr->name);
-        mutex_unlock(&sysfs_rename_mutex);
+        if (!sd)
-        if (IS_ERR(victim)) {
-                rc = PTR_ERR(victim);
-                victim = NULL;
                goto out;
-        }
-        inode = victim->d_inode;
-        mutex_lock(&inode->i_mutex);
-        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+        newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
-        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+        newattrs.ia_valid = ATTR_MODE;
-        newattrs.ia_ctime = current_fs_time(inode->i_sb);
+        rc = sysfs_sd_setattr(sd, &newattrs);
-        rc = sysfs_setattr(victim, &newattrs);
-        if (rc == 0) {
-                fsnotify_change(victim, newattrs.ia_valid);
-                mutex_lock(&sysfs_mutex);
-                victim_sd->s_mode = newattrs.ia_mode;
-                mutex_unlock(&sysfs_mutex);
-        }
-        mutex_unlock(&inode->i_mutex);
 out:
-        dput(victim);
+        mutex_unlock(&sysfs_mutex);
-        sysfs_put(victim_sd);
        return rc;
 }
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e28cecf179f5..220b758523ae 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -37,7 +37,9 @@ static struct backing_dev_info sysfs_backing_dev_info = {
 };
 static const struct inode_operations sysfs_inode_operations ={
+        .permission     = sysfs_permission,
        .setattr        = sysfs_setattr,
+        .getattr        = sysfs_getattr,
        .setxattr       = sysfs_setxattr,
 };
@@ -46,7 +48,7 @@ int __init sysfs_inode_init(void)
        return bdi_init(&sysfs_backing_dev_info);
 }
-struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
+static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
 {
        struct sysfs_inode_attrs *attrs;
        struct iattr *iattrs;
@@ -64,30 +66,15 @@ struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
        return attrs;
 }
-int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
+int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
 {
-        struct inode * inode = dentry->d_inode;
-        struct sysfs_dirent * sd = dentry->d_fsdata;
        struct sysfs_inode_attrs *sd_attrs;
        struct iattr *iattrs;
        unsigned int ia_valid = iattr->ia_valid;
-        int error;
-        if (!sd)
-                return -EINVAL;
        sd_attrs = sd->s_iattr;
-        error = inode_change_ok(inode, iattr);
-        if (error)
-                return error;
-        iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
-        error = inode_setattr(inode, iattr);
-        if (error)
-                return error;
        if (!sd_attrs) {
                /* setting attributes for the first time, allocate now */
                sd_attrs = sysfs_init_inode_attrs(sd);
@@ -103,42 +90,78 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
                if (ia_valid & ATTR_GID)
                        iattrs->ia_gid = iattr->ia_gid;
                if (ia_valid & ATTR_ATIME)
-                        iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
+                        iattrs->ia_atime = iattr->ia_atime;
-                                        inode->i_sb->s_time_gran);
                if (ia_valid & ATTR_MTIME)
-                        iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
+                        iattrs->ia_mtime = iattr->ia_mtime;
-                                        inode->i_sb->s_time_gran);
                if (ia_valid & ATTR_CTIME)
-                        iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
+                        iattrs->ia_ctime = iattr->ia_ctime;
-                                        inode->i_sb->s_time_gran);
                if (ia_valid & ATTR_MODE) {
                        umode_t mode = iattr->ia_mode;
-                        if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-                                mode &= ~S_ISGID;
                        iattrs->ia_mode = sd->s_mode = mode;
                }
        }
+        return 0;
+}
+int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+        struct inode *inode = dentry->d_inode;
+        struct sysfs_dirent *sd = dentry->d_fsdata;
+        int error;
+        if (!sd)
+                return -EINVAL;
+        error = inode_change_ok(inode, iattr);
+        if (error)
+                return error;
+        iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
+        error = inode_setattr(inode, iattr);
+        if (error)
+                return error;
+        mutex_lock(&sysfs_mutex);
+        error = sysfs_sd_setattr(sd, iattr);
+        mutex_unlock(&sysfs_mutex);
        return error;
 }
+static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len)
+{
+        struct sysfs_inode_attrs *iattrs;
+        void *old_secdata;
+        size_t old_secdata_len;
+        iattrs = sd->s_iattr;
+        if (!iattrs)
+                iattrs = sysfs_init_inode_attrs(sd);
+        if (!iattrs)
+                return -ENOMEM;
+        old_secdata = iattrs->ia_secdata;
+        old_secdata_len = iattrs->ia_secdata_len;
+        iattrs->ia_secdata = *secdata;
+        iattrs->ia_secdata_len = *secdata_len;
+        *secdata = old_secdata;
+        *secdata_len = old_secdata_len;
+        return 0;
+}
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                size_t size, int flags)
 {
        struct sysfs_dirent *sd = dentry->d_fsdata;
-        struct sysfs_inode_attrs *iattrs;
        void *secdata;
        int error;
        u32 secdata_len = 0;
        if (!sd)
                return -EINVAL;
-        if (!sd->s_iattr)
-                sd->s_iattr = sysfs_init_inode_attrs(sd);
-        if (!sd->s_iattr)
-                return -ENOMEM;
-        iattrs = sd->s_iattr;
        if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
                const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
@@ -150,12 +173,13 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                                                &secdata, &secdata_len);
                if (error)
                        goto out;
-                if (iattrs->ia_secdata)
-                        security_release_secctx(iattrs->ia_secdata,
-                                                iattrs->ia_secdata_len);
-                iattrs->ia_secdata = secdata;
-                iattrs->ia_secdata_len = secdata_len;
+                mutex_lock(&sysfs_mutex);
+                error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
+                mutex_unlock(&sysfs_mutex);
+                if (secdata)
+                        security_release_secctx(secdata, secdata_len);
        } else
                return -EINVAL;
 out:
@@ -170,7 +194,6 @@ static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
 {
-        inode->i_mode = iattr->ia_mode;
        inode->i_uid = iattr->ia_uid;
        inode->i_gid = iattr->ia_gid;
        inode->i_atime = iattr->ia_atime;
@@ -178,17 +201,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
        inode->i_ctime = iattr->ia_ctime;
 }
-/*
- * sysfs has a different i_mutex lock order behavior for i_mutex than other
- * filesystems; sysfs i_mutex is called in many places with subsystem locks
- * held. At the same time, many of the VFS locking rules do not apply to
- * sysfs at all (cross directory rename for example). To untangle this mess
- * (which gives false positives in lockdep), we're giving sysfs inodes their
- * own class for i_mutex.
- */
-static struct lock_class_key sysfs_inode_imutex_key;
 static int sysfs_count_nlink(struct sysfs_dirent *sd)
 {
        struct sysfs_dirent *child;
@@ -201,38 +213,55 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
        return nr + 2;
 }
+static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
+{
+        struct sysfs_inode_attrs *iattrs = sd->s_iattr;
+        inode->i_mode = sd->s_mode;
+        if (iattrs) {
+                /* sysfs_dirent has non-default attributes
+                 * get them from persistent copy in sysfs_dirent
+                 */
+                set_inode_attr(inode, &iattrs->ia_iattr);
+                security_inode_notifysecctx(inode,
+                                            iattrs->ia_secdata,
+                                            iattrs->ia_secdata_len);
+        }
+        if (sysfs_type(sd) == SYSFS_DIR)
+                inode->i_nlink = sysfs_count_nlink(sd);
+}
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+        struct sysfs_dirent *sd = dentry->d_fsdata;
+        struct inode *inode = dentry->d_inode;
+        mutex_lock(&sysfs_mutex);
+        sysfs_refresh_inode(sd, inode);
+        mutex_unlock(&sysfs_mutex);
+        generic_fillattr(inode, stat);
+        return 0;
+}
 static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
        struct bin_attribute *bin_attr;
-        struct sysfs_inode_attrs *iattrs;
        inode->i_private = sysfs_get(sd);
        inode->i_mapping->a_ops = &sysfs_aops;
        inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
        inode->i_op = &sysfs_inode_operations;
-        inode->i_ino = sd->s_ino;
-        lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
-        iattrs = sd->s_iattr;
+        set_default_inode_attr(inode, sd->s_mode);
-        if (iattrs) {
+        sysfs_refresh_inode(sd, inode);
-                /* sysfs_dirent has non-default attributes
-                 * get them for the new inode from persistent copy
-                 * in sysfs_dirent
-                 */
-                set_inode_attr(inode, &iattrs->ia_iattr);
-                if (iattrs->ia_secdata)
-                        security_inode_notifysecctx(inode,
-                                                iattrs->ia_secdata,
-                                                iattrs->ia_secdata_len);
-        } else
-                set_default_inode_attr(inode, sd->s_mode);
        /* initialize inode according to type */
        switch (sysfs_type(sd)) {
        case SYSFS_DIR:
                inode->i_op = &sysfs_dir_inode_operations;
                inode->i_fop = &sysfs_dir_operations;
-                inode->i_nlink = sysfs_count_nlink(sd);
                break;
        case SYSFS_KOBJ_ATTR:
                inode->i_size = PAGE_SIZE;
@@ -315,3 +344,14 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
        else
                return -ENOENT;
 }
+int sysfs_permission(struct inode *inode, int mask)
+{
+        struct sysfs_dirent *sd = inode->i_private;
+        mutex_lock(&sysfs_mutex);
+        sysfs_refresh_inode(sd, inode);
+        mutex_unlock(&sysfs_mutex);
+        return generic_permission(inode, mask, NULL);
+}
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5081ad77026..c5eff49fa41b 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -210,10 +210,13 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
 }
 const struct inode_operations sysfs_symlink_inode_operations = {
-        .setxattr = sysfs_setxattr,
+        .setxattr       = sysfs_setxattr,
-        .readlink = generic_readlink,
+        .readlink       = generic_readlink,
-        .follow_link = sysfs_follow_link,
+        .follow_link    = sysfs_follow_link,
-        .put_link = sysfs_put_link,
+        .put_link       = sysfs_put_link,
+        .setattr        = sysfs_setattr,
+        .getattr        = sysfs_getattr,
+        .permission     = sysfs_permission,
 };
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7482ac..ca52e7b9d8f8 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -89,9 +89,7 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
 */
 struct sysfs_addrm_cxt {
        struct sysfs_dirent     *parent_sd;
-        struct inode            *parent_inode;
        struct sysfs_dirent     *removed;
-        int                     cnt;
 };
 /*
@@ -105,7 +103,6 @@ extern struct kmem_cache *sysfs_dir_cachep;
 * dir.c
 */
 extern struct mutex sysfs_mutex;
-extern struct mutex sysfs_rename_mutex;
 extern spinlock_t sysfs_assoc_lock;
 extern const struct file_operations sysfs_dir_operations;
@@ -133,6 +130,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
                        struct sysfs_dirent **p_sd);
 void sysfs_remove_subdir(struct sysfs_dirent *sd);
+int sysfs_rename(struct sysfs_dirent *sd,
+        struct sysfs_dirent *new_parent_sd, const char *new_name);
 static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
 {
        if (sd) {
@@ -155,7 +155,10 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 */
 struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
 void sysfs_delete_inode(struct inode *inode);
+int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
+int sysfs_permission(struct inode *inode, int mask);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                size_t size, int flags);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index dbc093afd946..8a771c59ac3e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2014,7 +2014,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
                inum = key_inum_flash(c, &dent->key);
                fscki1 = read_add_inode(c, priv, inum);
                if (IS_ERR(fscki1)) {
-                        err = PTR_ERR(fscki);
+                        err = PTR_ERR(fscki1);
                        ubifs_err("error %d while processing entry node and "
                                  "trying to find parent inode node %lu",
                                  err, (unsigned long)inum);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1009adc8d602..39849f887e72 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1389,7 +1389,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
                               unsigned long nr_segs, loff_t pos)
 {
        int err;
-        ssize_t ret;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1397,17 +1396,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
        if (err)
                return err;
-        ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+        return generic_file_aio_write(iocb, iov, nr_segs, pos);
-        if (ret < 0)
-                return ret;
-        if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
-                err = ubifs_sync_wbufs_by_inode(c, inode);
-                if (err)
-                        return err;
-        }
-        return ret;
 }
 static int ubifs_set_page_dirty(struct page *page)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index f94ddf7efba0..868a55ee080f 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -23,7 +23,7 @@
 /*
 * This file implements functions needed to recover from unclean un-mounts.
 * When UBIFS is mounted, it checks a flag on the master node to determine if
- * an un-mount was completed sucessfully. If not, the process of mounting
+ * an un-mount was completed successfully. If not, the process of mounting
 * incorparates additional checking and fixing of on-flash data structures.
 * UBIFS always cleans away all remnants of an unclean un-mount, so that
 * errors do not accumulate. However UBIFS defers recovery if it is mounted
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 333e181ee987..943ad5624530 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1842,22 +1842,32 @@ const struct super_operations ubifs_super_operations = {
 * @name: UBI volume name
 * @mode: UBI volume open mode
 *
- * There are several ways to specify UBI volumes when mounting UBIFS:
+ * The primary method of mounting UBIFS is by specifying the UBI volume
- * o ubiX_Y    - UBI device number X, volume Y;
+ * character device node path. However, UBIFS may also be mounted without any
- * o ubiY      - UBI device number 0, volume Y;
+ * character device node using one of the following methods:
+ *
+ * o ubiX_Y    - mount UBI device number X, volume Y;
+ * o ubiY      - mount UBI device number 0, volume Y;
 * o ubiX:NAME - mount UBI device X, volume with name NAME;
 * o ubi:NAME  - mount UBI device 0, volume with name NAME.
 *
 * Alternative '!' separator may be used instead of ':' (because some shells
 * like busybox may interpret ':' as an NFS host name separator). This function
- * returns ubi volume object in case of success and a negative error code in
+ * returns UBI volume description object in case of success and a negative
- * case of failure.
+ * error code in case of failure.
 */
 static struct ubi_volume_desc *open_ubi(const char *name, int mode)
 {
+        struct ubi_volume_desc *ubi;
        int dev, vol;
        char *endptr;
+        /* First, try to open using the device node path method */
+        ubi = ubi_open_volume_path(name, mode);
+        if (!IS_ERR(ubi))
+                return ubi;
+        /* Try the "nodev" method */
        if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
                return ERR_PTR(-EINVAL);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index c6ad7c7e3ee9..05ac0fe9c4d3 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size)
        if (count == 0)
                return NULL;
        
-        acl = posix_acl_alloc(count, GFP_KERNEL);
+        acl = posix_acl_alloc(count, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);
        acl_e = acl->a_entries;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d6fc4ef727bb..87813e405cef 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -864,16 +864,9 @@ xfs_convert_page(
        if (startio) {
                if (count) {
-                        struct backing_dev_info *bdi;
-                        bdi = inode->i_mapping->backing_dev_info;
                        wbc->nr_to_write--;
-                        if (bdi_write_congested(bdi)) {
+                        if (wbc->nr_to_write <= 0)
-                                wbc->encountered_congestion = 1;
-                                done = 1;
-                        } else if (wbc->nr_to_write <= 0) {
                                done = 1;
-                        }
                }
                xfs_start_page_writeback(page, !page_dirty, count);
        }
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index c5bc67c4e3bb..7bb5092d6ae4 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -55,170 +55,140 @@ xfs_stats_clear_proc_handler(
 static ctl_table xfs_table[] = {
        {
-                .ctl_name       = XFS_SGID_INHERIT,
                .procname       = "irix_sgid_inherit",
                .data           = &xfs_params.sgid_inherit.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.sgid_inherit.min,
                .extra2         = &xfs_params.sgid_inherit.max
        },
        {
-                .ctl_name       = XFS_SYMLINK_MODE,
                .procname       = "irix_symlink_mode",
                .data           = &xfs_params.symlink_mode.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.symlink_mode.min,
                .extra2         = &xfs_params.symlink_mode.max
        },
        {
-                .ctl_name       = XFS_PANIC_MASK,
                .procname       = "panic_mask",
                .data           = &xfs_params.panic_mask.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.panic_mask.min,
                .extra2         = &xfs_params.panic_mask.max
        },
        {
-                .ctl_name       = XFS_ERRLEVEL,
                .procname       = "error_level",
                .data           = &xfs_params.error_level.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.error_level.min,
                .extra2         = &xfs_params.error_level.max
        },
        {
-                .ctl_name       = XFS_SYNCD_TIMER,
                .procname       = "xfssyncd_centisecs",
                .data           = &xfs_params.syncd_timer.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.syncd_timer.min,
                .extra2         = &xfs_params.syncd_timer.max
        },
        {
-                .ctl_name       = XFS_INHERIT_SYNC,
                .procname       = "inherit_sync",
                .data           = &xfs_params.inherit_sync.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_sync.min,
                .extra2         = &xfs_params.inherit_sync.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NODUMP,
                .procname       = "inherit_nodump",
                .data           = &xfs_params.inherit_nodump.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_nodump.min,
                .extra2         = &xfs_params.inherit_nodump.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NOATIME,
                .procname       = "inherit_noatime",
                .data           = &xfs_params.inherit_noatim.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_noatim.min,
                .extra2         = &xfs_params.inherit_noatim.max
        },
        {
-                .ctl_name       = XFS_BUF_TIMER,
                .procname       = "xfsbufd_centisecs",
                .data           = &xfs_params.xfs_buf_timer.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.xfs_buf_timer.min,
                .extra2         = &xfs_params.xfs_buf_timer.max
        },
        {
-                .ctl_name       = XFS_BUF_AGE,
                .procname       = "age_buffer_centisecs",
                .data           = &xfs_params.xfs_buf_age.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.xfs_buf_age.min,
                .extra2         = &xfs_params.xfs_buf_age.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NOSYM,
                .procname       = "inherit_nosymlinks",
                .data           = &xfs_params.inherit_nosym.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_nosym.min,
                .extra2         = &xfs_params.inherit_nosym.max
        },
        {
-                .ctl_name       = XFS_ROTORSTEP,
                .procname       = "rotorstep",
                .data           = &xfs_params.rotorstep.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.rotorstep.min,
                .extra2         = &xfs_params.rotorstep.max
        },
        {
-                .ctl_name       = XFS_INHERIT_NODFRG,
                .procname       = "inherit_nodefrag",
                .data           = &xfs_params.inherit_nodfrg.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.inherit_nodfrg.min,
                .extra2         = &xfs_params.inherit_nodfrg.max
        },
        {
-                .ctl_name       = XFS_FILESTREAM_TIMER,
                .procname       = "filestream_centisecs",
                .data           = &xfs_params.fstrm_timer.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.fstrm_timer.min,
                .extra2         = &xfs_params.fstrm_timer.max,
        },
        /* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
        {
-                .ctl_name       = XFS_STATS_CLEAR,
                .procname       = "stats_clear",
                .data           = &xfs_params.stats_clear.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &xfs_stats_clear_proc_handler,
+                .proc_handler   = xfs_stats_clear_proc_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &xfs_params.stats_clear.min,
                .extra2         = &xfs_params.stats_clear.max
        },
@@ -229,7 +199,6 @@ static ctl_table xfs_table[] = {
 static ctl_table xfs_dir_table[] = {
        {
-                .ctl_name       = FS_XFS,
                .procname       = "xfs",
                .mode           = 0555,
                .child          = xfs_table
@@ -239,7 +208,6 @@ static ctl_table xfs_dir_table[] = {
 static ctl_table xfs_root_table[] = {
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = xfs_dir_table
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 6533ead9b889..a2c16bcee90b 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -98,7 +98,7 @@ typedef struct xfs_dquot {
 #define dq_flags        q_lists.dqm_flags
 /*
- * Lock hierachy for q_qlock:
+ * Lock hierarchy for q_qlock:
 *      XFS_QLOCK_NORMAL is the implicit default,
 *      XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
 */